From 408ffae03ed743a0506fdf9e8afd2931672a012d Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 3 Feb 2025 09:39:16 +0100 Subject: [PATCH] revert all new sampling functionality --- .../aggregate/holistic/reservoir_quantile.cpp | 2 +- .../catalog_entry/duck_table_entry.cpp | 4 - .../catalog_entry/table_catalog_entry.cpp | 4 - src/common/enum_util.cpp | 18 - src/execution/CMakeLists.txt | 4 +- src/execution/reservoir_sample.cpp | 324 ++++++ src/execution/sample/CMakeLists.txt | 5 - .../sample/base_reservoir_sample.cpp | 136 --- src/execution/sample/reservoir_sample.cpp | 931 ------------------ src/function/table/system/CMakeLists.txt | 1 - .../table/system/pragma_table_sample.cpp | 95 -- src/function/table/system_functions.cpp | 1 - .../catalog_entry/duck_table_entry.hpp | 2 - .../catalog_entry/table_catalog_entry.hpp | 3 - src/include/duckdb/common/enum_util.hpp | 8 - src/include/duckdb/common/random_engine.hpp | 6 +- .../duckdb/common/serializer/serializer.hpp | 1 - src/include/duckdb/common/types/uuid.hpp | 2 +- .../duckdb/execution/physical_operator.hpp | 2 +- .../duckdb/execution/reservoir_sample.hpp | 197 +--- .../function/table/system_functions.hpp | 4 - src/include/duckdb/main/client_data.hpp | 2 +- src/include/duckdb/storage/data_table.hpp | 3 - .../duckdb/storage/serialization/nodes.json | 3 +- .../storage/table/row_group_collection.hpp | 1 - .../duckdb/storage/table/table_statistics.hpp | 9 +- src/storage/data_table.cpp | 6 +- src/storage/serialization/serialize_nodes.cpp | 4 +- src/storage/table/row_group_collection.cpp | 18 +- src/storage/table/table_statistics.cpp | 96 +- .../can_sample_from_ingested_files.test | 44 - .../sample/reservoir_testing_percentage.test | 85 -- .../reservoir_testing_rows_value.test_slow | 8 +- .../sample/same_seed_same_sample.test_slow | 10 +- .../table_samples/basic_sample_tests.test | 88 -- ...sample_stores_rows_from_later_on.test_slow | 42 - ...table_sample_converts_to_block_sample.test | 59 -- .../table_sample_is_stored.test_slow | 150 --- .../test_sample_is_destroyed_on_updates.test | 125 --- .../table_samples/test_sample_types.test | 80 -- .../test_table_sample_errors.test | 21 - test/sql/sample/test_sample.test_slow | 6 + .../sql/storage/checkpointed_self_append.test | 1 + ...ace_drop_column_overflow_strings.test_slow | 2 +- test/sql/upsert/test_big_insert.test | 2 + 45 files changed, 399 insertions(+), 2216 deletions(-) create mode 100644 src/execution/reservoir_sample.cpp delete mode 100644 src/execution/sample/CMakeLists.txt delete mode 100644 src/execution/sample/base_reservoir_sample.cpp delete mode 100644 src/execution/sample/reservoir_sample.cpp delete mode 100644 src/function/table/system/pragma_table_sample.cpp delete mode 100644 test/sql/sample/can_sample_from_ingested_files.test delete mode 100644 test/sql/sample/reservoir_testing_percentage.test delete mode 100644 test/sql/sample/table_samples/basic_sample_tests.test delete mode 100644 test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow delete mode 100644 test/sql/sample/table_samples/table_sample_converts_to_block_sample.test delete mode 100644 test/sql/sample/table_samples/table_sample_is_stored.test_slow delete mode 100644 test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test delete mode 100644 test/sql/sample/table_samples/test_sample_types.test delete mode 100644 test/sql/sample/table_samples/test_table_sample_errors.test diff --git a/extension/core_functions/aggregate/holistic/reservoir_quantile.cpp b/extension/core_functions/aggregate/holistic/reservoir_quantile.cpp index 8c332500d6e5..e99912762d43 100644 --- a/extension/core_functions/aggregate/holistic/reservoir_quantile.cpp +++ b/extension/core_functions/aggregate/holistic/reservoir_quantile.cpp @@ -39,7 +39,7 @@ struct ReservoirQuantileState { void FillReservoir(idx_t sample_size, T element) { if (pos < sample_size) { v[pos++] = element; - r_samp->InitializeReservoirWeights(pos, len); + r_samp->InitializeReservoir(pos, len); } else { D_ASSERT(r_samp->next_index_to_sample >= r_samp->num_entries_to_skip_b4_next_sample); if (r_samp->next_index_to_sample == r_samp->num_entries_to_skip_b4_next_sample) { diff --git a/src/catalog/catalog_entry/duck_table_entry.cpp b/src/catalog/catalog_entry/duck_table_entry.cpp index 4983710d9a99..7473417db132 100644 --- a/src/catalog/catalog_entry/duck_table_entry.cpp +++ b/src/catalog/catalog_entry/duck_table_entry.cpp @@ -136,10 +136,6 @@ unique_ptr DuckTableEntry::GetStatistics(ClientContext &context, return storage->GetStatistics(context, column.StorageOid()); } -unique_ptr DuckTableEntry::GetSample() { - return storage->GetSample(); -} - unique_ptr DuckTableEntry::AlterEntry(CatalogTransaction transaction, AlterInfo &info) { if (transaction.HasContext()) { return AlterEntry(transaction.GetContext(), info); diff --git a/src/catalog/catalog_entry/table_catalog_entry.cpp b/src/catalog/catalog_entry/table_catalog_entry.cpp index 3070b2e30d48..5a1e0f019c2d 100644 --- a/src/catalog/catalog_entry/table_catalog_entry.cpp +++ b/src/catalog/catalog_entry/table_catalog_entry.cpp @@ -43,10 +43,6 @@ LogicalIndex TableCatalogEntry::GetColumnIndex(string &column_name, bool if_exis return entry; } -unique_ptr TableCatalogEntry::GetSample() { - return nullptr; -} - bool TableCatalogEntry::ColumnExists(const string &name) const { return columns.ColumnExists(name); } diff --git a/src/common/enum_util.cpp b/src/common/enum_util.cpp index 49e7bb2b9b77..15db5f89adf4 100644 --- a/src/common/enum_util.cpp +++ b/src/common/enum_util.cpp @@ -3249,24 +3249,6 @@ SampleType EnumUtil::FromString(const char *value) { return static_cast(StringUtil::StringToEnum(GetSampleTypeValues(), 3, "SampleType", value)); } -const StringUtil::EnumStringLiteral *GetSamplingStateValues() { - static constexpr StringUtil::EnumStringLiteral values[] { - { static_cast(SamplingState::RANDOM), "RANDOM" }, - { static_cast(SamplingState::RESERVOIR), "RESERVOIR" } - }; - return values; -} - -template<> -const char* EnumUtil::ToChars(SamplingState value) { - return StringUtil::EnumToString(GetSamplingStateValues(), 2, "SamplingState", static_cast(value)); -} - -template<> -SamplingState EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetSamplingStateValues(), 2, "SamplingState", value)); -} - const StringUtil::EnumStringLiteral *GetScanTypeValues() { static constexpr StringUtil::EnumStringLiteral values[] { { static_cast(ScanType::TABLE), "TABLE" }, diff --git a/src/execution/CMakeLists.txt b/src/execution/CMakeLists.txt index 87c2a3bcc16e..ef0d0a3eb50e 100644 --- a/src/execution/CMakeLists.txt +++ b/src/execution/CMakeLists.txt @@ -3,7 +3,6 @@ add_subdirectory(nested_loop_join) add_subdirectory(operator) add_subdirectory(physical_plan) add_subdirectory(index) -add_subdirectory(sample) add_library_unity( duckdb_execution @@ -18,7 +17,8 @@ add_library_unity( perfect_aggregate_hashtable.cpp physical_operator.cpp physical_plan_generator.cpp - radix_partitioned_hashtable.cpp) + radix_partitioned_hashtable.cpp + reservoir_sample.cpp) set(ALL_OBJECT_FILES ${ALL_OBJECT_FILES} $ PARENT_SCOPE) diff --git a/src/execution/reservoir_sample.cpp b/src/execution/reservoir_sample.cpp new file mode 100644 index 000000000000..284e03faef09 --- /dev/null +++ b/src/execution/reservoir_sample.cpp @@ -0,0 +1,324 @@ +#include "duckdb/execution/reservoir_sample.hpp" +#include "duckdb/common/types/data_chunk.hpp" +#include "duckdb/common/pair.hpp" + +namespace duckdb { + +void ReservoirChunk::Serialize(Serializer &serializer) const { + chunk.Serialize(serializer); +} + +unique_ptr ReservoirChunk::Deserialize(Deserializer &deserializer) { + auto result = make_uniq(); + result->chunk.Deserialize(deserializer); + return result; +} + +ReservoirSample::ReservoirSample(Allocator &allocator, idx_t sample_count, int64_t seed) + : BlockingSample(seed), allocator(allocator), sample_count(sample_count), reservoir_initialized(false) { +} + +ReservoirSample::ReservoirSample(idx_t sample_count, int64_t seed) + : ReservoirSample(Allocator::DefaultAllocator(), sample_count, seed) { +} + +void ReservoirSample::AddToReservoir(DataChunk &input) { + if (sample_count == 0) { + // sample count is 0, means no samples were requested + return; + } + old_base_reservoir_sample.num_entries_seen_total += input.size(); + // Input: A population V of n weighted items + // Output: A reservoir R with a size m + // 1: The first m items of V are inserted into R + // first we need to check if the reservoir already has "m" elements + if (!reservoir_data_chunk || reservoir_data_chunk->size() < sample_count) { + if (FillReservoir(input) == 0) { + // entire chunk was consumed by reservoir + return; + } + } + D_ASSERT(reservoir_data_chunk); + D_ASSERT(reservoir_data_chunk->size() == sample_count); + // Initialize the weights if they have not been already + if (old_base_reservoir_sample.reservoir_weights.empty()) { + old_base_reservoir_sample.InitializeReservoir(reservoir_data_chunk->size(), sample_count); + } + // find the position of next_index_to_sample relative to number of seen entries (num_entries_to_skip_b4_next_sample) + idx_t remaining = input.size(); + idx_t base_offset = 0; + while (true) { + idx_t offset = old_base_reservoir_sample.next_index_to_sample - + old_base_reservoir_sample.num_entries_to_skip_b4_next_sample; + if (offset >= remaining) { + // not in this chunk! increment current count and go to the next chunk + old_base_reservoir_sample.num_entries_to_skip_b4_next_sample += remaining; + return; + } + // in this chunk! replace the element + ReplaceElement(input, base_offset + offset); + // shift the chunk forward + remaining -= offset; + base_offset += offset; + } +} + +unique_ptr ReservoirSample::GetChunk() { + if (!reservoir_data_chunk || reservoir_data_chunk->size() == 0) { + return nullptr; + } + auto collected_sample_count = reservoir_data_chunk->size(); + if (collected_sample_count > STANDARD_VECTOR_SIZE) { + // get from the back to avoid creating two selection vectors + // one to return the first STANDARD_VECTOR_SIZE + // another to replace the reservoir_data_chunk with the first STANDARD VECTOR SIZE missing + auto ret = make_uniq(); + auto samples_remaining = collected_sample_count - STANDARD_VECTOR_SIZE; + auto reservoir_types = reservoir_data_chunk->GetTypes(); + SelectionVector sel(STANDARD_VECTOR_SIZE); + for (idx_t i = samples_remaining; i < collected_sample_count; i++) { + sel.set_index(i - samples_remaining, i); + } + ret->Initialize(allocator, reservoir_types); + ret->Slice(*reservoir_data_chunk, sel, STANDARD_VECTOR_SIZE); + ret->SetCardinality(STANDARD_VECTOR_SIZE); + // reduce capacity and cardinality of the sample data chunk + reservoir_data_chunk->SetCardinality(samples_remaining); + return ret; + } + return std::move(reservoir_data_chunk); +} + +void ReservoirSample::ReplaceElement(DataChunk &input, idx_t index_in_chunk, double with_weight) { + // replace the entry in the reservoir + // 8. The item in R with the minimum key is replaced by item vi + D_ASSERT(input.ColumnCount() == reservoir_data_chunk->ColumnCount()); + for (idx_t col_idx = 0; col_idx < input.ColumnCount(); col_idx++) { + reservoir_data_chunk->SetValue(col_idx, old_base_reservoir_sample.min_weighted_entry_index, + input.GetValue(col_idx, index_in_chunk)); + } + old_base_reservoir_sample.ReplaceElement(with_weight); +} + +void ReservoirSample::InitializeReservoir(DataChunk &input) { + reservoir_data_chunk = make_uniq(); + reservoir_data_chunk->Initialize(allocator, input.GetTypes(), sample_count); + for (idx_t col_idx = 0; col_idx < reservoir_data_chunk->ColumnCount(); col_idx++) { + FlatVector::Validity(reservoir_data_chunk->data[col_idx]).Initialize(sample_count); + } + reservoir_initialized = true; +} + +idx_t ReservoirSample::FillReservoir(DataChunk &input) { + idx_t chunk_count = input.size(); + input.Flatten(); + auto num_added_samples = reservoir_data_chunk ? reservoir_data_chunk->size() : 0; + D_ASSERT(num_added_samples <= sample_count); + + // required count is what we still need to add to the reservoir + idx_t required_count; + if (num_added_samples + chunk_count >= sample_count) { + // have to limit the count of the chunk + required_count = sample_count - num_added_samples; + } else { + // we copy the entire chunk + required_count = chunk_count; + } + input.SetCardinality(required_count); + + // initialize the reservoir + if (!reservoir_initialized) { + InitializeReservoir(input); + } + reservoir_data_chunk->Append(input, false, nullptr, required_count); + old_base_reservoir_sample.InitializeReservoir(required_count, sample_count); + + // check if there are still elements remaining in the Input data chunk that should be + // randomly sampled and potentially added. This happens if we are on a boundary + // for example, input.size() is 1024, but our sample size is 10 + if (required_count == chunk_count) { + // we are done here + return 0; + } + // we still need to process a part of the chunk + // create a selection vector of the remaining elements + SelectionVector sel(STANDARD_VECTOR_SIZE); + for (idx_t i = required_count; i < chunk_count; i++) { + sel.set_index(i - required_count, i); + } + // slice the input vector and continue + input.Slice(sel, chunk_count - required_count); + return input.size(); +} + +void ReservoirSample::Finalize() { + return; +} + +ReservoirSamplePercentage::ReservoirSamplePercentage(Allocator &allocator, double percentage, int64_t seed) + : BlockingSample(seed), allocator(allocator), sample_percentage(percentage / 100.0), current_count(0), + is_finalized(false) { + reservoir_sample_size = idx_t(sample_percentage * RESERVOIR_THRESHOLD); + current_sample = make_uniq(allocator, reservoir_sample_size, random.NextRandomInteger()); +} + +ReservoirSamplePercentage::ReservoirSamplePercentage(double percentage, int64_t seed) + : ReservoirSamplePercentage(Allocator::DefaultAllocator(), percentage, seed) { +} + +void ReservoirSamplePercentage::AddToReservoir(DataChunk &input) { + old_base_reservoir_sample.num_entries_seen_total += input.size(); + if (current_count + input.size() > RESERVOIR_THRESHOLD) { + // we don't have enough space in our current reservoir + // first check what we still need to append to the current sample + idx_t append_to_current_sample_count = RESERVOIR_THRESHOLD - current_count; + idx_t append_to_next_sample = input.size() - append_to_current_sample_count; + if (append_to_current_sample_count > 0) { + // we have elements remaining, first add them to the current sample + if (append_to_next_sample > 0) { + // we need to also add to the next sample + DataChunk new_chunk; + new_chunk.InitializeEmpty(input.GetTypes()); + new_chunk.Slice(input, *FlatVector::IncrementalSelectionVector(), append_to_current_sample_count); + new_chunk.Flatten(); + current_sample->AddToReservoir(new_chunk); + } else { + input.Flatten(); + input.SetCardinality(append_to_current_sample_count); + current_sample->AddToReservoir(input); + } + } + if (append_to_next_sample > 0) { + // slice the input for the remainder + SelectionVector sel(append_to_next_sample); + for (idx_t i = append_to_current_sample_count; i < append_to_next_sample + append_to_current_sample_count; + i++) { + sel.set_index(i - append_to_current_sample_count, i); + } + input.Slice(sel, append_to_next_sample); + } + // now our first sample is filled: append it to the set of finished samples + finished_samples.push_back(std::move(current_sample)); + + // allocate a new sample, and potentially add the remainder of the current input to that sample + current_sample = make_uniq(allocator, reservoir_sample_size, random.NextRandomInteger()); + if (append_to_next_sample > 0) { + current_sample->AddToReservoir(input); + } + current_count = append_to_next_sample; + } else { + // we can just append to the current sample + current_count += input.size(); + current_sample->AddToReservoir(input); + } +} + +unique_ptr ReservoirSamplePercentage::GetChunk() { + if (!is_finalized) { + Finalize(); + } + while (!finished_samples.empty()) { + auto &front = finished_samples.front(); + auto chunk = front->GetChunk(); + if (chunk && chunk->size() > 0) { + return chunk; + } + // move to the next sample + finished_samples.erase(finished_samples.begin()); + } + return nullptr; +} + +void ReservoirSamplePercentage::Finalize() { + // need to finalize the current sample, if any + // we are finializing, so we are starting to return chunks. Our last chunk has + // sample_percentage * RESERVOIR_THRESHOLD entries that hold samples. + // if our current count is less than the sample_percentage * RESERVOIR_THRESHOLD + // then we have sampled too much for the current_sample and we need to redo the sample + // otherwise we can just push the current sample back + // Imagine sampling 70% of 100 rows (so 70 rows). We allocate sample_percentage * RESERVOIR_THRESHOLD + // ----------------------------------------- + auto sampled_more_than_required = + static_cast(current_count) > sample_percentage * RESERVOIR_THRESHOLD || finished_samples.empty(); + if (current_count > 0 && sampled_more_than_required) { + // create a new sample + auto new_sample_size = idx_t(round(sample_percentage * static_cast(current_count))); + auto new_sample = make_uniq(allocator, new_sample_size, random.NextRandomInteger()); + while (true) { + auto chunk = current_sample->GetChunk(); + if (!chunk || chunk->size() == 0) { + break; + } + new_sample->AddToReservoir(*chunk); + } + finished_samples.push_back(std::move(new_sample)); + } else { + finished_samples.push_back(std::move(current_sample)); + } + // when finalizing, current_sample is null. All samples are now in finished samples. + current_sample = nullptr; + is_finalized = true; +} + +BaseReservoirSampling::BaseReservoirSampling(int64_t seed) : random(seed) { + next_index_to_sample = 0; + min_weight_threshold = 0; + min_weighted_entry_index = 0; + num_entries_to_skip_b4_next_sample = 0; + num_entries_seen_total = 0; +} + +BaseReservoirSampling::BaseReservoirSampling() : BaseReservoirSampling(-1) { +} + +void BaseReservoirSampling::InitializeReservoir(idx_t cur_size, idx_t sample_size) { + //! 1: The first m items of V are inserted into R + //! first we need to check if the reservoir already has "m" elements + if (cur_size == sample_size) { + //! 2. For each item vi ∈ R: Calculate a key ki = random(0, 1) + //! we then define the threshold to enter the reservoir T_w as the minimum key of R + //! we use a priority queue to extract the minimum key in O(1) time + for (idx_t i = 0; i < sample_size; i++) { + double k_i = random.NextRandom(); + reservoir_weights.emplace(-k_i, i); + } + SetNextEntry(); + } +} + +void BaseReservoirSampling::SetNextEntry() { + //! 4. Let r = random(0, 1) and Xw = log(r) / log(T_w) + auto &min_key = reservoir_weights.top(); + double t_w = -min_key.first; + double r = random.NextRandom(); + double x_w = log(r) / log(t_w); + //! 5. From the current item vc skip items until item vi , such that: + //! 6. wc +wc+1 +···+wi−1 < Xw <= wc +wc+1 +···+wi−1 +wi + //! since all our weights are 1 (uniform sampling), we can just determine the amount of elements to skip + min_weight_threshold = t_w; + min_weighted_entry_index = min_key.second; + next_index_to_sample = MaxValue(1, idx_t(round(x_w))); + num_entries_to_skip_b4_next_sample = 0; +} + +void BaseReservoirSampling::ReplaceElement(double with_weight) { + //! replace the entry in the reservoir + //! pop the minimum entry + reservoir_weights.pop(); + //! now update the reservoir + //! 8. Let tw = Tw i , r2 = random(tw,1) and vi’s key: ki = (r2)1/wi + //! 9. The new threshold Tw is the new minimum key of R + //! we generate a random number between (min_weight_threshold, 1) + double r2 = random.NextRandom(min_weight_threshold, 1); + + //! if we are merging two reservoir samples use the weight passed + if (with_weight >= 0) { + r2 = with_weight; + } + //! now we insert the new weight into the reservoir + reservoir_weights.emplace(-r2, min_weighted_entry_index); + //! we update the min entry with the new min entry in the reservoir + SetNextEntry(); +} + +} // namespace duckdb diff --git a/src/execution/sample/CMakeLists.txt b/src/execution/sample/CMakeLists.txt deleted file mode 100644 index 6f69a205a0a9..000000000000 --- a/src/execution/sample/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library_unity(duckdb_sample OBJECT base_reservoir_sample.cpp - reservoir_sample.cpp) -set(ALL_OBJECT_FILES - ${ALL_OBJECT_FILES} $ - PARENT_SCOPE) diff --git a/src/execution/sample/base_reservoir_sample.cpp b/src/execution/sample/base_reservoir_sample.cpp deleted file mode 100644 index 0f0fcdf7a387..000000000000 --- a/src/execution/sample/base_reservoir_sample.cpp +++ /dev/null @@ -1,136 +0,0 @@ -#include "duckdb/execution/reservoir_sample.hpp" -#include - -namespace duckdb { - -double BaseReservoirSampling::GetMinWeightFromTuplesSeen(idx_t rows_seen_total) { - // this function was obtained using https://mycurvefit.com. Inputting multiple x, y values into - // The - switch (rows_seen_total) { - case 0: - return 0; - case 1: - return 0.000161; - case 2: - return 0.530136; - case 3: - return 0.693454; - default: { - return (0.99 - 0.355 * std::exp(-0.07 * static_cast(rows_seen_total))); - } - } -} - -BaseReservoirSampling::BaseReservoirSampling(int64_t seed) : random(seed) { - next_index_to_sample = 0; - min_weight_threshold = 0; - min_weighted_entry_index = 0; - num_entries_to_skip_b4_next_sample = 0; - num_entries_seen_total = 0; -} - -BaseReservoirSampling::BaseReservoirSampling() : BaseReservoirSampling(1) { -} - -unique_ptr BaseReservoirSampling::Copy() { - auto ret = make_uniq(1); - ret->reservoir_weights = reservoir_weights; - ret->next_index_to_sample = next_index_to_sample; - ret->min_weight_threshold = min_weight_threshold; - ret->min_weighted_entry_index = min_weighted_entry_index; - ret->num_entries_to_skip_b4_next_sample = num_entries_to_skip_b4_next_sample; - ret->num_entries_seen_total = num_entries_seen_total; - return ret; -} - -void BaseReservoirSampling::InitializeReservoirWeights(idx_t cur_size, idx_t sample_size) { - //! 1: The first m items of V are inserted into R - //! first we need to check if the reservoir already has "m" elements - //! 2. For each item vi ∈ R: Calculate a key ki = random(0, 1) - //! we then define the threshold to enter the reservoir T_w as the minimum key of R - //! we use a priority queue to extract the minimum key in O(1) time - if (cur_size == sample_size) { - //! 2. For each item vi ∈ R: Calculate a key ki = random(0, 1) - //! we then define the threshold to enter the reservoir T_w as the minimum key of R - //! we use a priority queue to extract the minimum key in O(1) time - for (idx_t i = 0; i < sample_size; i++) { - double k_i = random.NextRandom(); - reservoir_weights.emplace(-k_i, i); - } - SetNextEntry(); - } -} - -void BaseReservoirSampling::SetNextEntry() { - D_ASSERT(!reservoir_weights.empty()); - //! 4. Let r = random(0, 1) and Xw = log(r) / log(T_w) - auto &min_key = reservoir_weights.top(); - double t_w = -min_key.first; - double r = random.NextRandom32(); - double x_w = log(r) / log(t_w); - //! 5. From the current item vc skip items until item vi , such that: - //! 6. wc +wc+1 +···+wi−1 < Xw <= wc +wc+1 +···+wi−1 +wi - //! since all our weights are 1 (uniform sampling), we can just determine the amount of elements to skip - min_weight_threshold = t_w; - min_weighted_entry_index = min_key.second; - next_index_to_sample = MaxValue(1, idx_t(round(x_w))); - num_entries_to_skip_b4_next_sample = 0; -} - -void BaseReservoirSampling::ReplaceElementWithIndex(idx_t entry_index, double with_weight, bool pop) { - - if (pop) { - reservoir_weights.pop(); - } - double r2 = with_weight; - //! now we insert the new weight into the reservoir - reservoir_weights.emplace(-r2, entry_index); - //! we update the min entry with the new min entry in the reservoir - SetNextEntry(); -} - -void BaseReservoirSampling::ReplaceElement(double with_weight) { - //! replace the entry in the reservoir - //! pop the minimum entry - reservoir_weights.pop(); - //! now update the reservoir - //! 8. Let tw = Tw i , r2 = random(tw,1) and vi’s key: ki = (r2)1/wi - //! 9. The new threshold Tw is the new minimum key of R - //! we generate a random number between (min_weight_threshold, 1) - double r2 = random.NextRandom(min_weight_threshold, 1); - - //! if we are merging two reservoir samples use the weight passed - if (with_weight >= 0) { - r2 = with_weight; - } - //! now we insert the new weight into the reservoir - reservoir_weights.emplace(-r2, min_weighted_entry_index); - //! we update the min entry with the new min entry in the reservoir - SetNextEntry(); -} - -void BaseReservoirSampling::UpdateMinWeightThreshold() { - if (!reservoir_weights.empty()) { - min_weight_threshold = -reservoir_weights.top().first; - min_weighted_entry_index = reservoir_weights.top().second; - return; - } - min_weight_threshold = 1; -} - -void BaseReservoirSampling::FillWeights(SelectionVector &sel, idx_t &sel_size) { - if (!reservoir_weights.empty()) { - return; - } - D_ASSERT(reservoir_weights.empty()); - auto num_entries_seen_normalized = num_entries_seen_total / FIXED_SAMPLE_SIZE; - auto min_weight = GetMinWeightFromTuplesSeen(num_entries_seen_normalized); - for (idx_t i = 0; i < sel_size; i++) { - auto weight = random.NextRandom(min_weight, 1); - reservoir_weights.emplace(-weight, i); - } - D_ASSERT(reservoir_weights.size() <= sel_size); - SetNextEntry(); -} - -} // namespace duckdb diff --git a/src/execution/sample/reservoir_sample.cpp b/src/execution/sample/reservoir_sample.cpp deleted file mode 100644 index 58056920eaf8..000000000000 --- a/src/execution/sample/reservoir_sample.cpp +++ /dev/null @@ -1,931 +0,0 @@ -#include "duckdb/execution/reservoir_sample.hpp" -#include "duckdb/common/types/data_chunk.hpp" -#include "duckdb/common/vector_operations/vector_operations.hpp" -#include - -namespace duckdb { - -std::pair BlockingSample::PopFromWeightQueue() { - D_ASSERT(base_reservoir_sample && !base_reservoir_sample->reservoir_weights.empty()); - auto ret = base_reservoir_sample->reservoir_weights.top(); - base_reservoir_sample->reservoir_weights.pop(); - - base_reservoir_sample->UpdateMinWeightThreshold(); - D_ASSERT(base_reservoir_sample->min_weight_threshold > 0); - return ret; -} - -double BlockingSample::GetMinWeightThreshold() { - return base_reservoir_sample->min_weight_threshold; -} - -idx_t BlockingSample::GetPriorityQueueSize() { - return base_reservoir_sample->reservoir_weights.size(); -} - -void BlockingSample::Destroy() { - destroyed = true; -} - -void ReservoirChunk::Serialize(Serializer &serializer) const { - chunk.Serialize(serializer); -} - -unique_ptr ReservoirChunk::Deserialize(Deserializer &deserializer) { - auto result = make_uniq(); - result->chunk.Deserialize(deserializer); - return result; -} - -unique_ptr ReservoirChunk::Copy() const { - auto copy = make_uniq(); - copy->chunk.Initialize(Allocator::DefaultAllocator(), chunk.GetTypes()); - - chunk.Copy(copy->chunk); - return copy; -} - -ReservoirSample::ReservoirSample(idx_t sample_count, unique_ptr reservoir_chunk) - : ReservoirSample(Allocator::DefaultAllocator(), sample_count, 1) { - if (reservoir_chunk) { - this->reservoir_chunk = std::move(reservoir_chunk); - sel_size = this->reservoir_chunk->chunk.size(); - sel = SelectionVector(FIXED_SAMPLE_SIZE); - for (idx_t i = 0; i < sel_size; i++) { - sel.set_index(i, i); - } - ExpandSerializedSample(); - } - stats_sample = true; -} - -ReservoirSample::ReservoirSample(Allocator &allocator, idx_t sample_count, int64_t seed) - : BlockingSample(seed), sample_count(sample_count), allocator(allocator) { - base_reservoir_sample = make_uniq(seed); - type = SampleType::RESERVOIR_SAMPLE; - reservoir_chunk = nullptr; - stats_sample = false; - sel = SelectionVector(sample_count); - sel_size = 0; -} - -idx_t ReservoirSample::GetSampleCount() { - return sample_count; -} - -idx_t ReservoirSample::NumSamplesCollected() const { - if (!reservoir_chunk) { - return 0; - } - return reservoir_chunk->chunk.size(); -} - -SamplingState ReservoirSample::GetSamplingState() const { - if (base_reservoir_sample->reservoir_weights.empty()) { - return SamplingState::RANDOM; - } - return SamplingState::RESERVOIR; -} - -idx_t ReservoirSample::GetActiveSampleCount() const { - switch (GetSamplingState()) { - case SamplingState::RANDOM: - return sel_size; - case SamplingState::RESERVOIR: - return base_reservoir_sample->reservoir_weights.size(); - default: - throw InternalException("Sampling State is INVALID"); - } -} - -idx_t ReservoirSample::GetTuplesSeen() const { - return base_reservoir_sample->num_entries_seen_total; -} - -DataChunk &ReservoirSample::Chunk() { - D_ASSERT(reservoir_chunk); - return reservoir_chunk->chunk; -} - -unique_ptr ReservoirSample::GetChunk() { - if (destroyed || !reservoir_chunk || Chunk().size() == 0) { - return nullptr; - } - // cannot destory internal samples. - auto ret = make_uniq(); - - SelectionVector ret_sel(STANDARD_VECTOR_SIZE); - idx_t collected_samples = GetActiveSampleCount(); - - if (collected_samples == 0) { - return nullptr; - } - - idx_t samples_remaining; - idx_t return_chunk_size; - if (collected_samples > STANDARD_VECTOR_SIZE) { - samples_remaining = collected_samples - STANDARD_VECTOR_SIZE; - return_chunk_size = STANDARD_VECTOR_SIZE; - } else { - samples_remaining = 0; - return_chunk_size = collected_samples; - } - - for (idx_t i = samples_remaining; i < collected_samples; i++) { - // pop samples and reduce size of selection vector. - if (GetSamplingState() == SamplingState::RESERVOIR) { - auto top = PopFromWeightQueue(); - ret_sel.set_index(i - samples_remaining, sel.get_index(top.second)); - } else { - ret_sel.set_index(i - samples_remaining, sel.get_index(i)); - } - sel_size -= 1; - } - - auto reservoir_types = Chunk().GetTypes(); - - ret->Initialize(allocator, reservoir_types, STANDARD_VECTOR_SIZE); - ret->Slice(Chunk(), ret_sel, return_chunk_size); - ret->SetCardinality(return_chunk_size); - return ret; -} - -unique_ptr ReservoirSample::CreateNewSampleChunk(vector &types, idx_t size) const { - auto new_sample_chunk = make_uniq(); - new_sample_chunk->chunk.Initialize(Allocator::DefaultAllocator(), types, size); - - // set the NULL columns correctly - for (idx_t col_idx = 0; col_idx < types.size(); col_idx++) { - if (!ValidSampleType(types[col_idx]) && stats_sample) { - new_sample_chunk->chunk.data[col_idx].SetVectorType(VectorType::CONSTANT_VECTOR); - ConstantVector::SetNull(new_sample_chunk->chunk.data[col_idx], true); - } - } - return new_sample_chunk; -} - -void ReservoirSample::Vacuum() { - Verify(); - if (NumSamplesCollected() <= FIXED_SAMPLE_SIZE || !reservoir_chunk || destroyed) { - // sample is destroyed or too small to shrink - return; - } - - auto ret = Copy(); - auto ret_reservoir = duckdb::unique_ptr_cast(std::move(ret)); - reservoir_chunk = std::move(ret_reservoir->reservoir_chunk); - sel = std::move(ret_reservoir->sel); - sel_size = ret_reservoir->sel_size; - - Verify(); - // We should only have one sample chunk now. - D_ASSERT(Chunk().size() > 0 && Chunk().size() <= sample_count); -} - -unique_ptr ReservoirSample::Copy() const { - - auto ret = make_uniq(sample_count); - ret->stats_sample = stats_sample; - - ret->base_reservoir_sample = base_reservoir_sample->Copy(); - ret->destroyed = destroyed; - - if (!reservoir_chunk || destroyed) { - return unique_ptr_cast(std::move(ret)); - } - - D_ASSERT(reservoir_chunk); - - // create a new sample chunk to store new samples - auto types = reservoir_chunk->chunk.GetTypes(); - // how many values should be copied - idx_t values_to_copy = MinValue(GetActiveSampleCount(), sample_count); - - auto new_sample_chunk = CreateNewSampleChunk(types, GetReservoirChunkCapacity()); - - SelectionVector sel_copy(sel); - - ret->reservoir_chunk = std::move(new_sample_chunk); - ret->UpdateSampleAppend(ret->reservoir_chunk->chunk, reservoir_chunk->chunk, sel_copy, values_to_copy); - ret->sel = SelectionVector(values_to_copy); - for (idx_t i = 0; i < values_to_copy; i++) { - ret->sel.set_index(i, i); - } - ret->sel_size = sel_size; - D_ASSERT(ret->reservoir_chunk->chunk.size() <= sample_count); - ret->Verify(); - return unique_ptr_cast(std::move(ret)); -} - -void ReservoirSample::ConvertToReservoirSample() { - D_ASSERT(sel_size <= sample_count); - base_reservoir_sample->FillWeights(sel, sel_size); -} - -vector ReservoirSample::GetRandomizedVector(uint32_t range, uint32_t size) const { - vector ret; - ret.reserve(range); - for (uint32_t i = 0; i < range; i++) { - ret.push_back(i); - } - for (uint32_t i = 0; i < size; i++) { - uint32_t random_shuffle = base_reservoir_sample->random.NextRandomInteger32(i, range); - if (random_shuffle == i) { - // leave the value where it is - continue; - } - uint32_t tmp = ret[random_shuffle]; - // basically replacing the tuple that was at index actual_sample_indexes[random_shuffle] - ret[random_shuffle] = ret[i]; - ret[i] = tmp; - } - return ret; -} - -void ReservoirSample::SimpleMerge(ReservoirSample &other) { - D_ASSERT(GetPriorityQueueSize() == 0); - D_ASSERT(other.GetPriorityQueueSize() == 0); - D_ASSERT(GetSamplingState() == SamplingState::RANDOM); - D_ASSERT(other.GetSamplingState() == SamplingState::RANDOM); - - if (other.GetActiveSampleCount() == 0 && other.GetTuplesSeen() == 0) { - return; - } - - if (GetActiveSampleCount() == 0 && GetTuplesSeen() == 0) { - sel = SelectionVector(other.sel); - sel_size = other.sel_size; - base_reservoir_sample->num_entries_seen_total = other.GetTuplesSeen(); - return; - } - - idx_t total_seen = GetTuplesSeen() + other.GetTuplesSeen(); - - auto weight_tuples_this = static_cast(GetTuplesSeen()) / static_cast(total_seen); - auto weight_tuples_other = static_cast(other.GetTuplesSeen()) / static_cast(total_seen); - - // If weights don't add up to 1, most likely a simple merge occured and no new samples were added. - // if that is the case, add the missing weight to the lower weighted sample to adjust. - // this is to avoid cases where if you have a 20k row table and add another 20k rows row by row - // then eventually the missing weights will add up, and get you a more even distribution - if (weight_tuples_this + weight_tuples_other < 1) { - weight_tuples_other += 1 - (weight_tuples_other + weight_tuples_this); - } - - idx_t keep_from_this = 0; - idx_t keep_from_other = 0; - D_ASSERT(stats_sample); - D_ASSERT(sample_count == FIXED_SAMPLE_SIZE); - D_ASSERT(sample_count == other.sample_count); - auto sample_count_double = static_cast(sample_count); - - if (weight_tuples_this > weight_tuples_other) { - keep_from_this = MinValue(static_cast(round(sample_count_double * weight_tuples_this)), - GetActiveSampleCount()); - keep_from_other = MinValue(sample_count - keep_from_this, other.GetActiveSampleCount()); - } else { - keep_from_other = MinValue(static_cast(round(sample_count_double * weight_tuples_other)), - other.GetActiveSampleCount()); - keep_from_this = MinValue(sample_count - keep_from_other, GetActiveSampleCount()); - } - - D_ASSERT(keep_from_this <= GetActiveSampleCount()); - D_ASSERT(keep_from_other <= other.GetActiveSampleCount()); - D_ASSERT(keep_from_other + keep_from_this <= FIXED_SAMPLE_SIZE); - idx_t size_after_merge = MinValue(keep_from_other + keep_from_this, FIXED_SAMPLE_SIZE); - - // Check if appending the other samples to this will go over the sample chunk size - if (reservoir_chunk->chunk.size() + keep_from_other > GetReservoirChunkCapacity()) { - Vacuum(); - } - - D_ASSERT(size_after_merge <= other.GetActiveSampleCount() + GetActiveSampleCount()); - SelectionVector chunk_sel(keep_from_other); - auto offset = reservoir_chunk->chunk.size(); - for (idx_t i = keep_from_this; i < size_after_merge; i++) { - if (i >= GetActiveSampleCount()) { - D_ASSERT(sel_size >= GetActiveSampleCount()); - sel.set_index(GetActiveSampleCount(), offset); - sel_size += 1; - } else { - sel.set_index(i, offset); - } - chunk_sel.set_index(i - keep_from_this, other.sel.get_index(i - keep_from_this)); - offset += 1; - } - - D_ASSERT(GetActiveSampleCount() == size_after_merge); - - // Copy the rows that make it to the sample from other and put them into this. - UpdateSampleAppend(reservoir_chunk->chunk, other.reservoir_chunk->chunk, chunk_sel, keep_from_other); - base_reservoir_sample->num_entries_seen_total += other.GetTuplesSeen(); - - // if THIS has too many samples now, we conver it to a slower sample. - if (GetTuplesSeen() >= FIXED_SAMPLE_SIZE * FAST_TO_SLOW_THRESHOLD) { - ConvertToReservoirSample(); - } - Verify(); -} - -void ReservoirSample::WeightedMerge(ReservoirSample &other_sample) { - D_ASSERT(GetSamplingState() == SamplingState::RESERVOIR); - D_ASSERT(other_sample.GetSamplingState() == SamplingState::RESERVOIR); - - // Find out how many samples we want to keep. - idx_t total_samples = GetActiveSampleCount() + other_sample.GetActiveSampleCount(); - idx_t total_samples_seen = - base_reservoir_sample->num_entries_seen_total + other_sample.base_reservoir_sample->num_entries_seen_total; - idx_t num_samples_to_keep = MinValue(total_samples, MinValue(sample_count, total_samples_seen)); - - D_ASSERT(GetActiveSampleCount() <= num_samples_to_keep); - D_ASSERT(total_samples <= FIXED_SAMPLE_SIZE * 2); - - // pop from base base_reservoir weights until there are num_samples_to_keep left. - vector this_indexes_to_replace; - for (idx_t i = num_samples_to_keep; i < total_samples; i++) { - auto min_weight_this = base_reservoir_sample->min_weight_threshold; - auto min_weight_other = other_sample.base_reservoir_sample->min_weight_threshold; - // min weight threshol is always positive - if (min_weight_this > min_weight_other) { - // pop from other - other_sample.base_reservoir_sample->reservoir_weights.pop(); - other_sample.base_reservoir_sample->UpdateMinWeightThreshold(); - } else { - auto top_this = PopFromWeightQueue(); - this_indexes_to_replace.push_back(top_this.second); - base_reservoir_sample->UpdateMinWeightThreshold(); - } - } - - D_ASSERT(other_sample.GetPriorityQueueSize() + GetPriorityQueueSize() <= FIXED_SAMPLE_SIZE); - D_ASSERT(other_sample.GetPriorityQueueSize() + GetPriorityQueueSize() == num_samples_to_keep); - D_ASSERT(other_sample.reservoir_chunk->chunk.GetTypes() == reservoir_chunk->chunk.GetTypes()); - - // Prepare a selection vector to copy data from the other sample chunk to this sample chunk - SelectionVector sel_other(other_sample.GetPriorityQueueSize()); - D_ASSERT(GetPriorityQueueSize() <= num_samples_to_keep); - D_ASSERT(other_sample.GetPriorityQueueSize() >= this_indexes_to_replace.size()); - idx_t chunk_offset = 0; - - // Now push weights from other.base_reservoir_sample to this - // Depending on how many sample values "this" has, we either need to add to the selection vector - // Or replace values in "this'" selection vector - idx_t i = 0; - while (other_sample.GetPriorityQueueSize() > 0) { - auto other_top = other_sample.PopFromWeightQueue(); - idx_t index_for_new_pair = chunk_offset + reservoir_chunk->chunk.size(); - - // update the sel used to copy values from other to this - sel_other.set_index(chunk_offset, other_top.second); - if (i < this_indexes_to_replace.size()) { - auto replacement_index = this_indexes_to_replace[i]; - sel.set_index(replacement_index, index_for_new_pair); - other_top.second = replacement_index; - } else { - sel.set_index(sel_size, index_for_new_pair); - other_top.second = sel_size; - sel_size += 1; - } - - // make sure that the sample indexes are (this.sample_chunk.size() + chunk_offfset) - base_reservoir_sample->reservoir_weights.push(other_top); - chunk_offset += 1; - i += 1; - } - - D_ASSERT(GetPriorityQueueSize() == num_samples_to_keep); - - base_reservoir_sample->UpdateMinWeightThreshold(); - D_ASSERT(base_reservoir_sample->min_weight_threshold > 0); - base_reservoir_sample->num_entries_seen_total = GetTuplesSeen() + other_sample.GetTuplesSeen(); - - UpdateSampleAppend(reservoir_chunk->chunk, other_sample.reservoir_chunk->chunk, sel_other, chunk_offset); - if (reservoir_chunk->chunk.size() > FIXED_SAMPLE_SIZE * (FIXED_SAMPLE_SIZE_MULTIPLIER - 3)) { - Vacuum(); - } - - Verify(); -} - -void ReservoirSample::Merge(unique_ptr other) { - if (destroyed || other->destroyed) { - Destroy(); - return; - } - - D_ASSERT(other->type == SampleType::RESERVOIR_SAMPLE); - auto &other_sample = other->Cast(); - - // if the other sample has not collected anything yet return - if (!other_sample.reservoir_chunk || other_sample.reservoir_chunk->chunk.size() == 0) { - return; - } - - // this has not collected samples, take over the other - if (!reservoir_chunk || reservoir_chunk->chunk.size() == 0) { - base_reservoir_sample = std::move(other->base_reservoir_sample); - reservoir_chunk = std::move(other_sample.reservoir_chunk); - sel = SelectionVector(other_sample.sel); - sel_size = other_sample.sel_size; - Verify(); - return; - } - //! Both samples are still in "fast sampling" method - if (GetSamplingState() == SamplingState::RANDOM && other_sample.GetSamplingState() == SamplingState::RANDOM) { - SimpleMerge(other_sample); - return; - } - - // One or none of the samples are in "Fast Sampling" method. - // When this is the case, switch both to slow sampling - ConvertToReservoirSample(); - other_sample.ConvertToReservoirSample(); - WeightedMerge(other_sample); -} - -void ReservoirSample::ShuffleSel(SelectionVector &sel, idx_t range, idx_t size) const { - auto randomized = GetRandomizedVector(static_cast(range), static_cast(size)); - SelectionVector original_sel(range); - for (idx_t i = 0; i < range; i++) { - original_sel.set_index(i, sel.get_index(i)); - } - for (idx_t i = 0; i < size; i++) { - sel.set_index(i, original_sel.get_index(randomized[i])); - } -} - -void ReservoirSample::NormalizeWeights() { - vector> tmp_weights; - while (!base_reservoir_sample->reservoir_weights.empty()) { - auto top = base_reservoir_sample->reservoir_weights.top(); - tmp_weights.push_back(std::move(top)); - base_reservoir_sample->reservoir_weights.pop(); - } - std::sort(tmp_weights.begin(), tmp_weights.end(), - [&](std::pair a, std::pair b) { return a.second < b.second; }); - for (idx_t i = 0; i < tmp_weights.size(); i++) { - base_reservoir_sample->reservoir_weights.emplace(tmp_weights.at(i).first, i); - } - base_reservoir_sample->SetNextEntry(); -} - -void ReservoirSample::EvictOverBudgetSamples() { - Verify(); - if (!reservoir_chunk || destroyed) { - return; - } - - // since this is for serialization, we really need to make sure keep a - // minimum of 1% of the rows or 2048 rows - idx_t num_samples_to_keep = - MinValue(FIXED_SAMPLE_SIZE, static_cast(SAVE_PERCENTAGE * static_cast(GetTuplesSeen()))); - - if (num_samples_to_keep <= 0) { - reservoir_chunk->chunk.SetCardinality(0); - return; - } - - if (num_samples_to_keep == sample_count) { - return; - } - - // if we over sampled, make sure we only keep the highest percentage samples - std::unordered_set selections_to_delete; - - while (num_samples_to_keep < GetPriorityQueueSize()) { - auto top = PopFromWeightQueue(); - D_ASSERT(top.second < sel_size); - selections_to_delete.emplace(top.second); - } - - // set up reservoir chunk for the reservoir sample - D_ASSERT(reservoir_chunk->chunk.size() <= sample_count); - // create a new sample chunk to store new samples - auto types = reservoir_chunk->chunk.GetTypes(); - D_ASSERT(num_samples_to_keep <= sample_count); - D_ASSERT(stats_sample); - D_ASSERT(sample_count == FIXED_SAMPLE_SIZE); - auto new_reservoir_chunk = CreateNewSampleChunk(types, sample_count); - - // The current selection vector can potentially have 2048 valid mappings. - // If we need to save a sample with less rows than that, we need to do the following - // 1. Create a new selection vector that doesn't point to the rows we are evicting - SelectionVector new_sel(num_samples_to_keep); - idx_t offset = 0; - for (idx_t i = 0; i < num_samples_to_keep + selections_to_delete.size(); i++) { - if (selections_to_delete.find(i) == selections_to_delete.end()) { - D_ASSERT(i - offset < num_samples_to_keep); - new_sel.set_index(i - offset, sel.get_index(i)); - } else { - offset += 1; - } - } - // 2. Update row_ids in our weights so that they don't store rows ids to - // indexes in the selection vector that have been evicted. - if (!selections_to_delete.empty()) { - NormalizeWeights(); - } - - D_ASSERT(reservoir_chunk->chunk.GetTypes() == new_reservoir_chunk->chunk.GetTypes()); - - UpdateSampleAppend(new_reservoir_chunk->chunk, reservoir_chunk->chunk, new_sel, num_samples_to_keep); - // set the cardinality - new_reservoir_chunk->chunk.SetCardinality(num_samples_to_keep); - reservoir_chunk = std::move(new_reservoir_chunk); - sel_size = num_samples_to_keep; - base_reservoir_sample->UpdateMinWeightThreshold(); -} - -void ReservoirSample::ExpandSerializedSample() { - if (!reservoir_chunk) { - return; - } - - auto types = reservoir_chunk->chunk.GetTypes(); - auto new_res_chunk = CreateNewSampleChunk(types, GetReservoirChunkCapacity()); - auto copy_count = reservoir_chunk->chunk.size(); - SelectionVector tmp_sel = SelectionVector(0, copy_count); - UpdateSampleAppend(new_res_chunk->chunk, reservoir_chunk->chunk, tmp_sel, copy_count); - new_res_chunk->chunk.SetCardinality(copy_count); - std::swap(reservoir_chunk, new_res_chunk); -} - -idx_t ReservoirSample::GetReservoirChunkCapacity() const { - return sample_count + (FIXED_SAMPLE_SIZE_MULTIPLIER * FIXED_SAMPLE_SIZE); -} - -idx_t ReservoirSample::FillReservoir(DataChunk &chunk) { - - idx_t ingested_count = 0; - if (!reservoir_chunk) { - if (chunk.size() > FIXED_SAMPLE_SIZE) { - throw InternalException("Creating sample with DataChunk that is larger than the fixed sample size"); - } - auto types = chunk.GetTypes(); - // create a new sample chunk to store new samples - reservoir_chunk = CreateNewSampleChunk(types, GetReservoirChunkCapacity()); - } - - idx_t actual_sample_index_start = GetActiveSampleCount(); - D_ASSERT(reservoir_chunk->chunk.ColumnCount() == chunk.ColumnCount()); - - if (reservoir_chunk->chunk.size() < sample_count) { - ingested_count = MinValue(sample_count - reservoir_chunk->chunk.size(), chunk.size()); - auto random_other_sel = - GetRandomizedVector(static_cast(ingested_count), static_cast(ingested_count)); - SelectionVector sel_for_input_chunk(ingested_count); - for (idx_t i = 0; i < ingested_count; i++) { - sel.set_index(actual_sample_index_start + i, actual_sample_index_start + i); - sel_for_input_chunk.set_index(i, random_other_sel[i]); - } - UpdateSampleAppend(reservoir_chunk->chunk, chunk, sel_for_input_chunk, ingested_count); - sel_size += ingested_count; - } - D_ASSERT(GetActiveSampleCount() <= sample_count); - D_ASSERT(GetActiveSampleCount() >= ingested_count); - // always return how many tuples were ingested - return ingested_count; -} - -void ReservoirSample::Destroy() { - destroyed = true; -} - -SelectionVectorHelper ReservoirSample::GetReplacementIndexes(idx_t sample_chunk_offset, - idx_t theoretical_chunk_length) { - if (GetSamplingState() == SamplingState::RANDOM) { - return GetReplacementIndexesFast(sample_chunk_offset, theoretical_chunk_length); - } - return GetReplacementIndexesSlow(sample_chunk_offset, theoretical_chunk_length); -} - -SelectionVectorHelper ReservoirSample::GetReplacementIndexesFast(idx_t sample_chunk_offset, idx_t chunk_length) { - - // how much weight to the other tuples have compared to the ones in this chunk? - auto weight_tuples_other = static_cast(chunk_length) / static_cast(GetTuplesSeen() + chunk_length); - auto num_to_pop = static_cast(round(weight_tuples_other * static_cast(sample_count))); - D_ASSERT(num_to_pop <= sample_count); - D_ASSERT(num_to_pop <= sel_size); - SelectionVectorHelper ret; - - if (num_to_pop == 0) { - ret.sel = SelectionVector(num_to_pop); - ret.size = 0; - return ret; - } - std::unordered_map replacement_indexes; - SelectionVector chunk_sel(num_to_pop); - - auto random_indexes_chunk = GetRandomizedVector(static_cast(chunk_length), num_to_pop); - auto random_sel_indexes = GetRandomizedVector(static_cast(sel_size), num_to_pop); - for (idx_t i = 0; i < num_to_pop; i++) { - // update the selection vector for the reservoir sample - chunk_sel.set_index(i, random_indexes_chunk[i]); - // sel is not guaratneed to be random, so we update the indexes according to our - // random sel indexes. - sel.set_index(random_sel_indexes[i], sample_chunk_offset + i); - } - - D_ASSERT(sel_size == sample_count); - - ret.sel = SelectionVector(chunk_sel); - ret.size = num_to_pop; - return ret; -} - -SelectionVectorHelper ReservoirSample::GetReplacementIndexesSlow(const idx_t sample_chunk_offset, - const idx_t chunk_length) { - idx_t remaining = chunk_length; - std::unordered_map ret_map; - idx_t sample_chunk_index = 0; - - idx_t base_offset = 0; - - while (true) { - idx_t offset = - base_reservoir_sample->next_index_to_sample - base_reservoir_sample->num_entries_to_skip_b4_next_sample; - if (offset >= remaining) { - // not in this chunk! increment current count and go to the next chunk - base_reservoir_sample->num_entries_to_skip_b4_next_sample += remaining; - break; - } - // in this chunk! replace the element - // ret[index_in_new_chunk] = index_in_sample_chunk (the sample chunk offset will be applied later) - // D_ASSERT(sample_chunk_index == ret.size()); - ret_map[base_offset + offset] = sample_chunk_index; - double r2 = base_reservoir_sample->random.NextRandom32(base_reservoir_sample->min_weight_threshold, 1); - // replace element in our max_heap - // first get the top most pair - const auto top = PopFromWeightQueue(); - const auto index = top.second; - const auto index_in_sample_chunk = sample_chunk_offset + sample_chunk_index; - sel.set_index(index, index_in_sample_chunk); - base_reservoir_sample->ReplaceElementWithIndex(index, r2, false); - - sample_chunk_index += 1; - // shift the chunk forward - remaining -= offset; - base_offset += offset; - } - - // create selection vector to return - SelectionVector ret_sel(ret_map.size()); - D_ASSERT(sel_size == sample_count); - for (auto &kv : ret_map) { - ret_sel.set_index(kv.second, kv.first); - } - SelectionVectorHelper ret; - ret.sel = SelectionVector(ret_sel); - ret.size = static_cast(ret_map.size()); - return ret; -} - -void ReservoirSample::Finalize() { -} - -bool ReservoirSample::ValidSampleType(const LogicalType &type) { - return type.IsNumeric(); -} - -void ReservoirSample::UpdateSampleAppend(DataChunk &this_, DataChunk &other, SelectionVector &other_sel, - idx_t append_count) const { - idx_t new_size = this_.size() + append_count; - if (other.size() == 0) { - return; - } - D_ASSERT(this_.GetTypes() == other.GetTypes()); - - // UpdateSampleAppend(this_, other, other_sel, append_count); - D_ASSERT(this_.GetTypes() == other.GetTypes()); - auto types = reservoir_chunk->chunk.GetTypes(); - - for (idx_t i = 0; i < reservoir_chunk->chunk.ColumnCount(); i++) { - auto col_type = types[i]; - if (ValidSampleType(col_type) || !stats_sample) { - D_ASSERT(this_.data[i].GetVectorType() == VectorType::FLAT_VECTOR); - VectorOperations::Copy(other.data[i], this_.data[i], other_sel, append_count, 0, this_.size()); - } - } - this_.SetCardinality(new_size); -} - -void ReservoirSample::AddToReservoir(DataChunk &chunk) { - if (destroyed || chunk.size() == 0) { - return; - } - - idx_t tuples_consumed = FillReservoir(chunk); - base_reservoir_sample->num_entries_seen_total += tuples_consumed; - D_ASSERT(sample_count == 0 || reservoir_chunk->chunk.size() >= 1); - - if (tuples_consumed == chunk.size()) { - return; - } - - // the chunk filled the first FIXED_SAMPLE_SIZE chunk but still has tuples remaining - // slice the chunk and call AddToReservoir again. - if (tuples_consumed != chunk.size() && tuples_consumed != 0) { - // Fill reservoir consumed some of the chunk to reach FIXED_SAMPLE_SIZE - // now we need to - // So we slice it and call AddToReservoir - auto slice = make_uniq(); - auto samples_remaining = chunk.size() - tuples_consumed; - auto types = chunk.GetTypes(); - SelectionVector input_sel(samples_remaining); - for (idx_t i = 0; i < samples_remaining; i++) { - input_sel.set_index(i, tuples_consumed + i); - } - slice->Initialize(Allocator::DefaultAllocator(), types, samples_remaining); - slice->Slice(chunk, input_sel, samples_remaining); - slice->SetCardinality(samples_remaining); - AddToReservoir(*slice); - return; - } - - // at this point we should have collected at least sample count samples - D_ASSERT(GetActiveSampleCount() >= sample_count); - - auto chunk_sel = GetReplacementIndexes(reservoir_chunk->chunk.size(), chunk.size()); - - if (chunk_sel.size == 0) { - // not adding any samples - base_reservoir_sample->num_entries_seen_total += chunk.size(); - return; - } - idx_t size = chunk_sel.size; - D_ASSERT(size <= chunk.size()); - - UpdateSampleAppend(reservoir_chunk->chunk, chunk, chunk_sel.sel, size); - - base_reservoir_sample->num_entries_seen_total += chunk.size(); - D_ASSERT(base_reservoir_sample->reservoir_weights.size() == 0 || - base_reservoir_sample->reservoir_weights.size() == sample_count); - - Verify(); - - // if we are over the threshold, we ned to swith to slow sampling. - if (GetSamplingState() == SamplingState::RANDOM && GetTuplesSeen() >= FIXED_SAMPLE_SIZE * FAST_TO_SLOW_THRESHOLD) { - ConvertToReservoirSample(); - } - if (reservoir_chunk->chunk.size() >= (GetReservoirChunkCapacity() - (static_cast(FIXED_SAMPLE_SIZE) * 3))) { - Vacuum(); - } -} - -void ReservoirSample::Verify() { -#ifdef DEBUG - if (destroyed) { - return; - } - if (GetPriorityQueueSize() == 0) { - D_ASSERT(GetActiveSampleCount() <= sample_count); - D_ASSERT(GetTuplesSeen() >= GetActiveSampleCount()); - return; - } - if (NumSamplesCollected() > sample_count) { - D_ASSERT(GetPriorityQueueSize() == sample_count); - } else if (NumSamplesCollected() <= sample_count && GetPriorityQueueSize() > 0) { - // it's possible to collect more samples than your priority queue size. - // see sample_converts_to_reservoir_sample.test - D_ASSERT(NumSamplesCollected() >= GetPriorityQueueSize()); - } - auto base_reservoir_copy = base_reservoir_sample->Copy(); - std::unordered_map index_count; - while (!base_reservoir_copy->reservoir_weights.empty()) { - auto &pair = base_reservoir_copy->reservoir_weights.top(); - if (index_count.find(pair.second) == index_count.end()) { - index_count[pair.second] = 1; - base_reservoir_copy->reservoir_weights.pop(); - } else { - index_count[pair.second] += 1; - base_reservoir_copy->reservoir_weights.pop(); - throw InternalException("Duplicate selection index in reservoir weights"); - } - } - // TODO: Verify the Sel as well. No duplicate indices. - - if (reservoir_chunk) { - reservoir_chunk->chunk.Verify(); - } -#endif -} - -ReservoirSamplePercentage::ReservoirSamplePercentage(double percentage, int64_t seed, idx_t reservoir_sample_size) - : BlockingSample(seed), allocator(Allocator::DefaultAllocator()), sample_percentage(percentage / 100.0), - reservoir_sample_size(reservoir_sample_size), current_count(0), is_finalized(false) { - current_sample = make_uniq(allocator, reservoir_sample_size, base_reservoir_sample->random()); - type = SampleType::RESERVOIR_PERCENTAGE_SAMPLE; -} - -ReservoirSamplePercentage::ReservoirSamplePercentage(Allocator &allocator, double percentage, int64_t seed) - : BlockingSample(seed), allocator(allocator), sample_percentage(percentage / 100.0), current_count(0), - is_finalized(false) { - reservoir_sample_size = (idx_t)(sample_percentage * RESERVOIR_THRESHOLD); - current_sample = make_uniq(allocator, reservoir_sample_size, base_reservoir_sample->random()); - type = SampleType::RESERVOIR_PERCENTAGE_SAMPLE; -} - -ReservoirSamplePercentage::ReservoirSamplePercentage(double percentage, int64_t seed) - : ReservoirSamplePercentage(Allocator::DefaultAllocator(), percentage, seed) { -} - -void ReservoirSamplePercentage::AddToReservoir(DataChunk &input) { - base_reservoir_sample->num_entries_seen_total += input.size(); - if (current_count + input.size() > RESERVOIR_THRESHOLD) { - // we don't have enough space in our current reservoir - // first check what we still need to append to the current sample - idx_t append_to_current_sample_count = RESERVOIR_THRESHOLD - current_count; - idx_t append_to_next_sample = input.size() - append_to_current_sample_count; - if (append_to_current_sample_count > 0) { - // we have elements remaining, first add them to the current sample - if (append_to_next_sample > 0) { - // we need to also add to the next sample - DataChunk new_chunk; - new_chunk.InitializeEmpty(input.GetTypes()); - new_chunk.Slice(input, *FlatVector::IncrementalSelectionVector(), append_to_current_sample_count); - new_chunk.Flatten(); - current_sample->AddToReservoir(new_chunk); - } else { - input.Flatten(); - input.SetCardinality(append_to_current_sample_count); - current_sample->AddToReservoir(input); - } - } - if (append_to_next_sample > 0) { - // slice the input for the remainder - SelectionVector sel(append_to_next_sample); - for (idx_t i = append_to_current_sample_count; i < append_to_next_sample + append_to_current_sample_count; - i++) { - sel.set_index(i - append_to_current_sample_count, i); - } - input.Slice(sel, append_to_next_sample); - } - // now our first sample is filled: append it to the set of finished samples - finished_samples.push_back(std::move(current_sample)); - - // allocate a new sample, and potentially add the remainder of the current input to that sample - current_sample = make_uniq(allocator, reservoir_sample_size, base_reservoir_sample->random()); - if (append_to_next_sample > 0) { - current_sample->AddToReservoir(input); - } - current_count = append_to_next_sample; - } else { - // we can just append to the current sample - current_count += input.size(); - current_sample->AddToReservoir(input); - } -} - -unique_ptr ReservoirSamplePercentage::GetChunk() { - // reservoir sample percentage should never stay - if (!is_finalized) { - Finalize(); - } - while (!finished_samples.empty()) { - auto &front = finished_samples.front(); - auto chunk = front->GetChunk(); - if (chunk && chunk->size() > 0) { - return chunk; - } - // move to the next sample - finished_samples.erase(finished_samples.begin()); - } - return nullptr; -} - -unique_ptr ReservoirSamplePercentage::Copy() const { - throw InternalException("Cannot call Copy on ReservoirSample Percentage"); -} - -void ReservoirSamplePercentage::Finalize() { - // need to finalize the current sample, if any - // we are finializing, so we are starting to return chunks. Our last chunk has - // sample_percentage * RESERVOIR_THRESHOLD entries that hold samples. - // if our current count is less than the sample_percentage * RESERVOIR_THRESHOLD - // then we have sampled too much for the current_sample and we need to redo the sample - // otherwise we can just push the current sample back - // Imagine sampling 70% of 100 rows (so 70 rows). We allocate sample_percentage * RESERVOIR_THRESHOLD - // ----------------------------------------- - auto sampled_more_than_required = - static_cast(current_count) > sample_percentage * RESERVOIR_THRESHOLD || finished_samples.empty(); - if (current_count > 0 && sampled_more_than_required) { - // create a new sample - auto new_sample_size = static_cast(round(sample_percentage * static_cast(current_count))); - auto new_sample = make_uniq(allocator, new_sample_size, base_reservoir_sample->random()); - while (true) { - auto chunk = current_sample->GetChunk(); - if (!chunk || chunk->size() == 0) { - break; - } - new_sample->AddToReservoir(*chunk); - } - finished_samples.push_back(std::move(new_sample)); - } else { - finished_samples.push_back(std::move(current_sample)); - } - // when finalizing, current_sample is null. All samples are now in finished samples. - current_sample = nullptr; - is_finalized = true; -} - -} // namespace duckdb diff --git a/src/function/table/system/CMakeLists.txt b/src/function/table/system/CMakeLists.txt index c475b0112f86..1565132755e6 100644 --- a/src/function/table/system/CMakeLists.txt +++ b/src/function/table/system/CMakeLists.txt @@ -29,7 +29,6 @@ add_library_unity( pragma_metadata_info.cpp pragma_storage_info.cpp pragma_table_info.cpp - pragma_table_sample.cpp pragma_user_agent.cpp test_all_types.cpp test_vector_types.cpp) diff --git a/src/function/table/system/pragma_table_sample.cpp b/src/function/table/system/pragma_table_sample.cpp deleted file mode 100644 index 7f4122b92b62..000000000000 --- a/src/function/table/system/pragma_table_sample.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "duckdb/function/table/system_functions.hpp" - -#include "duckdb/catalog/catalog.hpp" -#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" -#include "duckdb/catalog/catalog_entry/view_catalog_entry.hpp" -#include "duckdb/parser/qualified_name.hpp" -#include "duckdb/parser/constraints/not_null_constraint.hpp" -#include "duckdb/parser/constraints/unique_constraint.hpp" -#include "duckdb/planner/expression/bound_parameter_expression.hpp" -#include "duckdb/planner/binder.hpp" - -#include "duckdb/common/exception.hpp" -#include "duckdb/common/limits.hpp" - -#include - -namespace duckdb { - -struct DuckDBTableSampleFunctionData : public TableFunctionData { - explicit DuckDBTableSampleFunctionData(CatalogEntry &entry_p) : entry(entry_p) { - } - CatalogEntry &entry; -}; - -struct DuckDBTableSampleOperatorData : public GlobalTableFunctionState { - DuckDBTableSampleOperatorData() : sample_offset(0) { - sample = nullptr; - } - idx_t sample_offset; - unique_ptr sample; -}; - -static unique_ptr DuckDBTableSampleBind(ClientContext &context, TableFunctionBindInput &input, - vector &return_types, vector &names) { - - // look up the table name in the catalog - auto qname = QualifiedName::Parse(input.inputs[0].GetValue()); - Binder::BindSchemaOrCatalog(context, qname.catalog, qname.schema); - - auto &entry = Catalog::GetEntry(context, CatalogType::TABLE_ENTRY, qname.catalog, qname.schema, qname.name); - if (entry.type != CatalogType::TABLE_ENTRY) { - throw NotImplementedException("Invalid Catalog type passed to table_sample()"); - } - auto &table_entry = entry.Cast(); - auto types = table_entry.GetTypes(); - for (auto &type : types) { - return_types.push_back(type); - } - for (idx_t i = 0; i < types.size(); i++) { - auto logical_index = LogicalIndex(i); - auto &col = table_entry.GetColumn(logical_index); - names.push_back(col.GetName()); - } - - return make_uniq(entry); -} - -unique_ptr DuckDBTableSampleInit(ClientContext &context, TableFunctionInitInput &input) { - return make_uniq(); -} - -static void DuckDBTableSampleTable(ClientContext &context, DuckDBTableSampleOperatorData &data, - TableCatalogEntry &table, DataChunk &output) { - // if table has statistics. - // copy the sample of statistics into the output chunk - if (!data.sample) { - data.sample = table.GetSample(); - } - if (data.sample) { - auto sample_chunk = data.sample->GetChunk(); - if (sample_chunk) { - sample_chunk->Copy(output, 0); - data.sample_offset += sample_chunk->size(); - } - } -} - -static void DuckDBTableSampleFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { - auto &bind_data = data_p.bind_data->Cast(); - auto &state = data_p.global_state->Cast(); - switch (bind_data.entry.type) { - case CatalogType::TABLE_ENTRY: - DuckDBTableSampleTable(context, state, bind_data.entry.Cast(), output); - break; - default: - throw NotImplementedException("Unimplemented catalog type for pragma_table_sample"); - } -} - -void DuckDBTableSample::RegisterFunction(BuiltinFunctions &set) { - set.AddFunction(TableFunction("duckdb_table_sample", {LogicalType::VARCHAR}, DuckDBTableSampleFunction, - DuckDBTableSampleBind, DuckDBTableSampleInit)); -} - -} // namespace duckdb diff --git a/src/function/table/system_functions.cpp b/src/function/table/system_functions.cpp index 972eb7e54a2f..48559545c59d 100644 --- a/src/function/table/system_functions.cpp +++ b/src/function/table/system_functions.cpp @@ -37,7 +37,6 @@ void BuiltinFunctions::RegisterSQLiteFunctions() { DuckDBSequencesFun::RegisterFunction(*this); DuckDBSettingsFun::RegisterFunction(*this); DuckDBTablesFun::RegisterFunction(*this); - DuckDBTableSample::RegisterFunction(*this); DuckDBTemporaryFilesFun::RegisterFunction(*this); DuckDBTypesFun::RegisterFunction(*this); DuckDBVariablesFun::RegisterFunction(*this); diff --git a/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp b/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp index fb9d5ae67d14..ce09c4fe54f1 100644 --- a/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp +++ b/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp @@ -35,8 +35,6 @@ class DuckTableEntry : public TableCatalogEntry { //! Get statistics of a column (physical or virtual) within the table unique_ptr GetStatistics(ClientContext &context, column_t column_id) override; - unique_ptr GetSample() override; - unique_ptr Copy(ClientContext &context) const override; void SetAsRoot() override; diff --git a/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp b/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp index 398e49974b88..b2acb05f1644 100644 --- a/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp +++ b/src/include/duckdb/catalog/catalog_entry/table_catalog_entry.hpp @@ -13,7 +13,6 @@ #include "duckdb/parser/column_list.hpp" #include "duckdb/parser/constraint.hpp" #include "duckdb/planner/bound_constraint.hpp" -#include "duckdb/storage/table/table_statistics.hpp" #include "duckdb/planner/expression.hpp" #include "duckdb/common/case_insensitive_map.hpp" #include "duckdb/catalog/catalog_entry/table_column_type.hpp" @@ -83,8 +82,6 @@ class TableCatalogEntry : public StandardEntry { //! Get statistics of a column (physical or virtual) within the table virtual unique_ptr GetStatistics(ClientContext &context, column_t column_id) = 0; - virtual unique_ptr GetSample(); - //! Returns the column index of the specified column name. //! If the column does not exist: //! If if_column_exists is true, returns DConstants::INVALID_INDEX diff --git a/src/include/duckdb/common/enum_util.hpp b/src/include/duckdb/common/enum_util.hpp index 379a23969c55..0e7a928b42c3 100644 --- a/src/include/duckdb/common/enum_util.hpp +++ b/src/include/duckdb/common/enum_util.hpp @@ -294,8 +294,6 @@ enum class SampleMethod : uint8_t; enum class SampleType : uint8_t; -enum class SamplingState : uint8_t; - enum class ScanType : uint8_t; enum class SecretDisplayType : uint8_t; @@ -784,9 +782,6 @@ const char* EnumUtil::ToChars(SampleMethod value); template<> const char* EnumUtil::ToChars(SampleType value); -template<> -const char* EnumUtil::ToChars(SamplingState value); - template<> const char* EnumUtil::ToChars(ScanType value); @@ -1322,9 +1317,6 @@ SampleMethod EnumUtil::FromString(const char *value); template<> SampleType EnumUtil::FromString(const char *value); -template<> -SamplingState EnumUtil::FromString(const char *value); - template<> ScanType EnumUtil::FromString(const char *value); diff --git a/src/include/duckdb/common/random_engine.hpp b/src/include/duckdb/common/random_engine.hpp index 8a5a3097e4f7..50db4155c04e 100644 --- a/src/include/duckdb/common/random_engine.hpp +++ b/src/include/duckdb/common/random_engine.hpp @@ -18,11 +18,11 @@ namespace duckdb { class ClientContext; struct RandomState; -class RandomEngine { -public: +struct RandomEngine { explicit RandomEngine(int64_t seed = -1); ~RandomEngine(); +public: //! Generate a random number between min and max double NextRandom(double min, double max); @@ -31,8 +31,8 @@ class RandomEngine { //! Generate a random number between 0 and 1, using 32-bits as a base double NextRandom32(); double NextRandom32(double min, double max); - uint32_t NextRandomInteger32(uint32_t min, uint32_t max); uint32_t NextRandomInteger(); + uint32_t NextRandomInteger32(uint32_t min, uint32_t max); uint32_t NextRandomInteger(uint32_t min, uint32_t max); uint64_t NextRandomInteger64(); diff --git a/src/include/duckdb/common/serializer/serializer.hpp b/src/include/duckdb/common/serializer/serializer.hpp index 97aeef51a6f6..58ab8e91a963 100644 --- a/src/include/duckdb/common/serializer/serializer.hpp +++ b/src/include/duckdb/common/serializer/serializer.hpp @@ -16,7 +16,6 @@ #include "duckdb/common/types/uhugeint.hpp" #include "duckdb/common/unordered_map.hpp" #include "duckdb/common/unordered_set.hpp" -#include "duckdb/common/queue.hpp" #include "duckdb/common/optional_idx.hpp" #include "duckdb/common/optionally_owned_ptr.hpp" #include "duckdb/common/value_operations/value_operations.hpp" diff --git a/src/include/duckdb/common/types/uuid.hpp b/src/include/duckdb/common/types/uuid.hpp index bf5ade17a15f..5573aac633c5 100644 --- a/src/include/duckdb/common/types/uuid.hpp +++ b/src/include/duckdb/common/types/uuid.hpp @@ -13,7 +13,7 @@ namespace duckdb { class ClientContext; -class RandomEngine; +struct RandomEngine; //! The UUID class contains static operations for the UUID type class UUID { diff --git a/src/include/duckdb/execution/physical_operator.hpp b/src/include/duckdb/execution/physical_operator.hpp index 50529d594f67..822b55377b74 100644 --- a/src/include/duckdb/execution/physical_operator.hpp +++ b/src/include/duckdb/execution/physical_operator.hpp @@ -164,7 +164,7 @@ class PhysicalOperator { virtual void PrepareFinalize(ClientContext &context, GlobalSinkState &sink_state) const; //! The finalize is called when ALL threads are finished execution. It is called only once per pipeline, and is //! entirely single threaded. - //! If Finalize returns SinkResultType::Finished, the sink is marked as finished + //! If Finalize returns SinkResultType::FINISHED, the sink is marked as finished virtual SinkFinalizeType Finalize(Pipeline &pipeline, Event &event, ClientContext &context, OperatorSinkFinalizeInput &input) const; //! For sinks with RequiresBatchIndex set to true, when a new batch starts being processed this method is called diff --git a/src/include/duckdb/execution/reservoir_sample.hpp b/src/include/duckdb/execution/reservoir_sample.hpp index bc815b11f9de..0edc7e073b9a 100644 --- a/src/include/duckdb/execution/reservoir_sample.hpp +++ b/src/include/duckdb/execution/reservoir_sample.hpp @@ -12,64 +12,25 @@ #include "duckdb/common/common.hpp" #include "duckdb/common/random_engine.hpp" #include "duckdb/common/types/data_chunk.hpp" -#include "duckdb/common/windows_undefs.hpp" #include "duckdb/common/queue.hpp" -// Originally intended to be the vector size, but in order to run on -// vector size = 2, we had to change it. -#define FIXED_SAMPLE_SIZE 2048 - namespace duckdb { enum class SampleType : uint8_t { BLOCKING_SAMPLE = 0, RESERVOIR_SAMPLE = 1, RESERVOIR_PERCENTAGE_SAMPLE = 2 }; -enum class SamplingState : uint8_t { RANDOM = 0, RESERVOIR = 1 }; - -class ReservoirRNG : public RandomEngine { -public: - // return type must be called result type to be a valid URNG - typedef uint32_t result_type; - - explicit ReservoirRNG(int64_t seed) : RandomEngine(seed) {}; - - result_type operator()() { - return NextRandomInteger(); - }; - - static constexpr result_type min() { - return NumericLimits::Minimum(); - }; - static constexpr result_type max() { - return NumericLimits::Maximum(); - }; -}; - -//! Resevoir sampling is based on the 2005 paper "Weighted Random Sampling" by Efraimidis and Spirakis class BaseReservoirSampling { public: explicit BaseReservoirSampling(int64_t seed); BaseReservoirSampling(); - void InitializeReservoirWeights(idx_t cur_size, idx_t sample_size); + void InitializeReservoir(idx_t cur_size, idx_t sample_size); void SetNextEntry(); - void ReplaceElementWithIndex(idx_t entry_index, double with_weight, bool pop = true); void ReplaceElement(double with_weight = -1); - - void UpdateMinWeightThreshold(); - - //! Go from the naive sampling to the reservoir sampling - //! Naive samping will not collect weights, but when we serialize - //! we need to serialize weights again. - void FillWeights(SelectionVector &sel, idx_t &sel_size); - - unique_ptr Copy(); - //! The random generator - ReservoirRNG random; - + RandomEngine random; //! The next element to sample idx_t next_index_to_sample; //! The reservoir threshold of the current min entry @@ -87,13 +48,6 @@ class BaseReservoirSampling { void Serialize(Serializer &serializer) const; static unique_ptr Deserialize(Deserializer &deserializer); - - static double GetMinWeightFromTuplesSeen(idx_t rows_seen_total); - // static unordered_map tuples_to_min_weight_map; - // Blocking sample is a virtual class. It should be allowed to see the weights and - // of tuples in the sample. The blocking sample can then easily maintain statisitcal properties - // from the sample point of view. - friend class BlockingSample; }; class BlockingSample { @@ -107,31 +61,24 @@ class BlockingSample { bool destroyed; public: - explicit BlockingSample(int64_t seed = -1) - : base_reservoir_sample(make_uniq(seed)), type(SampleType::BLOCKING_SAMPLE), - destroyed(false) { + explicit BlockingSample(int64_t seed) : old_base_reservoir_sample(seed), random(old_base_reservoir_sample.random) { + base_reservoir_sample = nullptr; } virtual ~BlockingSample() { } //! Add a chunk of data to the sample virtual void AddToReservoir(DataChunk &input) = 0; - virtual unique_ptr Copy() const = 0; - virtual void Finalize() = 0; - virtual void Destroy(); - //! Fetches a chunk from the sample. destroy = true should only be used when - //! querying from a sample defined in a query and not a duckdb_table_sample. + virtual void Finalize() = 0; + //! Fetches a chunk from the sample. Note that this method is destructive and should only be used after the + //! sample is completely built. virtual unique_ptr GetChunk() = 0; + BaseReservoirSampling old_base_reservoir_sample; virtual void Serialize(Serializer &serializer) const; static unique_ptr Deserialize(Deserializer &deserializer); - //! Helper functions needed to merge two reservoirs while respecting weights of sampled rows - std::pair PopFromWeightQueue(); - double GetMinWeightThreshold(); - idx_t GetPriorityQueueSize(); - public: template TARGET &Cast() { @@ -148,6 +95,8 @@ class BlockingSample { } return reinterpret_cast(*this); } + //! The reservoir sampling + RandomEngine &random; }; class ReservoirChunk { @@ -158,126 +107,45 @@ class ReservoirChunk { DataChunk chunk; void Serialize(Serializer &serializer) const; static unique_ptr Deserialize(Deserializer &deserializer); - - unique_ptr Copy() const; -}; - -struct SelectionVectorHelper { - SelectionVector sel; - uint32_t size; }; +//! The reservoir sample class maintains a streaming sample of fixed size "sample_count" class ReservoirSample : public BlockingSample { public: static constexpr const SampleType TYPE = SampleType::RESERVOIR_SAMPLE; - constexpr static idx_t FIXED_SAMPLE_SIZE_MULTIPLIER = 10; - // size is small enough, then the threshold to switch - // MinValue between std vec size and fixed sample size. - // During 'fast' sampling, we want every new vector to have the potential - // to add to the sample. If the threshold is too far below the standard vector size, then - // samples in the sample have a higher weight than new samples coming in. - // i.e during vector_size=2, 2 new samples will not be significant compared 2048 samples from 204800 tuples. - constexpr static idx_t FAST_TO_SLOW_THRESHOLD = MinValue(STANDARD_VECTOR_SIZE, 60); - - // If the table has less than 204800 rows, this is the percentage - // of values we save when serializing/returning a sample. - constexpr static double SAVE_PERCENTAGE = 0.01; - +public: ReservoirSample(Allocator &allocator, idx_t sample_count, int64_t seed = 1); - explicit ReservoirSample(idx_t sample_count, unique_ptr = nullptr); - - //! methods used to help with serializing and deserializing - void EvictOverBudgetSamples(); - void ExpandSerializedSample(); - - SamplingState GetSamplingState() const; - - //! Vacuum the Reservoir Sample so it throws away tuples that are not in the - //! reservoir weights or in the selection vector - void Vacuum(); - - //! Transform To sample based on reservoir sampling paper - void ConvertToReservoirSample(); - - //! Get the capactiy of the data chunk reserved for storing samples - idx_t GetReservoirChunkCapacity() const; + explicit ReservoirSample(idx_t sample_count, int64_t seed = 1); - //! If for_serialization=true then the sample_chunk is not padded with extra spaces for - //! future sampling values - unique_ptr Copy() const override; - - //! create the first chunk called by AddToReservoir() - idx_t FillReservoir(DataChunk &chunk); //! Add a chunk of data to the sample void AddToReservoir(DataChunk &input) override; - //! Merge two Reservoir Samples. Other must be a reservoir sample - void Merge(unique_ptr other); - - void ShuffleSel(SelectionVector &sel, idx_t range, idx_t size) const; - - //! Update the sample by pushing new sample rows to the end of the sample_chunk. - //! The new sample rows are the tuples rows resulting from applying sel to other - void UpdateSampleAppend(DataChunk &this_, DataChunk &other, SelectionVector &other_sel, idx_t append_count) const; - - idx_t GetTuplesSeen() const; - idx_t NumSamplesCollected() const; - idx_t GetActiveSampleCount() const; - static bool ValidSampleType(const LogicalType &type); - - // get the chunk from Reservoir chunk - DataChunk &Chunk(); //! Fetches a chunk from the sample. Note that this method is destructive and should only be used after the //! sample is completely built. - // unique_ptr GetChunkAndDestroy() override; unique_ptr GetChunk() override; - void Destroy() override; void Finalize() override; - void Verify(); - - idx_t GetSampleCount(); - - // map is [index in input chunk] -> [index in sample chunk]. Both are zero-based - // [index in sample chunk] is incremented by 1 - // index in input chunk have random values, however, they are increasing. - // The base_reservoir_sampling gets updated however, so the indexes point to (sample_chunk_offset + - // index_in_sample_chunk) this data is used to make a selection vector to copy samples from the input chunk to the - // sample chunk - //! Get indexes from current sample that can be replaced. - SelectionVectorHelper GetReplacementIndexes(idx_t sample_chunk_offset, idx_t theoretical_chunk_length); - void Serialize(Serializer &serializer) const override; static unique_ptr Deserialize(Deserializer &deserializer); private: - // when we serialize, we may have collected too many samples since we fill a standard vector size, then - // truncate if the table is still <=204800 values. The problem is, in our weights, we store indexes into - // the selection vector. If throw away values at selection vector index i = 5 , we need to update all indexes - // i > 5. Otherwise we will have indexes in the weights that are greater than the length of our sample. - void NormalizeWeights(); - - SelectionVectorHelper GetReplacementIndexesSlow(const idx_t sample_chunk_offset, const idx_t chunk_length); - SelectionVectorHelper GetReplacementIndexesFast(const idx_t sample_chunk_offset, const idx_t chunk_length); - void SimpleMerge(ReservoirSample &other); - void WeightedMerge(ReservoirSample &other_sample); - - // Helper methods for Shrink(). - // Shrink has different logic depending on if the Reservoir sample is still in - // "Random" mode or in "reservoir" mode. This function creates a new sample chunk - // to copy the old sample chunk into - unique_ptr CreateNewSampleChunk(vector &types, idx_t size) const; - - // Get a vector where each index is a random int in the range 0, size. - // This is used to shuffle selection vector indexes - vector GetRandomizedVector(uint32_t range, uint32_t size) const; + //! Replace a single element of the input + void ReplaceElement(DataChunk &input, idx_t index_in_chunk, double with_weight = -1); + void InitializeReservoir(DataChunk &input); + //! Fills the reservoir up until sample_count entries, returns how many entries are still required + idx_t FillReservoir(DataChunk &input); - idx_t sample_count; +public: Allocator &allocator; + //! The size of the reservoir sample. + //! when calculating percentages, it is set to reservoir_threshold * percentage + //! when explicit number used, sample_count = number + idx_t sample_count; + bool reservoir_initialized; + + //! The current reservoir + unique_ptr reservoir_data_chunk; unique_ptr reservoir_chunk; - bool stats_sample; - SelectionVector sel; - idx_t sel_size; }; //! The reservoir sample sample_size class maintains a streaming sample of variable size @@ -287,16 +155,15 @@ class ReservoirSamplePercentage : public BlockingSample { public: static constexpr const SampleType TYPE = SampleType::RESERVOIR_PERCENTAGE_SAMPLE; +public: ReservoirSamplePercentage(Allocator &allocator, double percentage, int64_t seed = -1); - ReservoirSamplePercentage(double percentage, int64_t seed, idx_t reservoir_sample_size); explicit ReservoirSamplePercentage(double percentage, int64_t seed = -1); //! Add a chunk of data to the sample void AddToReservoir(DataChunk &input) override; - unique_ptr Copy() const override; - - //! Fetches a chunk from the sample. If destory = true this method is descructive + //! Fetches a chunk from the sample. Note that this method is destructive and should only be used after the + //! sample is completely built. unique_ptr GetChunk() override; void Finalize() override; @@ -315,11 +182,9 @@ class ReservoirSamplePercentage : public BlockingSample { //! The set of finished samples of the reservoir sample vector> finished_samples; - //! The amount of tuples that have been processed so far (not put in the reservoir, just processed) idx_t current_count = 0; - //! Whether or not the stream is finalized. The stream is automatically finalized on the first call to - //! GetChunkAndShrink(); + //! Whether or not the stream is finalized. The stream is automatically finalized on the first call to GetChunk(); bool is_finalized; }; diff --git a/src/include/duckdb/function/table/system_functions.hpp b/src/include/duckdb/function/table/system_functions.hpp index c339b386d98e..e1a40cf982f2 100644 --- a/src/include/duckdb/function/table/system_functions.hpp +++ b/src/include/duckdb/function/table/system_functions.hpp @@ -119,10 +119,6 @@ struct DuckDBTablesFun { static void RegisterFunction(BuiltinFunctions &set); }; -struct DuckDBTableSample { - static void RegisterFunction(BuiltinFunctions &set); -}; - struct DuckDBTemporaryFilesFun { static void RegisterFunction(BuiltinFunctions &set); }; diff --git a/src/include/duckdb/main/client_data.hpp b/src/include/duckdb/main/client_data.hpp index 49c281e2d326..27eeb61788df 100644 --- a/src/include/duckdb/main/client_data.hpp +++ b/src/include/duckdb/main/client_data.hpp @@ -27,7 +27,7 @@ class QueryProfiler; class PreparedStatementData; class SchemaCatalogEntry; class HTTPLogger; -class RandomEngine; +struct RandomEngine; struct ClientData { explicit ClientData(ClientContext &context); diff --git a/src/include/duckdb/storage/data_table.hpp b/src/include/duckdb/storage/data_table.hpp index 6a0b97727384..b05fc6b65a3e 100644 --- a/src/include/duckdb/storage/data_table.hpp +++ b/src/include/duckdb/storage/data_table.hpp @@ -188,9 +188,6 @@ class DataTable { //! Get statistics of a physical column within the table unique_ptr GetStatistics(ClientContext &context, column_t column_id); - - //! Get table sample - unique_ptr GetSample(); //! Sets statistics of a physical column within the table void SetDistinct(column_t column_id, unique_ptr distinct_stats); diff --git a/src/include/duckdb/storage/serialization/nodes.json b/src/include/duckdb/storage/serialization/nodes.json index 781cdd9c27e2..7e44380715a5 100644 --- a/src/include/duckdb/storage/serialization/nodes.json +++ b/src/include/duckdb/storage/serialization/nodes.json @@ -281,7 +281,7 @@ "type": "unique_ptr" } ], - "constructor": ["sample_count", "reservoir_chunk"] + "constructor": ["sample_count"] }, { "class": "ReservoirSamplePercentage", @@ -331,6 +331,7 @@ ], "pointer_type": "none" }, + { "class": "PivotColumnEntry", "members": [ diff --git a/src/include/duckdb/storage/table/row_group_collection.hpp b/src/include/duckdb/storage/table/row_group_collection.hpp index 19aa6452038c..5c47dcd62179 100644 --- a/src/include/duckdb/storage/table/row_group_collection.hpp +++ b/src/include/duckdb/storage/table/row_group_collection.hpp @@ -124,7 +124,6 @@ class RowGroupCollection { void CopyStats(TableStatistics &stats); unique_ptr CopyStats(column_t column_id); - unique_ptr GetSample(); void SetDistinct(column_t column_id, unique_ptr distinct_stats); AttachedDatabase &GetAttached(); diff --git a/src/include/duckdb/storage/table/table_statistics.hpp b/src/include/duckdb/storage/table/table_statistics.hpp index 3bb712f8b2c0..633d469463c2 100644 --- a/src/include/duckdb/storage/table/table_statistics.hpp +++ b/src/include/duckdb/storage/table/table_statistics.hpp @@ -48,14 +48,6 @@ class TableStatistics { //! Get a reference to the stats - this requires us to hold the lock. //! The reference can only be safely accessed while the lock is held ColumnStatistics &GetStats(TableStatisticsLock &lock, idx_t i); - //! Get a reference to the table sample - this requires us to hold the lock. - // BlockingSample &GetTableSampleRef(TableStatisticsLock &lock); - //! Take ownership of the sample, needed for merging. Requires the lock - unique_ptr GetTableSample(TableStatisticsLock &lock); - void SetTableSample(TableStatisticsLock &lock, unique_ptr sample); - - void DestroyTableSample(TableStatisticsLock &lock) const; - void AppendToTableSample(TableStatisticsLock &lock, unique_ptr sample); bool Empty(); @@ -70,6 +62,7 @@ class TableStatistics { //! Column statistics vector> column_stats; //! The table sample + //! Sample for table unique_ptr table_sample; }; diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index fb9baddf9642..c6d0f19bcfae 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -1512,10 +1512,6 @@ void DataTable::SetDistinct(column_t column_id, unique_ptr d row_groups->SetDistinct(column_id, std::move(distinct_stats)); } -unique_ptr DataTable::GetSample() { - return row_groups->GetSample(); -} - //===--------------------------------------------------------------------===// // Checkpoint //===--------------------------------------------------------------------===// @@ -1532,8 +1528,8 @@ void DataTable::Checkpoint(TableDataWriter &writer, Serializer &serializer) { TableStatistics global_stats; row_groups->CopyStats(global_stats); row_groups->Checkpoint(writer, global_stats); + // The row group payload data has been written. Now write: - // sample // column stats // row-group pointers // table pointer diff --git a/src/storage/serialization/serialize_nodes.cpp b/src/storage/serialization/serialize_nodes.cpp index 10fe3f949b38..b386d2b9b0db 100644 --- a/src/storage/serialization/serialize_nodes.cpp +++ b/src/storage/serialization/serialize_nodes.cpp @@ -599,8 +599,8 @@ void ReservoirSample::Serialize(Serializer &serializer) const { unique_ptr ReservoirSample::Deserialize(Deserializer &deserializer) { auto sample_count = deserializer.ReadPropertyWithDefault(200, "sample_count"); - auto reservoir_chunk = deserializer.ReadPropertyWithDefault>(201, "reservoir_chunk"); - auto result = duckdb::unique_ptr(new ReservoirSample(sample_count, std::move(reservoir_chunk))); + auto result = duckdb::unique_ptr(new ReservoirSample(sample_count)); + deserializer.ReadPropertyWithDefault>(201, "reservoir_chunk", result->reservoir_chunk); return std::move(result); } diff --git a/src/storage/table/row_group_collection.cpp b/src/storage/table/row_group_collection.cpp index 2629800a4be8..b11b6b1afad6 100644 --- a/src/storage/table/row_group_collection.cpp +++ b/src/storage/table/row_group_collection.cpp @@ -1,4 +1,5 @@ #include "duckdb/storage/table/row_group_collection.hpp" + #include "duckdb/common/serializer/binary_deserializer.hpp" #include "duckdb/execution/expression_executor.hpp" #include "duckdb/execution/index/bound_index.hpp" @@ -396,14 +397,11 @@ bool RowGroupCollection::Append(DataChunk &chunk, TableAppendState &state) { } } state.current_row += row_t(total_append_count); - auto local_stats_lock = state.stats.GetLock(); - for (idx_t col_idx = 0; col_idx < types.size(); col_idx++) { auto &column_stats = state.stats.GetStats(*local_stats_lock, col_idx); column_stats.UpdateDistinctStatistics(chunk.data[col_idx], chunk.size(), state.hashes); } - return new_row_group; } @@ -423,8 +421,8 @@ void RowGroupCollection::FinalizeAppend(TransactionData transaction, TableAppend state.total_append_count = 0; state.start_row_group = nullptr; - auto local_stats_lock = state.stats.GetLock(); auto global_stats_lock = stats.GetLock(); + auto local_stats_lock = state.stats.GetLock(); for (idx_t col_idx = 0; col_idx < types.size(); col_idx++) { auto &global_stats = stats.GetStats(*global_stats_lock, col_idx); if (!global_stats.HasDistinctStats()) { @@ -584,7 +582,6 @@ idx_t RowGroupCollection::Delete(TransactionData transaction, DataTable &table, } delete_count += row_group->Delete(transaction, table, ids + start, pos - start); } while (pos < count); - return delete_count; } @@ -1142,7 +1139,6 @@ shared_ptr RowGroupCollection::AddColumn(ClientContext &cont result->row_groups->AppendSegment(std::move(new_row_group)); } - return result; } @@ -1155,9 +1151,6 @@ shared_ptr RowGroupCollection::RemoveColumn(idx_t col_idx) { total_rows.load(), row_group_size); result->stats.InitializeRemoveColumn(stats, col_idx); - auto result_lock = result->stats.GetLock(); - result->stats.DestroyTableSample(*result_lock); - for (auto ¤t_row_group : row_groups->Segments()) { auto new_row_group = current_row_group.RemoveColumn(*result, col_idx); result->row_groups->AppendSegment(std::move(new_row_group)); @@ -1204,6 +1197,7 @@ shared_ptr RowGroupCollection::AlterType(ClientContext &cont new_row_group->MergeIntoStatistics(changed_idx, changed_stats.Statistics()); result->row_groups->AppendSegment(std::move(new_row_group)); } + return result; } @@ -1250,7 +1244,7 @@ void RowGroupCollection::VerifyNewConstraint(DataTable &parent, const BoundConst //===--------------------------------------------------------------------===// // Statistics -//===---------------------------------------------------------------r-----===// +//===--------------------------------------------------------------------===// void RowGroupCollection::CopyStats(TableStatistics &other_stats) { stats.CopyStats(other_stats); } @@ -1259,10 +1253,6 @@ unique_ptr RowGroupCollection::CopyStats(column_t column_id) { return stats.CopyStats(column_id); } -unique_ptr RowGroupCollection::GetSample() { - return nullptr; -} - void RowGroupCollection::SetDistinct(column_t column_id, unique_ptr distinct_stats) { D_ASSERT(column_id != COLUMN_IDENTIFIER_ROW_ID); auto stats_lock = stats.GetLock(); diff --git a/src/storage/table/table_statistics.cpp b/src/storage/table/table_statistics.cpp index 61d85319c4a2..b51c445d0dc2 100644 --- a/src/storage/table/table_statistics.cpp +++ b/src/storage/table/table_statistics.cpp @@ -1,23 +1,16 @@ #include "duckdb/storage/table/table_statistics.hpp" - -#include "duckdb/common/serializer/deserializer.hpp" +#include "duckdb/storage/table/persistent_table_data.hpp" #include "duckdb/common/serializer/serializer.hpp" +#include "duckdb/common/serializer/deserializer.hpp" #include "duckdb/execution/reservoir_sample.hpp" -#include "duckdb/storage/table/persistent_table_data.hpp" namespace duckdb { void TableStatistics::Initialize(const vector &types, PersistentTableData &data) { D_ASSERT(Empty()); - D_ASSERT(!table_sample); stats_lock = make_shared_ptr(); column_stats = std::move(data.table_stats.column_stats); - if (data.table_stats.table_sample) { - table_sample = std::move(data.table_stats.table_sample); - } else { - table_sample = make_uniq(static_cast(FIXED_SAMPLE_SIZE)); - } if (column_stats.size() != types.size()) { // LCOV_EXCL_START throw IOException("Table statistics column count is not aligned with table column count. Corrupt file?"); } // LCOV_EXCL_STOP @@ -25,10 +18,8 @@ void TableStatistics::Initialize(const vector &types, PersistentTab void TableStatistics::InitializeEmpty(const vector &types) { D_ASSERT(Empty()); - D_ASSERT(!table_sample); stats_lock = make_shared_ptr(); - table_sample = make_uniq(static_cast(FIXED_SAMPLE_SIZE)); for (auto &type : types) { column_stats.push_back(ColumnStatistics::CreateEmptyStats(type)); } @@ -44,12 +35,6 @@ void TableStatistics::InitializeAddColumn(TableStatistics &parent, const Logical column_stats.push_back(parent.column_stats[i]); } column_stats.push_back(ColumnStatistics::CreateEmptyStats(new_column_type)); - if (parent.table_sample) { - table_sample = std::move(parent.table_sample); - } - if (table_sample) { - table_sample->Destroy(); - } } void TableStatistics::InitializeRemoveColumn(TableStatistics &parent, idx_t removed_column) { @@ -63,12 +48,6 @@ void TableStatistics::InitializeRemoveColumn(TableStatistics &parent, idx_t remo column_stats.push_back(parent.column_stats[i]); } } - if (parent.table_sample) { - table_sample = std::move(parent.table_sample); - } - if (table_sample) { - table_sample->Destroy(); - } } void TableStatistics::InitializeAlterType(TableStatistics &parent, idx_t changed_idx, const LogicalType &new_type) { @@ -84,12 +63,6 @@ void TableStatistics::InitializeAlterType(TableStatistics &parent, idx_t changed column_stats.push_back(parent.column_stats[i]); } } - if (parent.table_sample) { - table_sample = std::move(parent.table_sample); - } - if (table_sample) { - table_sample->Destroy(); - } } void TableStatistics::InitializeAddConstraint(TableStatistics &parent) { @@ -106,21 +79,6 @@ void TableStatistics::InitializeAddConstraint(TableStatistics &parent) { void TableStatistics::MergeStats(TableStatistics &other) { auto l = GetLock(); D_ASSERT(column_stats.size() == other.column_stats.size()); - if (table_sample) { - if (other.table_sample) { - D_ASSERT(table_sample->type == SampleType::RESERVOIR_SAMPLE); - auto &this_reservoir = table_sample->Cast(); - D_ASSERT(other.table_sample->type == SampleType::RESERVOIR_SAMPLE); - this_reservoir.Merge(std::move(other.table_sample)); - } - // if no other.table sample, do nothig - } else { - if (other.table_sample) { - auto &other_reservoir = other.table_sample->Cast(); - auto other_table_sample_copy = other_reservoir.Copy(); - table_sample = std::move(other_table_sample_copy); - } - } for (idx_t i = 0; i < column_stats.size(); i++) { if (column_stats[i]) { D_ASSERT(other.column_stats[i]); @@ -142,25 +100,6 @@ ColumnStatistics &TableStatistics::GetStats(TableStatisticsLock &lock, idx_t i) return *column_stats[i]; } -// BlockingSample &TableStatistics::GetTableSampleRef(TableStatisticsLock &lock) { -// D_ASSERT(table_sample); -// return *table_sample; -//} - -unique_ptr TableStatistics::GetTableSample(TableStatisticsLock &lock) { - return std::move(table_sample); -} - -void TableStatistics::SetTableSample(TableStatisticsLock &lock, unique_ptr sample) { - table_sample = std::move(sample); -} - -void TableStatistics::DestroyTableSample(TableStatisticsLock &lock) const { - if (table_sample) { - table_sample->Destroy(); - } -} - unique_ptr TableStatistics::CopyStats(idx_t i) { lock_guard l(*stats_lock); auto result = column_stats[i]->Statistics().Copy(); @@ -181,25 +120,11 @@ void TableStatistics::CopyStats(TableStatisticsLock &lock, TableStatistics &othe for (auto &stats : column_stats) { other.column_stats.push_back(stats->Copy()); } - - if (table_sample) { - D_ASSERT(table_sample->type == SampleType::RESERVOIR_SAMPLE); - auto &res = table_sample->Cast(); - other.table_sample = res.Copy(); - } } void TableStatistics::Serialize(Serializer &serializer) const { serializer.WriteProperty(100, "column_stats", column_stats); - unique_ptr to_serialize = nullptr; - if (table_sample) { - D_ASSERT(table_sample->type == SampleType::RESERVOIR_SAMPLE); - auto &reservoir_sample = table_sample->Cast(); - to_serialize = unique_ptr_cast(reservoir_sample.Copy()); - auto &res_serialize = to_serialize->Cast(); - res_serialize.EvictOverBudgetSamples(); - } - serializer.WritePropertyWithDefault>(101, "table_sample", to_serialize, nullptr); + serializer.WritePropertyWithDefault>(101, "table_sample", table_sample, nullptr); } void TableStatistics::Deserialize(Deserializer &deserializer, ColumnList &columns) { @@ -217,19 +142,8 @@ void TableStatistics::Deserialize(Deserializer &deserializer, ColumnList &column deserializer.Unset(); }); - table_sample = deserializer.ReadPropertyWithDefault>(101, "table_sample"); - if (table_sample) { - D_ASSERT(table_sample->type == SampleType::RESERVOIR_SAMPLE); -#ifdef DEBUG - if (table_sample) { - auto &reservoir_sample = table_sample->Cast(); - reservoir_sample.Verify(); - } -#endif - } else { - table_sample = make_uniq(static_cast(FIXED_SAMPLE_SIZE)); - table_sample->Destroy(); - } + table_sample = + deserializer.ReadPropertyWithExplicitDefault>(101, "table_sample", nullptr); } unique_ptr TableStatistics::GetLock() { diff --git a/test/sql/sample/can_sample_from_ingested_files.test b/test/sql/sample/can_sample_from_ingested_files.test deleted file mode 100644 index 23c2ef50fd52..000000000000 --- a/test/sql/sample/can_sample_from_ingested_files.test +++ /dev/null @@ -1,44 +0,0 @@ -# name: test/sql/sample/can_sample_from_ingested_files.test -# description: Test reservoir sample crash on large data sets -# group: [sample] - -require parquet - -statement ok -PRAGMA enable_verification; - -statement ok -create table all_types as select * exclude(small_enum, medium_enum, large_enum, "union", bit) from test_all_types(); - -statement ok -copy all_types to '__TEST_DIR__/sample_all_types.csv' (FORMAT CSV); - -statement ok -Create table all_types_csv_1 as select * from read_csv_auto('__TEST_DIR__/sample_all_types.csv'); - -statement ok -Create table all_types_csv_2 as select * from read_csv_auto('__TEST_DIR__/sample_all_types.csv'); - -query T nosort result_1 -select * from all_types_csv_1; - -query T nosort result_1 -select * from all_types_csv_2; - - -statement ok -copy (SELECT * from all_types) to '__TEST_DIR__/sample_all_types.parquet' (FORMAT PARQUET); - -# test parquet -statement ok -Create table all_types_parquet_1 as select * from read_parquet('__TEST_DIR__/sample_all_types.parquet'); - -statement ok -Create table all_types_parquet_2 as select * from read_parquet('__TEST_DIR__/sample_all_types.parquet'); - -query T nosort result_parquet -select * from all_types_parquet_1; - -query T nosort result_paruet -select * from all_types_parquet_2; - diff --git a/test/sql/sample/reservoir_testing_percentage.test b/test/sql/sample/reservoir_testing_percentage.test deleted file mode 100644 index 19217d635461..000000000000 --- a/test/sql/sample/reservoir_testing_percentage.test +++ /dev/null @@ -1,85 +0,0 @@ -# name: test/sql/sample/reservoir_testing_percentage.test -# description: Test SAMPLE keyword -# group: [sample] - -loop i 1 8 - -statement ok -pragma threads=${i}; - -statement ok -CREATE or replace TABLE t1 as select range a from range(1000); - -query I -SELECT count(*) from t1 using sample 0 percent (reservoir); ----- -0 - -query I -SELECT count(*) from t1 using sample 10 percent (reservoir); ----- -100 - -query I -SELECT count(*) from t1 using sample 20 percent (reservoir); ----- -200 - -query I -SELECT count(*) from t1 using sample 80 percent (reservoir); ----- -800 - -query I -SELECT count(*) from t1 using sample 100 percent (reservoir); ----- -1000 - - -statement ok -Insert into t1 select range a from range(9000); - -query I -select count(*) from t1 using sample 80 percent (reservoir); ----- -8000 - -statement ok -Insert into t1 select range a from range(90000); - - -statement ok -Insert into t1 select range a from range(900000); - -query I -select count(*) from t1 using sample 20 percent (reservoir); ----- -200000 - -query I -select count(*) from t1 using sample 30 percent (reservoir); ----- -300000 - -query I -select count(*) from t1 using sample 40 percent (reservoir); ----- -400000 - -query I -select count(*) from t1 using sample 50 percent (reservoir); ----- -500000 - - -query I -select count(*) from t1 using sample 60 percent (reservoir); ----- -600000 - -query I -select count(*) from t1 using sample 70 percent (reservoir); ----- -700000 - -endloop diff --git a/test/sql/sample/reservoir_testing_rows_value.test_slow b/test/sql/sample/reservoir_testing_rows_value.test_slow index 3355225b5b8e..a4e355a69044 100644 --- a/test/sql/sample/reservoir_testing_rows_value.test_slow +++ b/test/sql/sample/reservoir_testing_rows_value.test_slow @@ -8,7 +8,7 @@ statement ok pragma threads=${i}; statement ok -CREATE or replace TABLE t1 as select range a from range(1000); +CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() c from range(1000); query I SELECT count(*) from t1 using sample 0; @@ -30,15 +30,13 @@ SELECT count(*) from t1 using sample 800; ---- 800 - query I SELECT count(*) from t1 using sample 1000; ---- 1000 - statement ok -create or replace table t1 as select * from range(10000); +CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() c from range(10000); query I select count(*) from t1 using sample 1000; @@ -61,7 +59,7 @@ select count(*) from t1 using sample 8000; 8000 statement ok -Create or replace table t1 as select range a from range(1000000); +CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() c from range(1000000); query I select count(*) from t1 using sample 200000; diff --git a/test/sql/sample/same_seed_same_sample.test_slow b/test/sql/sample/same_seed_same_sample.test_slow index 1970cf1c4d75..c191fcb75e30 100644 --- a/test/sql/sample/same_seed_same_sample.test_slow +++ b/test/sql/sample/same_seed_same_sample.test_slow @@ -23,15 +23,15 @@ SELECT * from t1 using sample reservoir(100) repeatable (1) order by a; endloop -# testing a table with greater cardinality than the standard vector size, and greater than a row group size. +# testing a table with greater cardinality than the standard vector size statement ok CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() c from range(100000); -# samples are only equal when threads = 1 +loop i 1 8 statement ok -set threads=1; +pragma threads=${i}; query III nosort result_2 SELECT * from t1 using sample reservoir(6000) repeatable (1) order by a; @@ -42,6 +42,8 @@ query III nosort result_2 SELECT * from t1 using sample reservoir(6000) repeatable (1) order by a; ---- +endloop + statement ok CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() c from range(1000000); @@ -71,3 +73,5 @@ select count(*) < 10 from (select * from sample1 intersect select * from sample2 ---- true + + diff --git a/test/sql/sample/table_samples/basic_sample_tests.test b/test/sql/sample/table_samples/basic_sample_tests.test deleted file mode 100644 index dc22f6497fd0..000000000000 --- a/test/sql/sample/table_samples/basic_sample_tests.test +++ /dev/null @@ -1,88 +0,0 @@ -# name: test/sql/sample/table_samples/basic_sample_tests.test -# group: [table_samples] - -mode skip - -# currently require fixed vector size since the "randomness" of the sample depends on -# the vector size. If the vector size decreases, the randomness of the sample decreases -# This is especially noticeable for small tables and their samples -require vector_size 2048 - -statement ok -PRAGMA enable_verification - -load __TEST_DIR__/test_samples_basic.db - -query I -select count(*) from range(100000) using sample (10000); ----- -10000 - -query I -select count(*) from range(100) using sample (10); ----- -10 - -query I -select count(*) from range(205000) using sample (10000); ----- -10000 - -statement ok -create table t1 as select range a from range(204800); - -statement ok -select * from duckdb_table_sample('t1'); - -statement ok -create or replace table t1 as select range a from range(1000); - -query II -select avg(a) > 200, avg(a) < 800 from duckdb_table_sample('t1'); ----- -true true - -statement ok -create or replace table t1 as select range a from range(204800); - -# average is not skewed -query II -select avg(a) > (0.2*204800), avg(a) < (0.8*204800) from duckdb_table_sample('t1'); ----- -true true - -# about half the samples are below 102400 and half above -query I -select count(*) from duckdb_table_sample('t1') where a < 102400; ----- -1069 - -query I -select count(*) from duckdb_table_sample('t1') where a > 102400; ----- -979 - -query I -select count(*) from t1 using sample (200000); ----- -200000 - -statement ok -create or replace table materialized_range as select * from range(100); - -statement ok -create or replace table integers_1 as (select range b from materialized_range); - -query I -select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range); ----- -1 - -# sample exists after restart -restart - -query I -select count(b) from duckdb_table_sample('integers_1') where b in (select * from materialized_range); ----- -1 - diff --git a/test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow b/test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow deleted file mode 100644 index 08c421ca86df..000000000000 --- a/test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow +++ /dev/null @@ -1,42 +0,0 @@ -# name: test/sql/sample/table_samples/sample_stores_rows_from_later_on.test_slow -# description: Test sampling of larger relations -# group: [table_samples] - -mode skip - -# required when testing table samples. See basic_sample_test.test -require vector_size 2048 - -require noforcestorage - -load __TEST_DIR__/test_sample_conversion.db - -statement ok -PRAGMA enable_verification - -statement ok -create table t1 as select 1 a from range(200000); - -loop i 1 4805 - -statement ok -INSERT INTO t1 VALUES(${i} + 1); - -restart - -endloop - -query I -select count(*) from duckdb_table_sample('t1'); ----- -2048 - -query I -select count(*) from duckdb_table_sample('t1') where a > 1; ----- -48 - -query I -select count(*) from (select (floor(range/200000))::INT a from range(204800) using sample reservoir (1%)) t1 where a >= 1; ----- -48 diff --git a/test/sql/sample/table_samples/table_sample_converts_to_block_sample.test b/test/sql/sample/table_samples/table_sample_converts_to_block_sample.test deleted file mode 100644 index d67ab90b7469..000000000000 --- a/test/sql/sample/table_samples/table_sample_converts_to_block_sample.test +++ /dev/null @@ -1,59 +0,0 @@ -# name: test/sql/sample/table_samples/table_sample_converts_to_block_sample.test -# description: Test sampling of larger relations -# group: [table_samples] - -mode skip - -# required when testing table samples. See basic_sample_test.test -require vector_size 2048 - -require noforcestorage - -# table samples first collect only 1% of the table, until the table has a cardinality of 2048. -# then the sample stays at a fixed 2048 values. - -load __TEST_DIR__/test_sample_converts_after_load.db - -statement ok -create table materialized_range as select 1 a from range(102400); - -# only 1% of 102400 -query I -select count(*) from duckdb_table_sample('materialized_range'); ----- -1024 - -restart - -statement ok -insert into materialized_range select 2 a from range(102400); - -# collect another 1% of 102400 -query I -select count(*) from duckdb_table_sample('materialized_range'); ----- -2048 - -query II -select a, count(*) from duckdb_table_sample('materialized_range') group by all order by a; ----- -1 1024 -2 1024 - -# insert another -statement ok -insert into materialized_range select 3 a from range(102400); - -# sample remains at 2048 values -query I -select count(*) from duckdb_table_sample('materialized_range'); ----- -2048 - -# 2048 / 3 = 682. so each value should have at least >650 -query II -select a, count(*) > 650 from duckdb_table_sample('materialized_range') group by all order by a; ----- -1 1 -2 1 -3 1 \ No newline at end of file diff --git a/test/sql/sample/table_samples/table_sample_is_stored.test_slow b/test/sql/sample/table_samples/table_sample_is_stored.test_slow deleted file mode 100644 index b937a05c5a2f..000000000000 --- a/test/sql/sample/table_samples/table_sample_is_stored.test_slow +++ /dev/null @@ -1,150 +0,0 @@ -# name: test/sql/sample/table_samples/table_sample_is_stored.test_slow -# description: Test sampling of larger relations -# group: [table_samples] - -mode skip - -# required when testing table samples. See basic_sample_test.test -require vector_size 2048 - -require noforcestorage - -require icu - -load __TEST_DIR__/test_samples.db - -statement ok -PRAGMA enable_verification - -statement ok -create table materialized_range as select * from range(5000000); - -statement ok -create table integers_1 as (select (range + 5) a, range b, get_current_time() as time from materialized_range); - -query II nosort result_1 -select a::INT, b from duckdb_table_sample('integers_1') order by all; ----- - -statement ok -create table integers_2 as (select (range + 5) a, range b, get_current_time() as time from materialized_range); - -## samples should be the same given the same table and the same contents. -query II nosort result_1 -select a::INT, b from duckdb_table_sample('integers_2') order by all; ----- - -statement ok -create or replace table integers_1 as (select (range + 5) a, range b from materialized_range); - -statement ok -create or replace table integers_2 as (select (range + 5) a, range b from materialized_range); - -# sample only has values in the table it was sampled from -query I -select count(*) from (select b from duckdb_table_sample('integers_1') intersect (select b from integers_1)); ----- -2048 - -query I -select count(*) from (select b from duckdb_table_sample('integers_2') intersect (select b from integers_2)); ----- -2048 - -# sample exists after restart -restart - -query I -select count(*) from duckdb_table_sample('integers_1'); ----- -2048 - -query I -select count(*) from duckdb_table_sample('integers_2'); ----- -2048 - - -query II -select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all; ----- -0.0 453 -1.0 408 -2.0 406 -3.0 404 -4.0 377 - - -# adding another interval should subtract an equal number from the rest of the intervals -statement ok -insert into integers_1 (select (range + 5) a, range b from range(5000000,6000000)); - -query II -select floor(b / 1000000) as interval, count(*) as frequency from duckdb_table_sample('integers_1') group by interval order by all; ----- -0.0 374 -1.0 334 -2.0 332 -3.0 334 -4.0 311 -5.0 363 - -# If double the table count is appended, around half the sample should account for the new values. -statement ok -insert into integers_1 (select -1, -1 from range(6000000)); - -query I -select count(*) from integers_1; ----- -12000000 - - -## about half of the samples should have the pair '-1', 1. -# on latest storage test its something like 997 -query I -select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1; ----- -914 - -restart - -# updated sample is also newly serialized -query I -select count(*) from duckdb_table_sample('integers_1') where a = -1 and b = -1; ----- -914 - -# create a view on top of the sample -statement ok -create view sample_view as select * from duckdb_table_sample('integers_1'); - -# update the sample -statement ok -insert into integers_1 (select -2, -2 from range(6000000)); - - -# 2048 / 3 = 682 (639 is good) -query I -select count(*) from sample_view where a = -2 and b = -2; ----- -639 - -restart - -query I -select count(*) from sample_view where a = -2 and b = -2; ----- -639 - -# currently have 18_000_000 values in the table. -# to try and get 1 value in the sample, we should add -# 18000000 / 2048 = 8789 values to see 1 - -statement ok -insert into integers_1 (select -3, -3 from range(7000)); - -# 1 value makes it -query I -select count(*) from sample_view where a = -3 and b = -3; ----- -1 diff --git a/test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test b/test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test deleted file mode 100644 index 5e8267435862..000000000000 --- a/test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test +++ /dev/null @@ -1,125 +0,0 @@ -# name: test/sql/sample/table_samples/test_sample_is_destroyed_on_updates.test -# description: Test sampling of larger relations -# group: [table_samples] - -mode skip - -# required when testing table samples. See basic_sample_test.test -require vector_size 2048 - -load __TEST_DIR__/test_sample_is_destroyed_on_update.db - -statement ok -create or replace table integers_1 as select range a, range+1 b from range(102400); - -# no sample collected yet. There are only 5 -query I -select count(*) from duckdb_table_sample('integers_1') order by all; ----- -1024 - -statement ok -delete from integers_1 where a = 3; - -# sample no longer exists -query I -select count(*) from duckdb_table_sample('integers_1') order by all; ----- -0 - -statement ok -create or replace table integers_1 as select range a, range+1 b from range(102400); - -query I -select count(*) from duckdb_table_sample('integers_1'); ----- -1024 - -statement ok -update integers_1 set a = 5 where a = 1; - -query II -select * from duckdb_table_sample('integers_1'); ----- - -# test adding columns destroys the sample. -statement ok -create or replace table integers_1 as select range a, range+1 b from range(204800); - -query I -select count(*) from duckdb_table_sample('integers_1'); ----- -2048 - -statement ok -Alter table integers_1 add column c DOUBLE; - -query III -select * from duckdb_table_sample('integers_1'); ----- - - -# test altering types destroys the sample -statement ok -create or replace table integers_1 as select range a, range+1 b from range(102400); - - -# don't have enough smaples yet. -query I -select count(*) from duckdb_table_sample('integers_1'); ----- -1024 - -statement ok -Alter table integers_1 alter b TYPE VARCHAR - -query II -select * from duckdb_table_sample('integers_1'); ----- - -# test dropping a columns -statement ok -create or replace table integers_1 as select range a, range+1 b from range(102400); - -query I -select count(*) from duckdb_table_sample('integers_1'); ----- -1024 - -statement ok -Alter table integers_1 drop b; - -query I -select * from duckdb_table_sample('integers_1'); ----- - -# test sample is destroyed after a restart -statement ok -create or replace table integers_1 as select range a, range+1 b from range(500); - -query I -select count(*) from duckdb_table_sample('integers_1'); ----- -5 - -statement ok -Alter table integers_1 drop b; - -# sample is destroyed -query I -select * from duckdb_table_sample('integers_1'); ----- - -restart - -statement ok -insert into integers_1 select range a from range(500); - -# sample is still destroyed -query I -select * from duckdb_table_sample('integers_1'); ----- - - - - diff --git a/test/sql/sample/table_samples/test_sample_types.test b/test/sql/sample/table_samples/test_sample_types.test deleted file mode 100644 index 0d656a4abc17..000000000000 --- a/test/sql/sample/table_samples/test_sample_types.test +++ /dev/null @@ -1,80 +0,0 @@ -# name: test/sql/sample/table_samples/test_sample_types.test -# description: Test sampling of larger relations -# group: [table_samples] - -mode skip - -# test valid sampling types (for now only integral types) - -statement ok -pragma enable_verification; - -statement ok -create table string_samples as select range::Varchar a from range(204800); - -query I -select count(*) from duckdb_table_sample('string_samples') where a is NULL; ----- -2048 - -statement ok -create table struct_samples as select {'key1': 'quack-a-lack', 'key2': range} a from range(204800); - -query I -select count(*) from duckdb_table_sample('struct_samples') where a is null; ----- -2048 - -statement ok -create table blob_samples as select '\xAA\xAB\xAC'::BLOB a from range(204800); - -query I -select count(*) from duckdb_table_sample('blob_samples') where a is NULL; ----- -2048 - -statement ok -create table integral_samples as select range::BIGINT a, range::DOUBLE b, range::FLOAT c, range::HUGEINT d, INTERVAL 1 YEAR e from range(204800); - -query I -select count(*) from duckdb_table_sample('integral_samples') where a NOT null; ----- -2048 - -query I -select count(*) from duckdb_table_sample('integral_samples') where b NOT null; ----- -2048 - -query I -select count(*) from duckdb_table_sample('integral_samples') where c NOT null; ----- -2048 - -query I -select count(*) from duckdb_table_sample('integral_samples') where d NOT null; ----- -2048 - -query I -select count(*) from duckdb_table_sample('integral_samples') where e IS null; ----- -2048 - -statement ok -CREATE or replace TABLE t1 as select range a, [1, a, 2] b, a::VARCHAR || 'ducktastic' c, get_current_timestamp() d from range(1000000); - -query I -select count(*) from duckdb_table_sample('t1') where b is null; ----- -2048 - -query I -select count(*) from duckdb_table_sample('t1') where c is null; ----- -2048 - -query I -select count(*) from duckdb_table_sample('t1') where d is null; ----- -2048 \ No newline at end of file diff --git a/test/sql/sample/table_samples/test_table_sample_errors.test b/test/sql/sample/table_samples/test_table_sample_errors.test deleted file mode 100644 index facc8d62a5e8..000000000000 --- a/test/sql/sample/table_samples/test_table_sample_errors.test +++ /dev/null @@ -1,21 +0,0 @@ -# name: test/sql/sample/table_samples/test_table_sample_errors.test -# description: test table sampl[e errors -# group: [table_samples] - -mode skip - -statement ok -create table t1 as select range a from range(204800); - -statement ok -create view v1 as select * from t1; - -statement error -select * from duckdb_table_sample('v1'); ----- -:.*Invalid Catalog type.* - -statement error -select * from duckdb_table_sample('a'); ----- -:.*Catalog Error:.*Table.*does not exist.* \ No newline at end of file diff --git a/test/sql/sample/test_sample.test_slow b/test/sql/sample/test_sample.test_slow index 557b629203e6..d476fcc211d8 100644 --- a/test/sql/sample/test_sample.test_slow +++ b/test/sql/sample/test_sample.test_slow @@ -75,6 +75,7 @@ SELECT COUNT(*) FROM range(2000000) USING SAMPLE 2 ---- 2 + # test sample with multiple columns # we insert the same data in the entire column statement ok @@ -222,8 +223,11 @@ select i from integers using sample (1 rows) repeatable (0); query I noresult repeatable_seed_0 select i from integers using sample (1 rows) repeatable (0); ---- +152 +query I +select i from integers using sample reservoir(1%) repeatable (0); query I noresult repeatable_seed_1 select i from integers using sample reservoir(1%) repeatable (0) order by i; ---- @@ -231,5 +235,7 @@ select i from integers using sample reservoir(1%) repeatable (0) order by i; query I noresult repeatable_seed_1 select i from integers using sample reservoir(1%) repeatable (0) order by i; ---- +51 +78 58 127 diff --git a/test/sql/storage/checkpointed_self_append.test b/test/sql/storage/checkpointed_self_append.test index dc3162709d65..cd1f0e806e7c 100644 --- a/test/sql/storage/checkpointed_self_append.test +++ b/test/sql/storage/checkpointed_self_append.test @@ -4,6 +4,7 @@ require skip_reload + # load the DB from disk load __TEST_DIR__/checkpointed_self_append.db diff --git a/test/sql/storage/reclaim_space/reclaim_space_drop_column_overflow_strings.test_slow b/test/sql/storage/reclaim_space/reclaim_space_drop_column_overflow_strings.test_slow index b1b013597f1d..2b6ae0d0fcac 100644 --- a/test/sql/storage/reclaim_space/reclaim_space_drop_column_overflow_strings.test_slow +++ b/test/sql/storage/reclaim_space/reclaim_space_drop_column_overflow_strings.test_slow @@ -90,4 +90,4 @@ SELECT AVG(STRLEN(s)), MIN(STRLEN(S)), MAX(STRLEN(S)), SUM(STRLEN(S)), MIN(S[1]) ---- 296.955 0 5000 44543527 (empty) X -endloop \ No newline at end of file +endloop diff --git a/test/sql/upsert/test_big_insert.test b/test/sql/upsert/test_big_insert.test index 1bba1ba5fd17..2de05a7d27b1 100644 --- a/test/sql/upsert/test_big_insert.test +++ b/test/sql/upsert/test_big_insert.test @@ -2,6 +2,8 @@ # description: Test insert into statements # group: [upsert] +# TODO: remove this, behavior should be consistent at all vector sizes + statement ok pragma enable_verification;