diff --git a/.gitignore b/.gitignore index 1571229e..456b0117 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ # Visual Studio Code .vscode/ +# Intellij +.idea/ + # OSX files .DS_Store @@ -43,3 +46,9 @@ _*/ docs java + +# clang +.clangd + +# CMakeFiles +CMakeFiles/ diff --git a/CMakeLists.txt b/CMakeLists.txt index c469e456..60fd4e6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,12 +120,13 @@ add_subdirectory(count) add_subdirectory(density) add_subdirectory(tdigest) add_subdirectory(filters) +add_subdirectory(ddsketch) if (WITH_PYTHON) add_subdirectory(python) endif() -target_link_libraries(datasketches INTERFACE hll cpc kll fi theta sampling req quantiles count) +target_link_libraries(datasketches INTERFACE hll cpc kll fi theta sampling req quantiles count ddsketch) if (COVERAGE) find_program(LCOV_PATH NAMES "lcov") diff --git a/Doxyfile b/Doxyfile index 4e9be389..460b9ff3 100644 --- a/Doxyfile +++ b/Doxyfile @@ -955,6 +955,7 @@ INPUT = common/include \ fi/include \ count/include \ req/include \ + ddsketch/include \ README.md # This tag can be used to specify the character encoding of the source files diff --git a/ddsketch/CMakeLists.txt b/ddsketch/CMakeLists.txt new file mode 100644 index 00000000..f1f3ba18 --- /dev/null +++ b/ddsketch/CMakeLists.txt @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +add_library(ddsketch INTERFACE) + +add_library(${PROJECT_NAME}::DDSKETCH ALIAS ddsketch) + +if (BUILD_TESTS) + add_subdirectory(test) +endif() + +target_include_directories(ddsketch + INTERFACE + $ + $/include> +) + +target_link_libraries(ddsketch INTERFACE common) + +install(TARGETS ddsketch + EXPORT ${PROJECT_NAME} +) + +install(FILES + include/bin.hpp + include/bin_impl.hpp + include/collapsing_dense_store.hpp + include/collapsing_dense_store_impl.hpp + include/collapsing_highest_dense_store.hpp + include/collapsing_highest_dense_store_impl.hpp + include/collapsing_lowest_dense_store.hpp + include/collapsing_lowest_dense_store_impl.hpp + include/ddsketch.hpp + include/ddsketch_impl.hpp + include/dense_store.hpp + include/dense_store_impl.hpp + include/index_mapping.hpp + include/index_mapping_factory.hpp + include/index_mapping_impl.hpp + include/linearly_interpolated_mapping.hpp + include/linearly_interpolated_mapping_impl.hpp + include/log_like_index_mapping.hpp + include/log_like_index_mapping_impl.hpp + include/logarithmic_mapping.hpp + include/logarithmic_mapping_impl.hpp + include/quadratically_interpolated_mapping.hpp + include/quadratically_interpolated_mapping_impl.hpp + include/sparse_store.hpp + include/sparse_store_impl.hpp + include/store_factory.hpp + include/unbounded_size_dense_store.hpp + include/unbounded_size_dense_store_impl.hpp + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/ddsketch/include/bin.hpp b/ddsketch/include/bin.hpp new file mode 100644 index 00000000..35ff23d1 --- /dev/null +++ b/ddsketch/include/bin.hpp @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef BIN_H +#define BIN_H + +#include +#include + +namespace datasketches { + +/** + * @class Bin + * @brief Represents a bucket of counts in a DDSketch store. + * + * A Bin corresponds to a mapped value index and its associated count. + * It is the fundamental unit used in DenseStore, SparseStore, and their variants. + */ +class Bin { +public: + /** + * @brief Construct a new Bin. + * @param index The index representing the mapped value bucket. + * @param count The number of samples in this bin. + */ + Bin(int index, double count); + + ~Bin() = default; + + /** + * @brief Equality operator. + * @param other The other bin to compare with. + * @return True if both bins have the same index and count. + */ + bool operator==(const Bin& other) const; + std::string to_string() const; + + /** + * @brief Get the count of this bin. + * @return The number of samples in the bin. + */ + double get_count() const; + + /** + * @brief Get the index of this bin. + * @return The integer index. 
+ */ + int get_index() const; + +private: + int index; + double count; +}; +} + +#include "bin_impl.hpp" + +#endif //BIN_H diff --git a/ddsketch/include/bin_impl.hpp b/ddsketch/include/bin_impl.hpp new file mode 100644 index 00000000..b3e50be7 --- /dev/null +++ b/ddsketch/include/bin_impl.hpp @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef BIN_IMPL_H +#define BIN_IMPL_H + +#include "bin.hpp" + +namespace datasketches { +inline Bin::Bin(int index, double count): index(index), count(count) {}; + +inline bool Bin::operator==(const Bin& other) const { + if (this == &other) { + return true; + } + return index == other.index && count == other.count; +}; + +inline double Bin::get_count() const { + return count; +} + +inline int Bin::get_index() const { + return index; +} + +inline std::string Bin::to_string() const { + return "Bin{index= " + std::to_string(index) + ", count= " + std::to_string(count) + "}"; +} + +} +#endif //BIN_IMPL_H diff --git a/ddsketch/include/collapsing_dense_store.hpp b/ddsketch/include/collapsing_dense_store.hpp new file mode 100644 index 00000000..e35d5e03 --- /dev/null +++ b/ddsketch/include/collapsing_dense_store.hpp @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef COLLAPSING_DENSE_STORE_HPP +#define COLLAPSING_DENSE_STORE_HPP + +#include "dense_store.hpp" + +namespace datasketches { + +/** + * @class CollapsingDenseStore + * @brief Common logic for capacity-bounded dense stores with tail-collapsing. 
+ */ +template +class CollapsingDenseStore : public DenseStore { +public: + + using size_type = typename DenseStore::size_type; + CollapsingDenseStore(); + + /** + * Copy assignment + * @param other store to be copied + * @return reference to this store + */ + CollapsingDenseStore& operator=(const CollapsingDenseStore& other); + + /** + * This method serializes the store into a given stream in a binary form + * @param os output stream + */ + void serialize(std::ostream& os) const; + + /** + * @brief Deserialize a store from a stream and return it as a new instance. + * @param is Input stream. + */ + static Derived deserialize(std::istream& is); + + /** + * Computes size needed to serialize the current state of the store. + * @return size in bytes needed to serialize this store + */ + int get_serialized_size_bytes() const; + + ~CollapsingDenseStore() = default; + + /** + * @brief Clear all contents of the store. + * + * Removes all bins and resets counts to zero while preserving configuration + * (e.g., capacity limits). After this call, @c total_count() is 0 and the + * store contains no non-empty bins. + */ + void clear(); + +protected: + bool is_collapsed; + + /** + * @brief Compute the resized backing-array length for a target index span. + * + * @param new_min_index Lowest bin index to be retained (inclusive). + * @param new_max_index Highest bin index to be retained (inclusive). + * @return size_type New backing-array capacity (in bins). 
+ */ + size_type get_new_length(size_type new_min_index, size_type new_max_index) const; +}; +} + +#include "collapsing_dense_store_impl.hpp" + +#endif //COLLAPSING_DENSE_STORE_HPP \ No newline at end of file diff --git a/ddsketch/include/collapsing_dense_store_impl.hpp b/ddsketch/include/collapsing_dense_store_impl.hpp new file mode 100644 index 00000000..7b1c5c14 --- /dev/null +++ b/ddsketch/include/collapsing_dense_store_impl.hpp @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef COLLAPSING_DENSE_STORE_IMPL_HPP +#define COLLAPSING_DENSE_STORE_IMPL_HPP + +#include "collapsing_dense_store.hpp" + +namespace datasketches { +template +CollapsingDenseStore::CollapsingDenseStore(): + DenseStore(), + is_collapsed(false) {} + +template +CollapsingDenseStore& CollapsingDenseStore::operator=(const CollapsingDenseStore& other) { + this->bins = other.bins; + this->offset = other.offset; + this->min_index = other.min_index; + this->max_index = other.max_index; + this->is_collapsed = other.is_collapsed; + + return *this; +} + +template +typename CollapsingDenseStore::size_type CollapsingDenseStore::get_new_length(size_type new_min_index, size_type new_max_index) const { + return std::min(DenseStore::get_new_length(new_min_index, new_max_index), N); +} + +template +void CollapsingDenseStore::clear() { + DenseStore::clear(); + is_collapsed = false; +} + +template +void CollapsingDenseStore::serialize(std::ostream& os) const { + if (this->is_empty()) { + return; + } + write(os, is_collapsed); + + this->serialize_common(os); +} + +template +Derived CollapsingDenseStore::deserialize(std::istream& is) { + Derived store; + + if (is.peek() == std::istream::traits_type::eof()) { + return store; + } + store.is_collapsed = read(is); + Derived::deserialize_common(store, is); + + return store; +} + +template +int CollapsingDenseStore::get_serialized_size_bytes() const { + // serialize() writes nothing for an empty store, so its serialized size is zero + int size_bytes = 0; + if (this->is_empty()) { + return size_bytes; + } + // is_collapsed flag, then the common section (range + bins) + size_bytes += static_cast(sizeof(is_collapsed)); + size_bytes += this->get_serialized_size_bytes_common(); + return size_bytes; +} + +} +#endif //COLLAPSING_DENSE_STORE_IMPL_HPP diff --git a/ddsketch/include/collapsing_highest_dense_store.hpp b/ddsketch/include/collapsing_highest_dense_store.hpp new file mode 100644 index 00000000..0397996b --- /dev/null +++ b/ddsketch/include/collapsing_highest_dense_store.hpp @@ -0,0 +1,86 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef COLLAPSING_HIGHEST_DENSE_STORE_HPP +#define COLLAPSING_HIGHEST_DENSE_STORE_HPP + +#include "collapsing_dense_store.hpp" + +namespace datasketches { +/** + * @class CollapsingHighestDenseStore + * @brief Capacity-bounded dense store collapsing from the higher end. + * @tparam N Maximum number of bins (capacity limit). + * @tparam Allocator Allocator type for internal storage. + * + * When capacity is exceeded, the highest-index bins are merged into one, + * preserving total count while reducing resolution in the high tail. + */ +template +class CollapsingHighestDenseStore : public CollapsingDenseStore, N, Allocator> { +public: + using size_type = typename CollapsingDenseStore::size_type; + + /** + * @brief Constructor. + */ + CollapsingHighestDenseStore(); + + /** + * @brief Create a heap-allocated copy of this store. + * @return Pointer to a new CollapsingHighestDenseStore with identical contents. + */ + CollapsingHighestDenseStore* copy() const; + + /** + * @brief Merge another store into this one. + * @param other Source store; its counts are added into this store. + * @note May trigger tail collapsing to respect the capacity @tparam N. 
+ */ + void merge(const CollapsingHighestDenseStore& other); + + /** + * @brief Bring base-class merge overloads into scope (e.g., generic Store/DenseStore merges). + */ + using DenseStore::merge; + +protected: + + /** + * @brief Normalize a raw bin index into this store's current window. + * + * If @p index exceeds the current @c max_index, the range is extended; if + * extension causes the store to collapse the high tail, the normalized index + * is the last bin. If @p index is below @c min_index, the range is extended + * on the low side. Otherwise returns the in-range offset. + * + * @param index Raw (possibly out-of-range) bin index. + * @return size_type In-range index (offset from @c offset) used for storage. + */ + size_type normalize(size_type index); + + + void adjust(size_type new_min_index, size_type new_max_index); + + friend class DenseStore; +}; +} + +#include "collapsing_highest_dense_store_impl.hpp" +#endif //COLLAPSING_HIGHEST_DENSE_STORE_HPP \ No newline at end of file diff --git a/ddsketch/include/collapsing_highest_dense_store_impl.hpp b/ddsketch/include/collapsing_highest_dense_store_impl.hpp new file mode 100644 index 00000000..2f5ffc40 --- /dev/null +++ b/ddsketch/include/collapsing_highest_dense_store_impl.hpp @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef COLLAPSING_HIGHEST_DENSE_STORE_IMPL_HPP +#define COLLAPSING_HIGHEST_DENSE_STORE_IMPL_HPP + +#include "collapsing_highest_dense_store.hpp" + +namespace datasketches { +template +CollapsingHighestDenseStore::CollapsingHighestDenseStore(): CollapsingDenseStore(){} + +template +CollapsingHighestDenseStore* CollapsingHighestDenseStore::copy() const { + using StoreAlloc = typename std::allocator_traits::template rebind_alloc>; + StoreAlloc alloc(this->bins.get_allocator()); + return new (alloc.allocate(1)) CollapsingHighestDenseStore(*this); +} + +template +void CollapsingHighestDenseStore::merge(const CollapsingHighestDenseStore& other) { + if (other.is_empty()) { + return; + } + + if (other.min_index < this->min_index || other.max_index > this->max_index) { + this->extend_range(other.min_index, other.max_index); + } + + size_type index = other.max_index; + for (; index > this->max_index && index >= other.min_index; index--) { + this->bins[this->bins.size() - 1] += other.bins[index - other.offset]; + } + for (; index > other.min_index; index--) { + this->bins[index - this->offset] += other.bins[index - other.offset]; + } + // This is a separate test so that the comparison in the previous loop is strict (>) and handles + // other.min_index = Integer.MIN_VALUE. 
+ if (index == other.min_index) { + this->bins[index - this->offset] += other.bins[index - other.offset]; + } +} + +template +typename CollapsingHighestDenseStore::size_type CollapsingHighestDenseStore::normalize(size_type index) { + if (index > this->max_index) { + if (this->is_collapsed) { + return this->bins.size() - 1; + } + this->extend_range(index); + if (this->is_collapsed) { + return this->bins.size() - 1; + } + } else if (index < this->min_index) { + this->extend_range(index); + } + + return index - this->offset; +} + +template +void CollapsingHighestDenseStore::adjust(size_type new_min_index, size_type new_max_index) { + if (new_max_index - new_min_index + 1 > static_cast(this->bins.size())) { + // The range of indices is too wide, buckets of highest indices need to be collapsed. + new_max_index = new_min_index + this->bins.size() - 1; + + if (new_max_index <= this->min_index) { + // There will be only one non-empty bucket. + const double total_count = this->get_total_count(); + this->reset_bins(); + this->offset = new_min_index; + this->max_index = new_max_index; + this->bins[this->bins.size() - 1] = total_count; + } else { + const size_type shift = this->offset - new_min_index; + if (shift > 0) { + // Collapse the buckets. + const double collapsed_count = this->get_total_count(new_max_index + 1, this->max_index); + this->reset_bins(new_max_index + 1, this->max_index); + this->bins[new_max_index - this->offset] += collapsed_count; + this->max_index = new_max_index; + // Shift the buckets to make room for new_min_index. + this->shift_bins(shift); + } else { + // Shift the buckets to make room for new_max_index. 
+ this->shift_bins(shift); + this->max_index = new_max_index; + } + } + this->min_index = new_min_index; + this->is_collapsed = true; + } else { + this->center_bins(new_min_index, new_max_index); + } +} +} + +#endif //COLLAPSING_HIGHEST_DENSE_STORE_IMPL_HPP \ No newline at end of file diff --git a/ddsketch/include/collapsing_lowest_dense_store.hpp b/ddsketch/include/collapsing_lowest_dense_store.hpp new file mode 100644 index 00000000..4a68764d --- /dev/null +++ b/ddsketch/include/collapsing_lowest_dense_store.hpp @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef COLLAPSING_LOWEST_DENSE_STORE_HPP +#define COLLAPSING_LOWEST_DENSE_STORE_HPP + +#include "collapsing_dense_store.hpp" + +namespace datasketches { +/** + * @class CollapsingLowestDenseStore + * @brief Capacity-bounded dense store collapsing from the lower end. + * @tparam N Maximum number of bins (capacity limit). + * @tparam Allocator Allocator type for internal storage. + * + * When capacity is exceeded, the lowest-index bins are merged into one, + * preserving total count while reducing resolution in the low tail. 
+ */ +template +class CollapsingLowestDenseStore : public CollapsingDenseStore, N, Allocator> { +public: + using size_type = typename CollapsingDenseStore::size_type; + + /** + * @brief Constructor. + */ + CollapsingLowestDenseStore(); + + /** + * @brief Create a heap-allocated copy of this store. + * @return Pointer to a new CollapsingLowestDenseStore with identical contents. + */ + CollapsingLowestDenseStore* copy() const; + + /** + * @brief Merge another store into this one. + * @param other Source store; its counts are added into this store. + * @note May trigger tail collapsing to respect the capacity @tparam N. + */ + void merge(const CollapsingLowestDenseStore& other); + + /** + * @brief Bring base-class merge overloads into scope (e.g., generic Store/DenseStore merges). + */ + using DenseStore::merge; + +protected: + + /** + * @brief Normalize a raw bin index into this store's current window. + * + * If @p index is below the current @c min_index, the range is extended; if + * the store is already collapsed or becomes collapsed, the normalized index + * is the first bin (0). If @p index is above @c max_index, the range is extended + * on the high side. Otherwise returns the in-range offset. + * + * @param index Raw (possibly out-of-range) bin index. + * @return size_type In-range index (offset from @c offset) used for storage. + */ + size_type normalize(size_type index); + + /** + * @brief Reframe the active index window to [new_min_index, new_max_index]. + * @param new_min_index New lowest retained index (inclusive). + * @param new_max_index New highest retained index (inclusive). + * @note Collapses the lowest bins into one when the requested window exceeds the capacity. 
+ */ + void adjust(size_type new_min_index, size_type new_max_index); + + friend class DenseStore; +}; +} + +#include "collapsing_lowest_dense_store_impl.hpp" + +#endif //COLLAPSING_LOWEST_DENSE_STORE_HPP diff --git a/ddsketch/include/collapsing_lowest_dense_store_impl.hpp b/ddsketch/include/collapsing_lowest_dense_store_impl.hpp new file mode 100644 index 00000000..0693e533 --- /dev/null +++ b/ddsketch/include/collapsing_lowest_dense_store_impl.hpp @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef COLLAPSING_LOWEST_DENSE_STORE_IMPL_HPP +#define COLLAPSING_LOWEST_DENSE_STORE_IMPL_HPP + +#include "collapsing_lowest_dense_store.hpp" + + +namespace datasketches { +template +CollapsingLowestDenseStore::CollapsingLowestDenseStore(): CollapsingDenseStore, N, Allocator>(){} + +template +CollapsingLowestDenseStore* CollapsingLowestDenseStore::copy() const { + using StoreAlloc = typename std::allocator_traits::template rebind_alloc>; + StoreAlloc alloc(this->bins.get_allocator()); + return new (alloc.allocate(1)) CollapsingLowestDenseStore(*this); +} + +template +void CollapsingLowestDenseStore::merge(const CollapsingLowestDenseStore& other) { + if (other.is_empty()) { + return; + } + + if (other.min_index < this->min_index || other.max_index > this->max_index) { + this->extend_range(other.min_index, other.max_index); + } + + size_type index = other.min_index; + for (; index < this->min_index && index <= other.max_index; index++) { + this->bins[0] += other.bins[index - other.offset]; + } + for (; index < other.max_index; index++) { + this->bins[index - this->offset] += other.bins[index - other.offset]; + } + // This is a separate test so that the comparison in the previous loop is strict (<) and handles + // other.max_index being the largest representable index. 
+ if (index == other.max_index) { + this->bins[index - this->offset] += other.bins[index - other.offset]; + } +} + +template +typename CollapsingLowestDenseStore::size_type CollapsingLowestDenseStore::normalize(size_type index) { + if (index < this->min_index) { + if (this->is_collapsed) { + return static_cast(0); + } + this->extend_range(index); + if (this->is_collapsed) { + return static_cast(0); + } + } else if (index > this->max_index) { + this->extend_range(index); + } + + return index - this->offset; +} + +template +void CollapsingLowestDenseStore::adjust(size_type new_min_index, size_type new_max_index) { + if (new_max_index - new_min_index + 1 > static_cast(this->bins.size())) { + // The range of indices is too wide, buckets of lowest indices need to be collapsed. + new_min_index = new_max_index - this->bins.size() + 1; + + if (new_min_index >= this->max_index) { + // There will be only one non-empty bucket. + const double total_count = this->get_total_count(); + this->reset_bins(); + this->offset = new_min_index; + this->min_index = new_min_index; + this->bins[0] = total_count; + } else { + const size_type shift = this->offset - new_min_index; + if (shift < 0) { + // Collapse the buckets. + const double collapsed_count = this->get_total_count(this->min_index, new_min_index - 1); + this->reset_bins(this->min_index, new_min_index - 1); + this->bins[new_min_index - this->offset] += collapsed_count; + this->min_index = new_min_index; + // Shift the buckets to make room for new_max_index. + this->shift_bins(shift); + } else { + // Shift the buckets to make room for new_min_index. 
+ this->shift_bins(shift); + this->min_index = new_min_index; + } + } + this->max_index = new_max_index; + this->is_collapsed = true; + } else { + this->center_bins(new_min_index, new_max_index); + } +} + +} + +#endif //COLLAPSING_LOWEST_DENSE_STORE_IMPL_HPP diff --git a/ddsketch/include/ddsketch.hpp b/ddsketch/include/ddsketch.hpp new file mode 100644 index 00000000..af120fa9 --- /dev/null +++ b/ddsketch/include/ddsketch.hpp @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef _DDSKETCH_HPP_ +#define _DDSKETCH_HPP_ + +#include +#include +#include +#include "store_factory.hpp" +#include "common_defs.hpp" +#include "memory_operations.hpp" + +namespace datasketches { + +/** + * @class DDSketch + * @brief A @c DDSketch with relative-error guarantees. This sketch computes quantile values + * with an approximation error that is relative to the actual quantile value. It works with both + * positive and negative input values. +* DDSketch works by mapping floating-point input values to bins and counting the number + * of values for each bin. + * + * @tparam Store underlying data structure that keeps track of bin counts. + * @tparam Mapping maps an index to its corresponding bin. 
+ */ +template +class DDSketch { +public: + using vector_double = std::vector; + using T = typename Store::bins_type::value_type; + + /** + * Constructs an initially empty quantile sketch using the specified {@link IndexMapping} and + * {@link Store}. + * + * @param relative_accuracy sets the relative accuracy of the sketch. + * For instance, using {@code DDSketch} with a relative accuracy guarantee set to 1%, if the + * expected quantile value is 100, the computed quantile value is guaranteed to be between 99 and + * 101. + */ + explicit DDSketch(const double& relative_accuracy); + + /** + * Constructs an initially empty quantile sketch using the specified {@link IndexMapping}. + */ + explicit DDSketch(const Mapping& index_mapping); + + /** + * Adds a value to the sketch. + * + * @param value the value to be added. + * @param count the (optional) count to increase by. Default is 1.0. + */ + void update(const double& value, const double& count = 1.0); + + /** + * @brief Merge another {@link DDSketch} into this one. + * + * @param other DDSketch; its counts are added into this sketch. + * @tparam OtherStore type of the other store. + */ + template + void merge(const DDSketch& other); + + /** + * @brief Computes the rank of @p item in [0,1]. + * Defined as approximately (# of values ≤ @p item) / total_count, computed from + * the sketch’s binned counts. Monotone in @p item and approximately the inverse + * of @c get_quantile(). + */ + double get_rank(const double& item) const; + + /** + * @brief Computes the quantile value for the normalized rank @p rank in [0,1]. + * + * Returns a value v such that (approximately) @c get_rank(v) ≥ @p rank. + */ + double get_quantile(const double& rank) const; + + vector_double get_PMF(const double* split_points, uint32_t size) const; + + vector_double get_CDF(const double* split_points, uint32_t size) const; + + + bool is_empty() const; + + /** + * @brief Clear all contents of the sketch. + * + * Calls clear() on the underlying stores. 
+   */
+  void clear();
+
+  /**
+   * @return the total count held by the sketch: the zero bucket plus both stores.
+   */
+  double get_count() const;
+
+  /**
+   * @return Approximate sum of all inserted values, computed from the representative
+   * value of each bin weighted by its count. Values counted in the zero bucket
+   * contribute nothing.
+   */
+  double get_sum() const;
+  /**
+   * @return Approximate minimum of all inserted values: the representative value of
+   * the lowest occupied bin, or 0 when only the zero bucket lies below the positive bins.
+   */
+  double get_min() const;
+  /**
+   * @return Approximate maximum of all inserted values: the representative value of
+   * the highest occupied bin, or 0 when only the zero bucket lies above the negative bins.
+   */
+  double get_max() const;
+
+  /**
+   * This method serializes the sketch into a given stream in a binary form.
+   * The index mapping is written first, then the zero count, then the positive and
+   * the negative store, each preceded by its serialized size.
+   * @param os output stream
+   */
+  void serialize(std::ostream& os) const;
+
+  /**
+   * @brief Deserialize a sketch from a stream.
+   *
+   * This is a static factory: it builds and returns a new sketch and does not
+   * modify any existing one.
+   * @param is Input stream positioned at data produced by serialize().
+   * @return the reconstructed sketch.
+   */
+  static DDSketch deserialize(std::istream& is);
+
+
+  /**
+   * Computes size in bytes needed to serialize the current state of the sketch.
+   * @return size in bytes needed to serialize this sketch
+   */
+  int get_serialized_size_bytes() const;
+
+  /**
+   * @return a human-readable summary: the index mapping, the indexable range,
+   * both stores and the zero count.
+   */
+  template>
+  string to_string() const;
+
+  /**
+   * @return true if both stores, the index mapping, the zero count and the indexable
+   * range all compare equal.
+   */
+  bool operator==(const DDSketch& other) const;
+protected:
+
+  /**
+   * Protected constructor, meant to be used internally only.
+ * + * @param positive_store + * @param negative_store + * @param mapping + * @param zero_count + * @param min_indexed_value + */ + DDSketch(const Store& positive_store, const Store& negative_store, const Mapping& mapping, const double& zero_count = 0.0, const double& min_indexed_value = 0.0); + Store positive_store; + Store negative_store; + Mapping index_mapping; + + double zero_count; + const double min_indexed_value; + const double max_indexed_value; + + void check_value_trackable(const double& value) const; + + template + void check_mergeability(const DDSketch& other) const; + + double get_quantile(const double& rank, const double& count) const; + + static inline void check_split_pints(const double* values, uint32_t size); +}; + +} /* namespace datasketches */ + +#include "ddsketch_impl.hpp" + +#endif // _DDSKETCH_HPP_ diff --git a/ddsketch/include/ddsketch_impl.hpp b/ddsketch/include/ddsketch_impl.hpp new file mode 100644 index 00000000..eed7c6d5 --- /dev/null +++ b/ddsketch/include/ddsketch_impl.hpp @@ -0,0 +1,318 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+#ifndef DDSKETCH_IMPL_H
+#define DDSKETCH_IMPL_H
+
+#include
+#include
+#include "bin.hpp"
+#include
+#include "ddsketch.hpp"
+#include "store_factory.hpp"
+namespace datasketches {
+// Builds the index mapping from the requested relative accuracy and delegates to
+// the mapping-based constructor.
+template
+DDSketch::DDSketch(const double& relative_accuracy): DDSketch(Mapping(relative_accuracy)) {}
+
+// Starts with default-constructed stores and an empty zero bucket; the indexable
+// range is taken from the mapping.
+template
+DDSketch::DDSketch(const Mapping& index_mapping):
+  index_mapping(index_mapping),
+  zero_count(0),
+  min_indexed_value(index_mapping.min_indexable_value()),
+  max_indexed_value(index_mapping.max_indexable_value())
+{}
+
+
+// Assembles a sketch from already-built parts (this is what deserialize() uses).
+// The arguments are const references, so they are copied. The previous std::move
+// calls on them could not move anything (moving from a const lvalue selects the copy
+// constructor) and made the later reads of `mapping` look like use-after-move.
+// Inside the initializers the unqualified names denote the parameters, which shadow
+// the members of the same name.
+template
+DDSketch::DDSketch(const Store& positive_store, const Store& negative_store, const Mapping& mapping, const double& zero_count, const double& min_indexed_value):
+  positive_store(positive_store),
+  negative_store(negative_store),
+  index_mapping(mapping),
+  zero_count(zero_count),
+  min_indexed_value(std::max(min_indexed_value, mapping.min_indexable_value())),
+  max_indexed_value(mapping.max_indexable_value()) {}
+
+
+// Throws std::invalid_argument unless -max_indexed_value <= value <= max_indexed_value.
+// The test is written in the positive form on purpose: every ordered comparison with
+// NaN is false, so the previous "value < -max || value > max" let NaN through and
+// update() then counted it as a zero.
+template
+void DDSketch::check_value_trackable(const double& value) const {
+  if (!(value >= -max_indexed_value && value <= max_indexed_value)) {
+    throw std::invalid_argument("input value is outside the range that is tracked by the sketch.");
+  }
+}
+
+// Throws std::invalid_argument when the other sketch uses a different index mapping:
+// bin indices are only comparable under the same mapping.
+template
+template
+void DDSketch::check_mergeability(const DDSketch& other) const {
+  if (index_mapping != other.index_mapping) {
+    throw std::invalid_argument("sketches are not mergeable because they do not use the same index mappings.");
+  }
+}
+
+// Adds `count` occurrences of `value`.
+// Positive and negative values go to separate stores, both indexed by magnitude;
+// magnitudes not above min_indexed_value are accumulated in the zero bucket.
+// Throws std::invalid_argument for an untrackable value or an invalid count.
+template
+void DDSketch::update(const double& value, const double& count) {
+  check_value_trackable(value);
+
+  // A NaN count is not caught by the "< 0.0" test below and would poison every total.
+  if (std::isnan(count)) {
+    throw std::invalid_argument("count cannot be NaN.");
+  }
+  if (count < 0.0) {
+    throw std::invalid_argument("count cannot be negative.");
+  }
+
+  if (value > min_indexed_value) {
+    positive_store.add(index_mapping.index(value), count);
+  } else if (value < -min_indexed_value) {
+    negative_store.add(index_mapping.index(-value), count);
+  } else {
+    zero_count += count;
+  }
+}
+
+template
+template
+void DDSketch::merge(const DDSketch& other) { +
check_mergeability(other);
+  negative_store.merge(other.negative_store);
+  positive_store.merge(other.positive_store);
+  zero_count += other.zero_count;
+}
+
+// True when nothing has been counted in the zero bucket or in either store.
+template
+bool DDSketch::is_empty() const {
+  return zero_count == 0.0 && positive_store.is_empty() && negative_store.is_empty();
+}
+
+// Empties both stores and the zero bucket; the index mapping is left untouched.
+template
+void DDSketch::clear() {
+  negative_store.clear();
+  positive_store.clear();
+  zero_count = 0.0;
+}
+
+// Total count: zero bucket plus the totals of both stores.
+template
+double DDSketch::get_count() const {
+  return zero_count + negative_store.get_total_count() + positive_store.get_total_count();
+}
+
+// Approximate sum: each bin contributes its representative value times its count.
+// The negative store is indexed by magnitude, hence the subtraction.
+template
+double DDSketch::get_sum() const {
+  double sum = 0.0;
+  for (const Bin& bin : negative_store) {
+    sum -= index_mapping.value(bin.get_index()) * bin.get_count();
+  }
+  for (const Bin& bin : positive_store) {
+    sum += index_mapping.value(bin.get_index()) * bin.get_count();
+  }
+  return sum;
+}
+
+// Approximate minimum. The negative store is indexed by magnitude, so its highest
+// index is the most negative value.
+template
+double DDSketch::get_min() const {
+  if (!negative_store.is_empty()) {
+    return -index_mapping.value(negative_store.get_max_index());
+  }
+  if (zero_count > 0.0) {
+    return 0.0;
+  }
+  return index_mapping.value(positive_store.get_min_index());
+}
+
+// Approximate maximum; mirror image of get_min().
+template
+double DDSketch::get_max() const {
+  if (!positive_store.is_empty()) {
+    return index_mapping.value(positive_store.get_max_index());
+  }
+  if (zero_count > 0.0) {
+    return 0.0;
+  }
+  return -index_mapping.value(negative_store.get_min_index());
+}
+
+// Normalized rank of `item`: the count of all bins whose representative value is
+// <= item, divided by the total count.
+// Throws std::runtime_error on an empty sketch. The division used to yield 0/0 = NaN
+// there, whereas get_quantile() already throws for an empty sketch.
+template
+double DDSketch::get_rank(const double &item) const {
+  const double total_count = get_count();
+  if (total_count == 0.0) {
+    throw std::runtime_error("operation is undefined for an empty sketch");
+  }
+
+  double rank = 0.0;
+
+  // Walk the negative store from its highest index down, i.e. from the most negative
+  // value upwards, and stop at the first bin above `item`.
+  if (!negative_store.is_empty()) {
+    for (auto it = negative_store.rbegin(); it != negative_store.rend() && -index_mapping.value((*it).get_index()) <= item; ++it) {
+      rank += (*it).get_count();
+    }
+  }
+  if (item >= 0) {
+    rank += zero_count;
+  }
+  if (!positive_store.is_empty()) {
+    for (auto it = positive_store.begin(); it != positive_store.end() && index_mapping.value((*it).get_index()) <= item; ++it) {
+      rank += (*it).get_count();
+    }
+  }
+  return rank / total_count;
+}
+
+
+template
+double DDSketch::get_quantile(const
double& rank) const {
+  return get_quantile(rank, get_count());
+}
+
+// Value at normalized rank `rank`, given the total count `count`.
+// The bins are scanned in ascending value order (negative store from its highest
+// index down, then the zero bucket, then the positive store) and the first bin whose
+// cumulative count exceeds rank * (count - 1) supplies the result.
+// Throws std::invalid_argument for a rank outside [0, 1] and std::runtime_error
+// when count is zero.
+template
+double DDSketch::get_quantile(const double& rank, const double& count) const {
+  // Positive form so that a NaN rank is rejected here; it used to pass this test and
+  // only failed at the end of the scan with an unrelated message.
+  if (!(rank >= 0.0 && rank <= 1.0)) {
+    throw std::invalid_argument("rank must be in [0.0, 1.0]");
+  }
+
+  if (count == 0.0) {
+    throw std::runtime_error("no such element");
+  }
+
+  const double target_rank = rank * (count - 1.0);
+  double n = 0.0;
+
+  for (auto it = negative_store.rbegin(); it != negative_store.rend(); ++it) {
+    const Bin& bin = *it;
+    if ((n += bin.get_count()) > target_rank) {
+      return -index_mapping.value(bin.get_index());
+    }
+  }
+
+  if ((n += zero_count) > target_rank) {
+    return 0.0;
+  }
+
+  for (auto it = positive_store.begin(); it != positive_store.end(); ++it) {
+    const Bin& bin = *it;
+    if ((n += bin.get_count()) > target_rank) {
+      return index_mapping.value(bin.get_index());
+    }
+  }
+  // Only reachable when `count` does not match the counts actually held.
+  throw std::invalid_argument("no such element");
+}
+
+// Rank of every split point, followed by a final 1.0 (size + 1 entries).
+// Throws std::invalid_argument if the split points are NaN or not strictly increasing.
+template
+typename DDSketch::vector_double DDSketch::get_CDF(const double* split_points, uint32_t size) const {
+  check_split_pints(split_points, size);
+  vector_double ranks;
+  ranks.reserve(size + 1);
+  for (uint32_t i = 0; i < size; ++i) {
+    ranks.push_back(get_rank(split_points[i]));
+  }
+  ranks.push_back(1.0);
+  return ranks;
+}
+
+// Turns the CDF into per-interval masses by differencing it in place.
+// The loop runs from the back so that each entry still sees its unmodified predecessor.
+template
+typename DDSketch::vector_double DDSketch::get_PMF(const double* split_points, uint32_t size) const {
+  vector_double buckets = get_CDF(split_points, size);
+  for (uint32_t i = size; i > 0; --i) {
+    buckets[i] -= buckets[i - 1];
+  }
+
+  return buckets;
+}
+
+
+// Layout: index mapping, zero count, then the positive and the negative store, each
+// preceded by its serialized size.
+// Each size is computed once and reused; it used to be computed twice per store, with
+// the first result parked in a variable that was never read.
+template
+void DDSketch::serialize(std::ostream& os) const {
+  index_mapping.serialize(os);
+
+  write(os, zero_count);
+
+  const auto positive_store_size = positive_store.get_serialized_size_bytes();
+  write(os, positive_store_size);
+  positive_store.serialize(os);
+
+  const auto negative_store_size = negative_store.get_serialized_size_bytes();
+  write(os, negative_store_size);
+  negative_store.serialize(os);
+}
+
+template
+DDSketch
DDSketch::deserialize(std::istream &is) {
+  // Reads the fields in the order serialize() writes them: index mapping, zero count,
+  // then the size-prefixed positive and negative stores.
+  Mapping deserialized_index_mapping = Mapping::deserialize(is);
+  const auto deserialized_zero_count = read(is);
+
+  const auto positive_store_serialized_size = read(is);
+
+  // Each store is handed its own buffer so that it sees end-of-stream exactly at the
+  // end of its payload.
+  std::string pos_buf(positive_store_serialized_size, '\0');
+  is.read(&pos_buf[0], pos_buf.size());
+  // A short read leaves the rest of the buffer zero-filled; fail loudly instead of
+  // deserializing a truncated store.
+  if (is.fail()) {
+    throw std::runtime_error("error reading from std::istream");
+  }
+  std::stringstream pos_stream(pos_buf);
+  Store deserialized_positive_store = Store::deserialize(pos_stream);
+
+  const auto negative_store_serialized_size = read(is);
+  std::string neg_buf(negative_store_serialized_size, '\0');
+  is.read(&neg_buf[0], neg_buf.size());
+  if (is.fail()) {
+    throw std::runtime_error("error reading from std::istream");
+  }
+  std::stringstream neg_stream(neg_buf);
+  Store deserialized_negative_store = Store::deserialize(neg_stream);
+
+  DDSketch ddsketch(deserialized_positive_store, deserialized_negative_store, deserialized_index_mapping);
+  ddsketch.zero_count = deserialized_zero_count;
+  return ddsketch;
+}
+
+// Bytes needed by serialize(): the index mapping, both stores, the zero count and the
+// two per-store size prefixes.
+// NOTE(review): serialize() writes each prefix with the type returned by the store's
+// get_serialized_size_bytes(); confirm that this type really is 8 bytes wide, otherwise
+// the "2 * sizeof(double)" term overstates the size.
+template
+int DDSketch::get_serialized_size_bytes() const {
+  return index_mapping.get_serialized_size_bytes() +
+    positive_store.get_serialized_size_bytes() +
+    negative_store.get_serialized_size_bytes() +
+    sizeof(zero_count) +
+    2 * sizeof(double);
+}
+
+// Human-readable summary of the mapping, the indexable range, both stores and the
+// zero count.
+template
+template
+string DDSketch::to_string() const {
+  std::ostringstream os;
+  os << "### ddsketch summary:" << std::endl;
+  os << " Index mapping " << std::endl;
+  os << index_mapping.to_string();
+  os << " Min indexable value : " << min_indexed_value << std::endl;
+  os << " Max indexable value : " << max_indexed_value << std::endl;
+  os << " Positive store " << std::endl;
+  os << positive_store.to_string();
+  os << " Negative store " << std::endl;
+  os << negative_store.to_string();
+  os << " Zero count " << zero_count << std::endl;
+  os << "### End ddsketch summary" << std::endl;
+  return os.str();
+
+}
+
+
+// Member-wise equality over the stores, the mapping, the zero count and the range.
+template
+bool DDSketch::operator==(const DDSketch& other) const {
+  return positive_store == other.positive_store &&
+    negative_store == other.negative_store &&
+    index_mapping == other.index_mapping &&
+    zero_count == other.zero_count
&& + min_indexed_value == other.min_indexed_value && + max_indexed_value == other.max_indexed_value; +} + +template +void DDSketch::check_split_pints(const double *items, uint32_t size) { + for (uint32_t i = 0; i < size ; i++) { + if (std::isnan(items[i])) { + throw std::invalid_argument("Values must not be NaN"); + } + if ((i < (size - 1)) && !(items[i] < items[i + 1])) { + throw std::invalid_argument("Values must be unique and monotonically increasing"); + } + } +} + +} + +#endif diff --git a/ddsketch/include/dense_store.hpp b/ddsketch/include/dense_store.hpp new file mode 100644 index 00000000..54969a58 --- /dev/null +++ b/ddsketch/include/dense_store.hpp @@ -0,0 +1,343 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef DENSE_STORE_HPP +#define DENSE_STORE_HPP + +#include +#include "bin.hpp" +#include "common_defs.hpp" + +namespace datasketches { + +/** + * @class DenseStore + * @brief Contiguous integer-indexed bins backed by a growable array. + * + * @tparam Derived CRTP derived store. + * @tparam Allocator Allocator type for internal storage. + */ +template +class DenseStore { +public: + + /** + * @brief Bin storage type (contiguous counts). 
+ */ + using bins_type = std::vector::template rebind_alloc>; + + /** + * @brief Integer type for indices/lengths within this store. + */ + using size_type = int; + + // Forward declarations + /** + * @brief Forward iterator over non-empty bins (ascending index) + */ + class iterator; + + /** + * @brief Reverse iterator over non-empty bins (descending index) + */ + class reverse_iterator; + + /** + * @brief Increment bin @p index by 1. + */ + void add(int index); + + /** + * @brief Increment bin @p index by @p count. + */ + void add(int index, double count); + + /** + * @brief Increment index by count as specified by @p bin. + */ + void add(const Bin& bin); + + /** + * @brief Clear all contents of the store. + * + * Removes all bins and resets counts to zero while preserving configuration + * (e.g., capacity limits). After this call, @c total_count() is 0 and the + * store contains no non-empty bins. + */ + void clear(); + + bool is_empty() const; + + /** + * @brief Highest non-empty bin index. + */ + size_type get_max_index() const; + + /** + * @brief Lowest non-empty bin inde. + */ + size_type get_min_index() const; + + /** + * @brief Total count across all bins. + */ + double get_total_count() const; + + /** + * @brief Merge another dense store (same allocator) into this one. + * @tparam Store Derived type of the other dense store. + * @param other store; its counts are added here. + */ + template + void merge(const DenseStore& other); + + /** + * This method serializes the store into a given stream in a binary form + * @param os output stream + */ + void serialize(std::ostream& os) const; + + /** + * @brief Deserialize the store from a stream (replacing current contents). + * @param is Input stream. + */ + static Derived deserialize(std::istream& is); + + string to_string() const; + + bool operator==(const DenseStore& other) const; + + /** + * @brief Begin iterator over non-empty bins (ascending). 
+ */ + iterator begin() const; + + /** + * @brief End iterator over non-empty bins (ascending). + */ + iterator end() const; + + /** + * @brief Begin reverse iterator over non-empty bins (descending). + */ + reverse_iterator rbegin() const; + + /** + * @brief End reverse iterator over non-empty bins (descending). + */ + reverse_iterator rend() const; + + ~DenseStore() = default; + + // ---------------- Iterators ---------------- + + /** + * @class DenseStore::iterator + * @brief Input iterator yielding Bin values in ascending index order. + * + * Stable only while the store is not mutated. + */ + class iterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = Bin; + using difference_type = std::ptrdiff_t; + using pointer = Bin*; + using reference = Bin; + + /** + * @brief Construct positioned iterator (internal use). + */ + iterator(const bins_type& bins, const size_type& index, const size_type& max_index, const size_type& offset); + + /** + * @brief Assign from another iterator. + */ + iterator& operator=(const iterator& other); + + + /** + * @brief Pre-increment. + */ + iterator& operator++(); + + /** + * @brief Post-increment. + */ + iterator operator++(int); + + /** + * @brief Inequality comparison. + */ + bool operator!=(const iterator& other) const; + + /** + * @brief Dereference to the current Bin (index, count). + */ + reference operator*() const; + + private: + const bins_type& bins; + size_type index; + const size_type& max_index; + const size_type& offset; + }; + + /** + * @class DenseStore::reverse_iterator + * @brief Input iterator yielding Bin values in descending index order. + * + * Stable only while the store is not mutated. + */ + class reverse_iterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = Bin; + using difference_type = std::ptrdiff_t; + using pointer = Bin*; + using reference = Bin; + + /** + * @brief Construct positioned reverse iterator (internal use). 
+ */ + reverse_iterator(const bins_type& bins, size_type index, const size_type& min_index, const size_type& offset); + + /** + * @brief Assign from another reverse iterator. + */ + reverse_iterator& operator=(const reverse_iterator& other); + + /** + * @brief Pre-increment. + */ + reverse_iterator& operator++(); + + /** + * @brief Inequality comparison. + */ + bool operator!=(const reverse_iterator& other) const; + + /** + * @brief Dereference to the current Bin (index, count). + */ + reference operator*() const; + + private: + const bins_type& bins; + size_type index; + const size_type& min_index; + const size_type& offset; + }; + +protected: + bins_type bins; + size_type offset; + size_type min_index; + size_type max_index; + + const int array_length_growth_increment; + const int array_length_overhead; + + static constexpr int DEFAULT_ARRAY_LENGTH_GROWTH_INCREMENT = 64; + static constexpr double DEFAULT_ARRAY_LENGTH_OVERHEAD_RATIO = 0.1; + + // Protected constructors. This is a base class only and it is not meant to be instanced. + DenseStore(); + explicit DenseStore(const int& array_length_growth_increment); + explicit DenseStore(const int& array_length_growth_increment, const int& array_length_overhead); + DenseStore(const DenseStore& other) = default; + + /** + * @brief Total count in [@p from_index, @p to_index] (inclusive). + */ + double get_total_count(size_type from_index, size_type to_index) const; + + /** + * @brief Normalize a raw bin index into this store's current window. + */ + size_type normalize(size_type index); + + /** + * @brief Reframe the active index window to [new_min_index, new_max_index]. + */ + void adjust(size_type newMinIndex, size_type newMaxIndex); + + /** + * @brief Extend window to include @p index (may grow and shift). + */ + void extend_range(size_type index); + + /** + * @brief Extend window to include [@p new_min_index, @p new_max_index]. 
+ */ + void extend_range(size_type new_min_index, size_type new_max_index); + + /** + * @brief Shift bins by @p shift (positive: toward higher indices). + */ + void shift_bins(size_type shift); + + /** + * @brief Center bins for the target window [@p new_min_index, @p new_max_index]. + */ + void center_bins(size_type new_min_index, size_type new_max_index); + + /** + * @brief Compute the resized backing-array length for a target index span. + * + * @param new_min_index Lowest bin index to be retained (inclusive). + * @param new_max_index Highest bin index to be retained (inclusive). + * @return size_type New backing-array capacity (in bins). + */ + size_type get_new_length(size_type new_min_index, size_type new_max_index) const; + + /** + * @brief Zero all bins (keep capacity). + */ + void reset_bins(); + + /** + * @brief Zero bins in [@p from_index, @p to_index] (inclusive). + */ + void reset_bins(size_type from_index, size_type to_index); + + /** + * @brief Serialize fields common to all dense stores. + */ + void serialize_common(std::ostream& os) const; + + /** + * @brief Derialize fields common to all dense stores. + */ + static void deserialize_common(Derived& store, std::istream& is); + + /** + * Computes size needed to serialize the current state of the sketch. + * @return size in bytes needed to serialize this sketch + */ + int get_serialized_size_bytes_common() const; + + Derived& derived(); + const Derived& derived() const; +}; +} + +#include "dense_store_impl.hpp" + +#endif //DENSE_STORE_HPP diff --git a/ddsketch/include/dense_store_impl.hpp b/ddsketch/include/dense_store_impl.hpp new file mode 100644 index 00000000..48d2a5f6 --- /dev/null +++ b/ddsketch/include/dense_store_impl.hpp @@ -0,0 +1,454 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef DENSE_STORE_IMPL_HPP +#define DENSE_STORE_IMPL_HPP + +#include +#include + +#include "common_defs.hpp" + +namespace datasketches { + +template +constexpr int DenseStore::DEFAULT_ARRAY_LENGTH_GROWTH_INCREMENT; + +template +DenseStore::DenseStore() : + DenseStore(DEFAULT_ARRAY_LENGTH_GROWTH_INCREMENT) +{} + +template +DenseStore::DenseStore(const int& array_length_growth_increment) : + DenseStore(array_length_growth_increment, array_length_growth_increment * DEFAULT_ARRAY_LENGTH_OVERHEAD_RATIO) +{} + +template +DenseStore::DenseStore(const int& array_length_growth_increment, const int& array_length_overhead): + offset(0), + min_index(std::numeric_limits::max()), + max_index(std::numeric_limits::min()), + array_length_growth_increment(array_length_growth_increment), + array_length_overhead(array_length_overhead) +{} + +template +void DenseStore::add(int index) { + add(index, 1); +} + +template +void DenseStore::add(int index, double count) { + if (count == 0) { + return; + } + const size_type array_index = derived().normalize(index); + bins[array_index] += count; +} + +template +void DenseStore::add(const Bin& bin) { + if (bin.get_count() == 0) { + return; + } + add(bin.get_index(), bin.get_count()); +} + +template +void DenseStore::clear() { + bins.clear(); + min_index = std::numeric_limits::max(); + max_index = std::numeric_limits::min(); + offset = 0; +} + +template 
+bool DenseStore::is_empty() const { + // return get_total_count() == 0; + return max_index < min_index; +} + +template +typename DenseStore::size_type DenseStore::get_max_index() const { + if (is_empty()) { + throw std::runtime_error("store is empty"); + } + return max_index; +} + +template +typename DenseStore::size_type DenseStore::get_min_index() const { + if (is_empty()) { + throw std::runtime_error("store is empty"); + } + return min_index; +} + +template +double DenseStore::get_total_count() const { + return get_total_count(min_index, max_index); +} + +template +double DenseStore::get_total_count(size_type from_index, size_type to_index) const { + if (is_empty()) { + return 0; + } + + double total_count = 0; + size_type from_array_index = std::max(from_index - offset, static_cast(0)); + size_type to_array_index = std::min(to_index - offset, static_cast(bins.size() - 1)); + for (size_type index = from_array_index; index <= to_array_index; index++) { + total_count += bins[index]; + } + + return total_count; +} + + +template +template +void DenseStore::merge(const DenseStore& other) { + for (const Bin& bin : other) { + add(bin); + } +} + +template +typename DenseStore::iterator DenseStore::begin() const { + if (is_empty()) { + return end(); + } + return DenseStore::iterator(this->bins, this->min_index, this->max_index, this->offset); +} + +template +typename DenseStore::iterator DenseStore::end() const { + return DenseStore::iterator(this->bins, this->max_index + 1, this->max_index, this->offset); +} + +template +typename DenseStore::reverse_iterator DenseStore::rbegin() const { + if (is_empty()) { + return rend(); + } + return DenseStore::reverse_iterator(this->bins, this->max_index, this->min_index, this->offset); +} + +template +typename DenseStore::reverse_iterator DenseStore::rend() const { + return DenseStore::reverse_iterator(this->bins, this->min_index - 1, this->min_index, this->offset); +} + + +template +typename DenseStore::size_type 
DenseStore::normalize(size_type index) { + if (index < get_min_index() || index > get_max_index()) { + extend_range(index, index); + } + return index - offset; +} + +template +void DenseStore::adjust(size_type newMinIndex, size_type newMaxIndex) { + derived().adjust(newMinIndex, newMaxIndex); +} + +template +void DenseStore::extend_range(size_type index) { + extend_range(index, index); +} + +template +void DenseStore::extend_range(size_type new_min_index, size_type new_max_index) { + new_min_index = std::min(new_min_index, min_index); + new_max_index = std::max(new_max_index, max_index); + + if (is_empty()) { + const size_type initial_length = derived().get_new_length(new_min_index, new_max_index); + if (bins.empty() || initial_length >= static_cast(bins.size())) { + bins.resize(initial_length); + } + offset = new_min_index; + min_index = new_min_index; + max_index = new_max_index; + adjust(new_min_index, new_max_index); + } else if (new_min_index >= offset && new_max_index < offset + static_cast(bins.size())) { + min_index = new_min_index; + max_index = new_max_index; + } else { + // To avoid shifting too often when nearing the capacity of the array, we may grow it before + // we actually reach the capacity. 
+    const size_type new_length = derived().get_new_length(new_min_index, new_max_index);
+    if (new_length > static_cast(bins.size())) {
+      bins.resize(new_length);
+    }
+
+    adjust(new_min_index, new_max_index);
+  }
+}
+
+// Moves the occupied bins [min_index, max_index] by `shift` array slots (positive:
+// toward higher array positions), zeroes the slots they vacate and moves `offset` the
+// other way, so every bin keeps its logical index.
+// Source and destination overlap whenever |shift| is smaller than the occupied span.
+// std::copy requires the destination start to lie outside the source range, which a
+// right shift (and a zero shift) violated; right shifts therefore copy back to front.
+template
+void DenseStore::shift_bins(size_type shift) {
+  const size_type min_arr_index = min_index - offset;
+  const size_type max_arr_index = max_index - offset;
+  const auto first = bins.begin() + min_arr_index;
+  const auto last = bins.begin() + max_arr_index + 1;
+
+  if (shift > 0) {
+    std::copy_backward(first, last, last + shift);
+    std::fill(first, first + shift, 0.);
+  } else if (shift < 0) {
+    std::copy(first, last, first + shift);
+    std::fill(last + shift, last, 0.);
+  }
+
+  offset -= shift;
+}
+
+// Shifts the bins so that the middle of [new_min_index, new_max_index] lands on the
+// middle of the backing array, then adopts the new window.
+template
+void DenseStore::center_bins(size_type new_min_index, size_type new_max_index) {
+  const size_type middle_index = new_min_index + (new_max_index - new_min_index + 1) / 2;
+  // Bring the array length into the signed index type before mixing it with indices.
+  // The whole expression used to be evaluated as unsigned, so a negative shift relied
+  // on wrap-around followed by an out-of-range conversion back to int.
+  const size_type half_length = bins.size() / 2;
+  shift_bins(offset + half_length - middle_index);
+
+  min_index = new_min_index;
+  max_index = new_max_index;
+}
+
+// Backing-array length for the window [new_min_index, new_max_index]: the span plus
+// the configured overhead, rounded up to a multiple of the growth increment.
+template
+typename DenseStore::size_type DenseStore::get_new_length(size_type new_min_index, size_type new_max_index) const {
+  const size_type desired_length = new_max_index - new_min_index + 1;
+  return ((desired_length + array_length_overhead - 1) / array_length_growth_increment + 1) * array_length_growth_increment;
+}
+
+// Zeroes every bin of the current window.
+template
+void DenseStore::reset_bins() {
+  reset_bins(min_index, max_index);
+}
+
+// Zeroes the bins with logical indices in [from_index, to_index] (inclusive).
+template
+void DenseStore::reset_bins(size_type from_index, size_type to_index) {
+  std::fill(bins.begin() + from_index - offset, bins.begin() + to_index - offset + 1, 0);
+}
+
+// Equality of representation: same offset, same window and same backing array.
+template
+bool DenseStore::operator==(const DenseStore& other) const {
+  return offset == other.offset &&
+    min_index == other.min_index &&
+    max_index == other.max_index &&
+    bins == other.bins;
+}
+
+// CRTP downcast to the concrete store.
+template
+Derived& DenseStore::derived() {
+  return static_cast(*this);
+}
+
+// CRTP downcast to the concrete store (const overload).
+template
+const Derived& DenseStore::derived() const {
+  return static_cast(*this);
+}
+
+template +DenseStore::iterator::iterator(const bins_type& bins, const size_type& index, const size_type& max_index, const size_type& offset): +bins(bins), +index(index), +max_index(max_index), +offset(offset) +{} + +template +typename DenseStore::iterator& DenseStore::iterator::operator=(const iterator& other) { + if (this != &other) { + // Note: we can't assign to reference members, so we only copy the index + // The reference members (bins, max_index, offset) should already point to the same objects + this->index = other.index; + } + return *this; +} + +template +typename DenseStore::iterator& DenseStore::iterator::operator++() { + do { + ++this->index; + } while (this->index <= this->max_index && this->bins[this->index - this->offset] == 0); + return *this; +} + +template +typename DenseStore::iterator DenseStore::iterator::operator++(int) { + iterator temp = *this; + ++(*this); + return temp; +} + +template +bool DenseStore::iterator::operator!=(const iterator& other) const { + return this->index != other.index; +} + +template +typename DenseStore::iterator::reference DenseStore::iterator::operator*() const { + return Bin(this->index, this->bins[this->index - this->offset]); +} + +template +DenseStore::reverse_iterator::reverse_iterator(const bins_type& bins, size_type index, const size_type& min_index, const size_type& offset): +bins(bins), +index(index), +min_index(min_index), +offset(offset) +{} + +template +typename DenseStore::reverse_iterator& DenseStore::reverse_iterator::operator=(const reverse_iterator& other) { + if (this != &other) { + // Note: we can't assign to reference members, so we only copy the index + // The reference members (bins, min_index, offset) should already point to the same objects + this->index = other.index; + } + return *this; +} + +template +typename DenseStore::reverse_iterator& DenseStore::reverse_iterator::operator++() { + do { + --this->index; + } while (this->index >= this->min_index && this->bins[this->index - 
this->offset] == 0);
+  return *this;
+}
+
+// Iterators compare by position only.
+template
+bool DenseStore::reverse_iterator::operator!=(const reverse_iterator& other) const {
+  return this->index != other.index;
+}
+
+// Builds the (index, count) pair for the current position.
+template
+typename DenseStore::reverse_iterator::reference DenseStore::reverse_iterator::operator*() const {
+  return Bin(this->index, this->bins[this->index - this->offset]);
+}
+
+// Forwards to the concrete store.
+template
+void DenseStore::serialize(std::ostream& os) const {
+  derived().serialize(os);
+}
+
+// Forwards to the concrete store.
+template
+Derived DenseStore::deserialize(std::istream& is) {
+  return Derived::deserialize(is);
+}
+
+// Writes the fields shared by all dense stores; an empty store writes nothing.
+// Layout: min_index, max_index, offset, allocated bin count, entry count, then one
+// (index, count) pair per entry.
+template
+void DenseStore::serialize_common(std::ostream& os) const {
+  if (is_empty()) {
+    return;
+  }
+
+  // Serialize the range information
+  write(os, min_index);
+  write(os, max_index);
+  write(os, offset);
+
+  // Serialize the allocated length of the bins array
+  const size_type num_bins = bins.size();
+  write(os, num_bins);
+
+  // The entry count must equal the number of pairs written below, so it is taken from
+  // the same iteration. It used to be derived from a "count > 1e-16" scan of the
+  // array, which disagrees with the iterator (it skips only exact zeros) for tiny counts.
+  size_type non_empty_bins = 0;
+  for (auto it = begin(); it != end(); ++it) {
+    ++non_empty_bins;
+  }
+  write(os, non_empty_bins);
+
+  for (const Bin& bin : *this) {
+    write(os, bin.get_index());
+    write(os, bin.get_count());
+  }
+}
+
+// Reads the fields written by serialize_common into `store`. A stream that is already
+// at its end denotes an empty store and leaves `store` untouched.
+// Throws std::runtime_error when the stream ends early or the data is inconsistent.
+template
+void DenseStore::deserialize_common(Derived& store, std::istream& is) {
+  if (is.peek() == std::istream::traits_type::eof()) {
+    return;
+  }
+  // Deserialize the range information
+  store.min_index = read(is);
+  store.max_index = read(is);
+  store.offset = read(is);
+
+  // Deserialize the bins array
+  const auto num_bins = read(is);
+  if (is.fail() || num_bins < 0) {
+    throw std::runtime_error("corrupted dense store: invalid number of bins");
+  }
+  store.bins.resize(num_bins, 0.0);
+
+  const auto non_empty_bins = read(is);
+  // Read the actual bin counts
+  for (size_type i = 0; i < non_empty_bins; ++i) {
+    const auto index = read(is);
+    const auto count = read(is);
+    // The index comes from the stream, so check it before it is used as a subscript.
+    const auto array_index = index - store.offset;
+    if (is.fail() || array_index < 0 || array_index >= num_bins) {
+      throw std::runtime_error("corrupted dense store: bin index out of range");
+    }
+    store.bins[array_index] = count;
+  }
+}
+
+template
+int DenseStore::get_serialized_size_bytes_common() const {
+  if (is_empty()) {
+    return 0;
+  }
+
+
+  // Keep the running total in size_t, cast to int at the end (the public API returns int)
+  size_t size_bytes = 0;
+
+  // Range
metadata written by serialize_common
+  size_bytes += sizeof(this->min_index); // min_index
+  size_bytes += sizeof(this->max_index); // max_index
+  size_bytes += sizeof(this->offset); // offset
+
+  // `serialize_common` writes the number of bins (the full allocated length)
+  size_type num_bins = static_cast(this->bins.size());
+  (void)num_bins; // silence unused warning in templates
+  size_bytes += sizeof(num_bins);
+
+  // Count the entries the way serialize_common emits them: one (index, count) pair
+  // per bin yielded by iteration. A separate "count > 1e-16" scan can come out smaller
+  // than the number of pairs actually written, which makes the reported size too small.
+  size_type non_empty_bins = 0;
+  for (auto it = this->begin(); it != this->end(); ++it) {
+    ++non_empty_bins;
+  }
+
+  // It writes the non_empty_bins counter itself
+  size_bytes += sizeof(non_empty_bins);
+
+  // For each non-empty bin, serialize_common writes: index (int) + count (double)
+  size_bytes += static_cast(non_empty_bins) * sizeof(int);
+  size_bytes += static_cast(non_empty_bins) * sizeof(double);
+
+  // Final cast matches the serialized-size field type used elsewhere
+  return static_cast(size_bytes);
+}
+
+// Short human-readable summary: store type, allocated bin count and index window.
+template
+string DenseStore::to_string() const {
+  std::ostringstream os;
+  os << " Type : dense store " << std::endl;
+  os << " Bins number :" << bins.size() << std::endl;
+  os << " Min index :" << min_index << std::endl;
+  os << " Max index :" << max_index << std::endl;
+  return os.str();
+}
+}
+
+#endif //DENSE_STORE_IMPL_HPP
diff --git a/ddsketch/include/index_mapping.hpp b/ddsketch/include/index_mapping.hpp
new file mode 100644
index 00000000..a8259c72
--- /dev/null
+++ b/ddsketch/include/index_mapping.hpp
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. +*/ + +#ifndef INDEX_MAPPING_HPP +#define INDEX_MAPPING_HPP +#include + +namespace datasketches { + +enum class IndexMappingLayout : uint8_t { + LOG, + LOG_LINEAR, + LOG_QUADRATIC, + LOG_CUBIC, + LOG_QUARTIC, +}; + +std::ostream& operator<<(std::ostream& os, const IndexMappingLayout& obj); + +/** + * @class IndexMapping + * @brief CRTP base exposing the value/index transform API. + * @tparam Derived concrete mapping type implementing the operations. + * + * Provides a uniform interface to map doubles to integer bin indices and back, + * with bounds and relative-accuracy queries. + */ +template +class IndexMapping { +public: + + /** + * @brief Map a value to its integer bin index. + * @param value input value + * @return index (may throw if out of range) + */ + int index(const double& value) const; + + /** + * @brief Representative value for a bin @p index. + * @param index bin index + * @return representative value (inverse mapping) + */ + double value(int index) const; + + /** + * @brief Lower bound of values mapped to @p index. + * @param index bin index + * @return inclusive lower bound + */ + double lower_bound(int index) const; + + /** + * @brief Upper bound of values mapped to @p index. + * @param index bin index + * @return exclusive upper bound + */ + double upper_bound(int index) const; + + /** + * @brief Target relative accuracy (multiplicative error bound). + * @return relative accuracy in (0,1) + */ + double get_relative_accuracy() const; + + /** + * @brief Smallest trackable value. 
+ * @return minimum indexable value + */ + double min_indexable_value() const; + + /** + * @brief Largest trackable value. + * @return maximum indexable value + */ + double max_indexable_value() const; + + /** + * @brief Serialize this mapping to a stream. + * @param os output stream + */ + void serialize(std::ostream& os) const; + + /** + * @brief Deserialize a concrete {@link Derived} mapping from a stream. + * @param is input stream + * @return reconstructed mapping + */ + static Derived deserialize(std::istream& is); + + ~IndexMapping() = default; + +protected: + + /** + * @brief Downcast to {@link Derived}. + * @return reference to derived + */ + Derived& derived(); + + /** + * @brief Const downcast to {@link Derived}. + * @return const reference to derived + */ + const Derived& derived() const; +}; + +} + +#include "index_mapping_impl.hpp" +#endif //INDEX_MAPPING_HPP diff --git a/ddsketch/include/index_mapping_factory.hpp b/ddsketch/include/index_mapping_factory.hpp new file mode 100644 index 00000000..c7a1855e --- /dev/null +++ b/ddsketch/include/index_mapping_factory.hpp @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+*/ + +#ifndef INDEX_MAPPING_FACTORY_HPP +#define INDEX_MAPPING_FACTORY_HPP + +#include + + +namespace datasketches { +template +class index_mapping_factory { +public: + template + static std::unique_ptr new_mapping(Args&&... args) + { + return std::unique_ptr(new IndexMapping(std::forward(args)...)); + } +}; +} + +#endif //INDEX_MAPPING_FACTORY_HPP diff --git a/ddsketch/include/index_mapping_impl.hpp b/ddsketch/include/index_mapping_impl.hpp new file mode 100644 index 00000000..96109f20 --- /dev/null +++ b/ddsketch/include/index_mapping_impl.hpp @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+*/ + +#ifndef INDEX_MAPPING_IMPL_HPP +#define INDEX_MAPPING_IMPL_HPP + +#include + +#include "common_defs.hpp" +#include "index_mapping.hpp" + +namespace datasketches { + +inline std::ostream& operator<<(std::ostream& os, const IndexMappingLayout& obj) { + switch (obj) { + case IndexMappingLayout::LOG: + return os << "LOG"; + case IndexMappingLayout::LOG_LINEAR: + return os << "LOG_LINEAR"; + case IndexMappingLayout::LOG_QUADRATIC: + return os << "LOG_QUADRATIC"; + case IndexMappingLayout::LOG_CUBIC: + return os << "LOG_CUBIC"; + case IndexMappingLayout::LOG_QUARTIC: + return os << "LOG_QUARTIC"; + default: + return os << "INVALID"; + } +} + +template +Derived IndexMapping::deserialize(std::istream &is) { + const auto gamma = read(is); + const auto index_offset = read(is); + + + return Derived(gamma, index_offset); +} + +template +Derived& IndexMapping::derived() { + return *static_cast(this); +} + +template +const Derived& IndexMapping::derived() const { + return *static_cast(this); +} + +template +void IndexMapping::serialize(std::ostream &os) const { + derived().serialize(os); +} + +template +double IndexMapping::get_relative_accuracy() const { + return derived().get_relative_accuracy(); +} +} + + +#endif //INDEX_MAPPING_IMPL_HPP diff --git a/ddsketch/include/linearly_interpolated_mapping.hpp b/ddsketch/include/linearly_interpolated_mapping.hpp new file mode 100644 index 00000000..22ff947d --- /dev/null +++ b/ddsketch/include/linearly_interpolated_mapping.hpp @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. +*/ + +#ifndef LINEARLY_INTERPOLATED_MAPPING_HPP +#define LINEARLY_INTERPOLATED_MAPPING_HPP + +#include "log_like_index_mapping.hpp" + + +namespace datasketches { +/** + * @class LinearlyInterpolatedMapping + * A fast {@link IndexMapping} that approximates the memory-optimal one (namely {@link + * LogarithmicMapping}) by extracting the floor value of the logarithm to the base 2 from the binary + * representations of floating-point values and linearly interpolating the logarithm in-between. + */ +class LinearlyInterpolatedMapping : public LogLikeIndexMapping { +public: + + /** + * Constructs a mapping that targets the given relative accuracy. + * + * @param relative_accuracy target relative accuracy; must be strictly between 0 and 1 + */ + explicit LinearlyInterpolatedMapping(const double& relative_accuracy); + + /** + * Overloaded constructor. 
+ * This is meant to be used when deserializing only + * + * @param gamma + * @param index_offset + */ + LinearlyInterpolatedMapping(const double& gamma, const double& index_offset); + + double log(const double& value) const; + + double log_inverse(const double& index) const; + + IndexMappingLayout layout() const; + + static constexpr double BASE() { return 2.0; } + static constexpr double CORRECTING_FACTOR() { return 1.44269504088896340735; } + +private: + static double index_offset_shift(const double& relative_accuracy); +}; +} + +#include "linearly_interpolated_mapping_impl.hpp" + +#endif //LINEARLY_INTERPOLATED_MAPPING_HPP diff --git a/ddsketch/include/linearly_interpolated_mapping_impl.hpp b/ddsketch/include/linearly_interpolated_mapping_impl.hpp new file mode 100644 index 00000000..5d6e4c0b --- /dev/null +++ b/ddsketch/include/linearly_interpolated_mapping_impl.hpp @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+*/ + +#ifndef LINEARLY_INTERPOLATED_MAPPING_IMPL_HPP +#define LINEARLY_INTERPOLATED_MAPPING_IMPL_HPP + +namespace datasketches { + +inline LinearlyInterpolatedMapping::LinearlyInterpolatedMapping(const double& relative_accuracy): + LogLikeIndexMapping(compute_gamma(require_valid_relative_accuracy(relative_accuracy), CORRECTING_FACTOR()), index_offset_shift(relative_accuracy)) {} + +inline LinearlyInterpolatedMapping::LinearlyInterpolatedMapping(const double& gamma, const double& index_offset): + LogLikeIndexMapping(gamma, index_offset) {} + +inline double LinearlyInterpolatedMapping::log(const double& value) const { + int exponent = 0; + const double mantissa = std::frexp(value, &exponent); + const double significand = 2 * mantissa - 1; + return significand + (exponent - 1); + +} + +inline double LinearlyInterpolatedMapping::log_inverse(const double& index) const { + int exponent = static_cast(std::floor(index)) + 1; + double mantissa = (index - exponent + 2) / 2.0; + return std::ldexp(mantissa, exponent); +} + +inline IndexMappingLayout LinearlyInterpolatedMapping::layout() const { + return IndexMappingLayout::LOG_LINEAR; +} + +inline double LinearlyInterpolatedMapping::index_offset_shift(const double& relative_accuracy) { + return 1 / (std::log1p(2 * relative_accuracy / (1 - relative_accuracy))); +} +} + +#endif //LINEARLY_INTERPOLATED_MAPPING_IMPL_HPP diff --git a/ddsketch/include/log_like_index_mapping.hpp b/ddsketch/include/log_like_index_mapping.hpp new file mode 100644 index 00000000..fe17e26f --- /dev/null +++ b/ddsketch/include/log_like_index_mapping.hpp @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. +*/ +#ifndef LOG_LIKE_INDEX_MAPPING_HPP +#define LOG_LIKE_INDEX_MAPPING_HPP +#include "index_mapping.hpp" +#include +#include +#include + +namespace datasketches { + +/** + * @class LogLikeIndexMapping + * A base class for mappings that are derived from a function that approximates the logarithm. + * + *

That function is scaled depending on the targeted relative accuracy, the base of the logarithm + * that log approximates and how well it geometrically pulls apart values from one another, + * that is to say, the infimum of |(l∘exp)(x)-(l∘exp)(y)|/|x-y| where x ≠ y and l = log + */ +template +class LogLikeIndexMapping : public IndexMapping { +public: + /** + * Constructor. + * + * @param gamma mapping parameter; must be greater than 1 (std::invalid_argument otherwise) + * @param index_offset constant added to the scaled logarithm before it is floored to an index + */ + LogLikeIndexMapping(const double& gamma, const double& index_offset); + + /** + * @brief Map a value to its integer bin index. + * + * @param value input value; must be finite and strictly positive (checked by assert only) + * @return index of the bin that contains @p value + */ + int index(const double& value) const; + + /** + * @brief Representative value for a bin @p index. + * @param index bin index + * @return representative value (inverse mapping) + */ + double value(int index) const; + + /** + * @brief Lower bound of values mapped to @p index. + * @param index bin index + * @return inclusive lower bound + */ + double lower_bound(int index) const; + + /** + * @brief Upper bound of values mapped to @p index. + * @param index bin index + * @return exclusive upper bound + */ + double upper_bound(int index) const; + + /** + * @brief Target relative accuracy (multiplicative error bound). + * @return relative accuracy in (0,1) + */ + double get_relative_accuracy() const; + + /** + * @brief Smallest trackable value. + * @return minimum indexable value + */ + double min_indexable_value() const; + + /** + * @brief Largest trackable value. + * @return maximum indexable value + */ + double max_indexable_value() const; + + /** + * @brief Serialize this mapping to a stream. 
+ * @param os output stream + */ + void serialize(std::ostream& os) const; + + int get_serialized_size_bytes() const { + return 16; +} + + template> + string to_string() const; + + bool operator==(const LogLikeIndexMapping& other) const; + bool operator!=(const LogLikeIndexMapping& other) const; + +private: + static double compute_relative_accuracy(const double gamma, const double correcting_factor); + static double require_valid_gamma(const double& gamma); + IndexMappingLayout layout() const; + +protected: + static double require_valid_relative_accuracy(const double& relative_accuracy); + static double compute_gamma(const double& relative_accuracy, const double& correcting_factor); + double log(const double& value) const; + double log_inverse(const double& value) const; + + const double gamma; + const double index_offset; + + const double relative_accuracy; + const double multiplier; + + /** + * @brief Downcast to {@link Derived}. + * @return reference to derived + */ + Derived& derived(); + + /** + * @brief Const downcast to {@link Derived}. + * @return const reference to derived + */ + const Derived& derived() const; +}; +} + +#include "log_like_index_mapping_impl.hpp" + +#endif //LOG_LIKE_INDEX_MAPPING_HPP diff --git a/ddsketch/include/log_like_index_mapping_impl.hpp b/ddsketch/include/log_like_index_mapping_impl.hpp new file mode 100644 index 00000000..682681c6 --- /dev/null +++ b/ddsketch/include/log_like_index_mapping_impl.hpp @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. +*/ + +#ifndef LOG_LIKE_INDEX_MAPPING_IMPL_HPP +#define LOG_LIKE_INDEX_MAPPING_IMPL_HPP +#include "log_like_index_mapping.hpp" +#include +#include "common_defs.hpp" + +namespace datasketches { + +template +LogLikeIndexMapping::LogLikeIndexMapping(const double& gamma, const double& index_offset): + gamma(require_valid_gamma(gamma)), + index_offset(index_offset), + relative_accuracy(compute_relative_accuracy(gamma, Derived::CORRECTING_FACTOR())), + multiplier(std::log(Derived::BASE()) / std::log1p(gamma - 1)) {} + +template +double LogLikeIndexMapping::compute_relative_accuracy(const double gamma, const double correcting_factor) { + const double exact_log_gamma = std::pow(gamma, correcting_factor); + return (exact_log_gamma - 1) / (exact_log_gamma + 1); +} + +template +double LogLikeIndexMapping::compute_gamma(const double& relative_accuracy, const double& correcting_factor) { + const double exact_log_gamma = (1.0 + relative_accuracy) / (1.0 - relative_accuracy); + return std::pow(exact_log_gamma, 1.0 / correcting_factor); +} + +template +double LogLikeIndexMapping::require_valid_relative_accuracy(const double& relative_accuracy) { + if (relative_accuracy <= 0 || relative_accuracy >= 1) { + throw std::invalid_argument("relative_accuracy must be between 0 and 1"); + } + return relative_accuracy; +} + +template +double LogLikeIndexMapping::require_valid_gamma(const double& gamma) { + if (gamma <= 1) { + throw std::invalid_argument("gamma must be greater than 1"); + } + return gamma; +} + +template +int LogLikeIndexMapping::index(const double& value) 
const{ + assert(std::isfinite(value) && value > 0.0); + const double index = derived().log(value) * multiplier + index_offset; + return static_cast(std::floor(index)); +} + +template +double LogLikeIndexMapping::value(int index) const { + return lower_bound(index) * (1 + relative_accuracy); +} + +template +double LogLikeIndexMapping::lower_bound(int index) const { + return derived().log_inverse((index - index_offset) / multiplier); +} + +template +double LogLikeIndexMapping::upper_bound(int index) const { + return lower_bound(index + 1); +} + +template +double LogLikeIndexMapping::get_relative_accuracy() const { + return relative_accuracy; +} + +template +double LogLikeIndexMapping::min_indexable_value() const { + const double& a = std::pow(Derived::BASE(), (static_cast(std::numeric_limits::min()) - index_offset) / multiplier + 1); + const double& b = std::numeric_limits::min() * (1 + relative_accuracy) / (1 - relative_accuracy); + return std::max(a, b); +} + +template +double LogLikeIndexMapping::max_indexable_value() const { + const double& a = std::pow(Derived::BASE(), (static_cast(std::numeric_limits::max()) - index_offset) / multiplier - 1); + const double& b = std::numeric_limits::max() / (1 + relative_accuracy); + return std::min(a, b); +} + +template +void LogLikeIndexMapping::serialize(std::ostream& os) const { + write(os, gamma); + write(os, index_offset); +} + +template +bool LogLikeIndexMapping::operator==(const LogLikeIndexMapping &other) const { + return gamma == other.gamma && index_offset == other.index_offset; +} + +template +bool LogLikeIndexMapping::operator!=(const LogLikeIndexMapping &other) const { + return !(*this == other); +} + + +template +Derived& LogLikeIndexMapping::derived() { + return static_cast(*this); +} + +template +const Derived& LogLikeIndexMapping::derived() const { + return static_cast(*this); +} + +template +template +string LogLikeIndexMapping::to_string() const { + std::ostringstream os; + os << " gamma : " << gamma << 
std::endl; + os << " index offset : " << index_offset << std::endl; + os << " relative accuracy : " << relative_accuracy << std::endl; + return os.str(); +} + +} + +#endif //LOG_LIKE_INDEX_MAPPING_IMPL_HPP diff --git a/ddsketch/include/logarithmic_mapping.hpp b/ddsketch/include/logarithmic_mapping.hpp new file mode 100644 index 00000000..cc25c60f --- /dev/null +++ b/ddsketch/include/logarithmic_mapping.hpp @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. +*/ + +#ifndef LOGARITHMIC_MAPPING_HPP +#define LOGARITHMIC_MAPPING_HPP + +#include "log_like_index_mapping.hpp" + +namespace datasketches { +/** + * @class LogarithmicMapping + * + * An {@link IndexMapping} that is memory-optimal, that is to say that given a targeted + * relative accuracy, it requires the least number of indices to cover a given range of values. This + * is done by logarithmically mapping floating-point values to integers. + */ +class LogarithmicMapping : public LogLikeIndexMapping { +public: + + /** + * Constructs a mapping that targets the given relative accuracy. + * + * @param relative_accuracy target relative accuracy; must be strictly between 0 and 1 + */ + explicit LogarithmicMapping(const double& relative_accuracy); + + /** + * Overloaded constructor. 
+ * This is meant to be used when deserializing only + * + * @param gamma + * @param index_offset + */ + LogarithmicMapping(const double& gamma, const double& index_offset); + + double log(const double& value) const; + + double log_inverse(const double& index) const; + + IndexMappingLayout layout() const; + + static constexpr double BASE() { return 2.71828182845904523536; } + static constexpr double CORRECTING_FACTOR() { return 1.0; } +}; +} + +#include "logarithmic_mapping_impl.hpp" + +#endif //LOGARITHMIC_MAPPING_HPP diff --git a/ddsketch/include/logarithmic_mapping_impl.hpp b/ddsketch/include/logarithmic_mapping_impl.hpp new file mode 100644 index 00000000..6dad6b23 --- /dev/null +++ b/ddsketch/include/logarithmic_mapping_impl.hpp @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+*/ + +#ifndef LOGARITHMIC_MAPPING_IMPL_HPP +#define LOGARITHMIC_MAPPING_IMPL_HPP +#include "logarithmic_mapping.hpp" + +namespace datasketches { + +inline LogarithmicMapping::LogarithmicMapping(const double& relative_accuracy) : + LogLikeIndexMapping(compute_gamma(require_valid_relative_accuracy(relative_accuracy), 1.0), 0.0) {} + +inline LogarithmicMapping::LogarithmicMapping(const double& gamma, const double& index_offset) : + LogLikeIndexMapping(gamma, index_offset) {} + +inline double LogarithmicMapping::log(const double& value) const { + return std::log(value); +} + +inline double LogarithmicMapping::log_inverse(const double &index) const { + return std::exp(index); +} + +inline IndexMappingLayout LogarithmicMapping::layout() const { + return IndexMappingLayout::LOG; +} +} + + +#endif //LOGARITHMIC_MAPPING_IMPL_HPP diff --git a/ddsketch/include/quadratically_interpolated_mapping.hpp b/ddsketch/include/quadratically_interpolated_mapping.hpp new file mode 100644 index 00000000..62217550 --- /dev/null +++ b/ddsketch/include/quadratically_interpolated_mapping.hpp @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+*/ + +#ifndef QUADRATICALLY_INTERPOLATED_MAPPING_HPP +#define QUADRATICALLY_INTERPOLATED_MAPPING_HPP +#include "log_like_index_mapping.hpp" + +namespace datasketches { +/** + * @class QuadraticallyInterpolatedMapping + * A fast {@link IndexMapping} that approximates the memory-optimal one (namely {@link + * LogarithmicMapping}) by extracting the floor value of the logarithm to the base 2 from the binary + * representations of floating-point values and quadratically interpolating the logarithm + * in-between. + */ +class QuadraticallyInterpolatedMapping : public LogLikeIndexMapping { +public: + + /** + * Constructs a mapping that targets the given relative accuracy. + * + * @param relative_accuracy target relative accuracy; must be strictly between 0 and 1 + */ + explicit QuadraticallyInterpolatedMapping(const double& relative_accuracy); + + /** + * Overloaded constructor. + * This is meant to be used when deserializing only + * + * @param gamma mapping parameter; must be greater than 1 + * @param index_offset constant added to the scaled logarithm when computing an index + */ + QuadraticallyInterpolatedMapping(const double& gamma, const double& index_offset); + + double log(const double& value) const; + + double log_inverse(const double& index) const; + + IndexMappingLayout layout() const; + + static constexpr double BASE() { return 2.0; } + static constexpr double CORRECTING_FACTOR() { return 3.0 / (4.0 * 0.69314718055994530941); } + +private: + static constexpr double ONE_THIRD() { return 1.0 / 3.0; } +}; +} +#include "quadratically_interpolated_mapping_impl.hpp" +#endif //QUADRATICALLY_INTERPOLATED_MAPPING_HPP diff --git a/ddsketch/include/quadratically_interpolated_mapping_impl.hpp b/ddsketch/include/quadratically_interpolated_mapping_impl.hpp new file mode 100644 index 00000000..809fa98e --- /dev/null +++ b/ddsketch/include/quadratically_interpolated_mapping_impl.hpp @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. +*/ + +#ifndef QUADRATICALLY_INTERPOLATED_MAPPING_IMPL_HPP +#define QUADRATICALLY_INTERPOLATED_MAPPING_IMPL_HPP + +#include "quadratically_interpolated_mapping.hpp" + +namespace datasketches { + +inline QuadraticallyInterpolatedMapping::QuadraticallyInterpolatedMapping(const double& relative_accuracy) : + LogLikeIndexMapping(compute_gamma(require_valid_relative_accuracy(relative_accuracy), CORRECTING_FACTOR()), 0.0) {} + +inline QuadraticallyInterpolatedMapping::QuadraticallyInterpolatedMapping(const double& gamma, const double& index_offset) : + LogLikeIndexMapping(gamma, index_offset) {} + + +inline double QuadraticallyInterpolatedMapping::log(const double& value) const { + // int64_t value_bits; + // std::memcpy(&value_bits, &value, sizeof(value)); + // + // const int64_t mantissa_plus_one_bits = (value_bits & 0x000FFFFFFFFFFFFFL) | 0x3FF0000000000000L; + // double mantissa_plus_one; + // std::memcpy(&mantissa_plus_one, &mantissa_plus_one_bits, sizeof(mantissa_plus_one_bits)); + // + // const double exponent = static_cast(((value_bits & 0x7FF0000000000000L) >> 52) - 1023); + + int exponent = 0; + const double mantissa = 2 * std::frexp(value, &exponent); + + return exponent - 1 - (mantissa - 5.0) * (mantissa- 1) * ONE_THIRD(); +} + +inline double QuadraticallyInterpolatedMapping::log_inverse(const double& index) const { + const int exponent = static_cast(std::floor(index)); + const 
double mantissa_plus_one = 3.0 - std::sqrt(4.0 - 3.0 * (index - exponent)); + + return std::ldexp(mantissa_plus_one, exponent); +} + +inline IndexMappingLayout QuadraticallyInterpolatedMapping::layout() const { + return IndexMappingLayout::LOG_QUADRATIC; +} +} + +#endif //QUADRATICALLY_INTERPOLATED_MAPPING_IMPL_HPP diff --git a/ddsketch/include/sparse_store.hpp b/ddsketch/include/sparse_store.hpp new file mode 100644 index 00000000..729bcc32 --- /dev/null +++ b/ddsketch/include/sparse_store.hpp @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. +*/ + +#ifndef SPARSE_STORE_HPP +#define SPARSE_STORE_HPP + +#include +#include + +#include "bin.hpp" + +/** + * @class SparseStore + * @brief Sparse integer-indexed bins container backed by a std::map. + * + * @tparam Allocator Allocator type for internal storage. + */ +namespace datasketches { +// Forward declaration +template class DenseStore; + +template +class SparseStore { +public: + + /** + * @brief Bin storage type (contiguous counts). 
+ */ + using bins_type = std::map< + int, + double, + std::less, + typename std::allocator_traits::template rebind_alloc> + >; + + // Forward declarations + /** + * @brief Forward iterator over non-empty bins (ascending index) + */ + class iterator; + + /** + * @brief Reverse iterator over non-empty bins (descending index) + */ + class reverse_iterator; + + + /** + * Default constructor + */ + SparseStore() = default; + + bool operator==(const SparseStore &other) const; + + /** + * @brief Increment bin @p index by 1. + */ + void add(int index); + + /** + * @brief Increment bin @p index by @p count. + */ + void add(int index, double count); + + /** + * @brief Increment index by count as specified by @p bin. + */ + void add(const Bin& bin); + + /** + * @brief Create a heap-allocated copy of this store. + * @return Pointer to a new SparseStore with identical contents. + */ + SparseStore* copy() const; + + /** + * @brief Clear all contents of the store. + */ + void clear(); + + /** + * @brief Lowest non-empty bin index. + */ + int get_min_index() const; + + /** + * @brief Highest non-empty bin index. + */ + int get_max_index() const; + + /** + * @brief Merge another sparse store (same allocator) into this one. + * @param other store; its counts are added here. + */ + void merge(const SparseStore& other); + + /** + * @brief Merge a dense store (same allocator) into this one. + * @tparam Derived type of the other dense store. + * @param other store; its counts are added here. + */ + template + void merge(const DenseStore& other); + + + bool is_empty() const; + + /** + * @brief Total count across all bins. + */ + double get_total_count() const; + + /** + * This method serializes the store into a given stream in a binary form + * @param os output stream + */ + void serialize(std::ostream& os) const; + + /** + * @brief Deserialize a new store from a stream. + * @param is Input stream. 
+ */ + static SparseStore deserialize(std::istream& is); + + + /** + * Computes size needed to serialize the current state of the sketch. + * @return size in bytes needed to serialize this sketch + */ + int get_serialized_size_bytes() const; + + string to_string() const; + /** + * @brief Begin iterator over bins (ascending). + */ + iterator begin() const; + + /** + * @brief End iterator over bins (ascending). + */ + iterator end() const; + + /** + * @brief Begin reverse iterator over bins (descending). + */ + reverse_iterator rbegin() const; + + /** + * @brief End reverse iterator over bins (descending). + */ + reverse_iterator rend() const; + + // ---------------- Iterators ---------------- + + /** + * @class SparseStore::iterator + * @brief Input iterator yielding Bin values in ascending index order. + * + * Stable only while the store is not mutated. + */ + class iterator { + public: + using internal_iterator = typename bins_type::const_iterator; + using iterator_category = std::input_iterator_tag; + using value_type = Bin; + using difference_type = std::ptrdiff_t; + using pointer = Bin*; + using reference = Bin; + + /** + * @brief Construct positioned iterator (internal use). + */ + explicit iterator(internal_iterator it); + + /** + * @brief Pre-increment. + */ + iterator& operator++(); + + /** + * @brief Post-increment. + */ + iterator operator++(int); + + /** + * @brief Assign from another iterator. + */ + iterator& operator=(const iterator& other); + + /** + * @brief Inequality comparison. + */ + bool operator!=(const iterator& other) const; + + /** + * @brief Dereference to the current Bin (index, count). + */ + reference operator*() const; + + private: + internal_iterator it; + }; + /** + * @class SparseStore::reverse_iterator + * @brief Input iterator yielding Bin values in descending index order. + * + * Stable only while the store is not mutated. 
+ */ + class reverse_iterator { + public: + using internal_iterator = typename bins_type::const_reverse_iterator; + using iterator_category = std::input_iterator_tag; + using value_type = Bin; + using difference_type = std::ptrdiff_t; + using pointer = Bin*; + using reference = Bin; + + /** + * @brief Construct positioned reverse iterator (internal use). + */ + explicit reverse_iterator(internal_iterator it); + + /** + * @brief Pre-increment. + */ + reverse_iterator& operator++(); + + /** + * @brief Post-increment. + */ + reverse_iterator operator++(int); + + /** + * @brief Assign from another reverse iterator. + */ + reverse_iterator& operator=(const reverse_iterator& other); + + /** + * @brief Inequality comparison. + */ + bool operator!=(const reverse_iterator& other) const; + + /** + * @brief Dereference to the current Bin (index, count). + */ + reference operator*() const; + + private: + internal_iterator it; + }; + + +private: + bins_type bins; +}; +} + +#include "sparse_store_impl.hpp" + +#endif //SPARSE_STORE_HPP \ No newline at end of file diff --git a/ddsketch/include/sparse_store_impl.hpp b/ddsketch/include/sparse_store_impl.hpp new file mode 100644 index 00000000..3870b4d9 --- /dev/null +++ b/ddsketch/include/sparse_store_impl.hpp @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. +*/ + +#ifndef SPARSE_STORE_IMPL_HPP +#define SPARSE_STORE_IMPL_HPP + +#include "sparse_store.hpp" + +namespace datasketches { + +template +bool SparseStore::operator==(const SparseStore &other) const { + return bins == other.bins; +} + +template +void SparseStore::add(int index) { + add(index, 1); +} + +template +void SparseStore::add(int index, double count) { + if (count == 0) { + return; + } + bins[index] += count; +} + +template +void SparseStore::add(const Bin &bin) { + if (bin.get_count() == 0) { + return; + } + add(bin.get_index(), bin.get_count()); +} + +template +SparseStore* SparseStore::copy() const { + using SparseStoreAlloc = typename std::allocator_traits::template rebind_alloc>; + SparseStoreAlloc alloc(this->bins.get_allocator()); + return new (alloc.allocate(1)) SparseStore(*this); +} + +template +void SparseStore::clear() { + bins.clear(); +} + +template +int SparseStore::get_min_index() const { + if (bins.empty()) { + throw std::runtime_error("operation is undefined for an empty sparse store"); + } + return bins.begin()->first; +} + +template +int SparseStore::get_max_index() const { + if (bins.empty()) { + throw std::runtime_error("operation is undefined for an empty sparse store"); + } + return bins.rbegin()->first; +} + +template +void SparseStore::merge(const SparseStore& other) { + for (const Bin &bin : other) { + add(bin); + } +} + +template +template +void SparseStore::merge(const DenseStore &other) { + for (const Bin& bin : other) { + add(bin); + } +} + +template +bool SparseStore::is_empty() const { + return bins.empty(); +} + +template +typename SparseStore::iterator SparseStore::begin() const { + return iterator(bins.begin()); +} + +template +typename SparseStore::iterator SparseStore::end() const { + return iterator(bins.end()); +} + +template +SparseStore::iterator::iterator(internal_iterator it): it(it) {} + +template 
+typename SparseStore::iterator& SparseStore::iterator::operator++() { + ++it; + return *this; +} + +template +typename SparseStore::iterator SparseStore::iterator::operator++(int) { + iterator temp = *this; + ++(*this); + return temp; +} + +template +typename SparseStore::iterator& SparseStore::iterator::operator=(const iterator& other) { + if (this != &other) { + this->it = other.it; + } + return *this; +} + +template +bool SparseStore::iterator::operator!=(const iterator& other) const { + return it != other.it; +} + +template +typename SparseStore::iterator::reference SparseStore::iterator::operator*() const { + return Bin(it->first, it->second); +} + +//----------------- + +template +typename SparseStore::reverse_iterator SparseStore::rbegin() const { + return reverse_iterator(bins.rbegin()); +} + +template +typename SparseStore::reverse_iterator SparseStore::rend() const { + return reverse_iterator(bins.rend()); +} + +template +SparseStore::reverse_iterator::reverse_iterator(internal_iterator it): it(it) {} + +template +typename SparseStore::reverse_iterator& SparseStore::reverse_iterator::operator++() { + ++it; + return *this; +} + +template +typename SparseStore::reverse_iterator SparseStore::reverse_iterator::operator++(int) { + reverse_iterator temp = *this; /* snapshot must be a reverse_iterator; plain `iterator` names the forward type */ + ++(*this); + return temp; +} + +template +typename SparseStore::reverse_iterator& SparseStore::reverse_iterator::operator=(const reverse_iterator& other) { + if (this != &other) { + this->it = other.it; + } + return *this; +} + +template +bool SparseStore::reverse_iterator::operator!=(const reverse_iterator& other) const { + return it != other.it; +} + +template +typename SparseStore::reverse_iterator::reference SparseStore::reverse_iterator::operator*() const { + return Bin(it->first, it->second); +} + +template +double SparseStore::get_total_count() const { + double total_count = 0; + for (typename bins_type::const_iterator it = bins.begin(); it != bins.end(); ++it) { + total_count += it->second; + } + 
return total_count; +} + +template +void SparseStore::serialize(std::ostream &os) const { + write(os, bins.size()); + for (const auto& [index, count] : bins) { + write(os, index); + write(os, count); + } +} + +template +SparseStore SparseStore::deserialize(std::istream& is) { + SparseStore store; + const auto num_bins = read(is); + for (typename bins_type::size_type i = 0; i < num_bins; ++i) { + const auto index = read(is); + const auto count = read(is); + store.bins[index] = count; + } + + return store; +} + +template +int SparseStore::get_serialized_size_bytes() const { + int size_bytes = 0; + size_bytes += sizeof(typename SparseStore::bins_type::size_type); + size_bytes += bins.size() * sizeof(typename SparseStore::bins_type::key_type); + size_bytes += bins.size() * sizeof(typename SparseStore::bins_type::mapped_type); + + return size_bytes; +} + +template +string SparseStore::to_string() const { + std::ostringstream os; + os << " Type : sparse store" << std::endl; + os << " Bins number : " << bins.size() << std::endl; + return os.str(); +} + + +} + +#endif //SPARSE_STORE_IMPL_HPP \ No newline at end of file diff --git a/ddsketch/include/store_factory.hpp b/ddsketch/include/store_factory.hpp new file mode 100644 index 00000000..0df12de0 --- /dev/null +++ b/ddsketch/include/store_factory.hpp @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef STORE_FACTORY_HPP +#define STORE_FACTORY_HPP +#include + + +namespace datasketches { +template +class store_factory { +public: + static std::unique_ptr new_store() + { + return std::unique_ptr(new StoreType()); + } +}; +} + +#endif //STORE_FACTORY_HPP diff --git a/ddsketch/include/unbounded_size_dense_store.hpp b/ddsketch/include/unbounded_size_dense_store.hpp new file mode 100644 index 00000000..699c0e1e --- /dev/null +++ b/ddsketch/include/unbounded_size_dense_store.hpp @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. +*/ + +#ifndef UNBOUNDED_SIZE_DENSE_STORE_HPP +#define UNBOUNDED_SIZE_DENSE_STORE_HPP +#include "dense_store.hpp" + +namespace datasketches { + +/** + * @class UnboundedSizeDenseStore + * @brief Common logic for non-bounded-capacity dense stores. 
+ */ +template +class UnboundedSizeDenseStore: public DenseStore, Allocator> { +public: + using size_type = typename DenseStore::size_type; + + // Constructors + UnboundedSizeDenseStore(); + explicit UnboundedSizeDenseStore(const int& array_length_growth_increment); + explicit UnboundedSizeDenseStore(const int& array_length_growth_increment, const int& array_length_overhead); + + /** + * @brief Create a heap-allocated copy of this store. + * @return Pointer to a new UnboundedSizeDenseStore with identical contents. + */ + UnboundedSizeDenseStore* copy() const; + + ~UnboundedSizeDenseStore() = default; + + /** + * Copy assignment + * @param other store to be copied + * @return reference to this store + */ + UnboundedSizeDenseStore& operator=(const UnboundedSizeDenseStore& other); + + /** + * @brief Merge another store into this one. + * @param other Source store; its counts are added into this store. + * @note Extends this store's index range when @p other reaches outside of it. + */ + void merge(const UnboundedSizeDenseStore& other); + + /** + * @brief Bring base-class merge overloads into scope (e.g., generic Store/DenseStore merges). + */ + using DenseStore::merge; + + /** + * This method serializes the store into a given stream in a binary form + * @param os output stream + */ + void serialize(std::ostream& os) const; + + /** + * @brief Deserialize a new store from a stream. + * @param is Input stream. + */ + static UnboundedSizeDenseStore deserialize(std::istream& is); + + /** + * Computes size needed to serialize the current state of the store. + * @return size in bytes needed to serialize this store + */ + int get_serialized_size_bytes() const; + +protected: + + /** + * @brief Map a bin index to its position in the bins array, extending the range first if the index lies outside it. + */ + size_type normalize(size_type index); + + /** + * @brief Reframe the active index window to [new_min_index, new_max_index]. 
+ */ + void adjust(size_type new_min_index, size_type new_max_index); + + friend class DenseStore; +}; +} + +#include "unbounded_size_dense_store_impl.hpp" + +#endif //UNBOUNDED_SIZE_DENSE_STORE_HPP diff --git a/ddsketch/include/unbounded_size_dense_store_impl.hpp b/ddsketch/include/unbounded_size_dense_store_impl.hpp new file mode 100644 index 00000000..50f68bf4 --- /dev/null +++ b/ddsketch/include/unbounded_size_dense_store_impl.hpp @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+*/ + +#ifndef UNBOUNDED_SIZE_DENSE_STORE_IMPL_HPP +#define UNBOUNDED_SIZE_DENSE_STORE_IMPL_HPP + +#include "unbounded_size_dense_store.hpp" + +namespace datasketches { +template +UnboundedSizeDenseStore::UnboundedSizeDenseStore(): DenseStore, Allocator>() {} + +template +UnboundedSizeDenseStore::UnboundedSizeDenseStore(const int &array_length_growth_increment): DenseStore, Allocator>(array_length_growth_increment) {} + +template +UnboundedSizeDenseStore::UnboundedSizeDenseStore(const int &array_length_growth_increment, const int &array_length_overhead): DenseStore, Allocator>(array_length_growth_increment, array_length_overhead) {} + +template +typename UnboundedSizeDenseStore::size_type UnboundedSizeDenseStore::normalize(size_type index) { + if (index < this->min_index || index > this->max_index) { + this->extend_range(index); + } + + return index - this->offset; +} + +template +UnboundedSizeDenseStore *UnboundedSizeDenseStore::copy() const { + using StoreAlloc = typename std::allocator_traits::template rebind_alloc>; + StoreAlloc alloc(this->bins.get_allocator()); + return new (alloc.allocate(1)) UnboundedSizeDenseStore(*this); +} + +template +UnboundedSizeDenseStore &UnboundedSizeDenseStore::operator=(const UnboundedSizeDenseStore &other) { + + this->bins = other.bins; + this->offset = other.offset; + this->min_index = other.min_index; + this->max_index = other.max_index; + + + return *this; +} + + + +template +void UnboundedSizeDenseStore::adjust(size_type new_min_index, size_type new_max_index) { + this->center_bins(new_min_index, new_max_index); +} + +template +void UnboundedSizeDenseStore::merge(const UnboundedSizeDenseStore &other) { + if (other.is_empty()) { + return; + } + + if (other.get_min_index() < this->min_index || other.get_max_index() > this->max_index) { + this->extend_range(other.get_min_index(), other.get_max_index()); + } + + for (int index = other.get_min_index(); index <= other.get_max_index(); ++index) { + this->bins[index - this->offset] 
+= other.bins[index - other.offset]; + } +} + +template +void UnboundedSizeDenseStore::serialize(std::ostream &os) const { + this->serialize_common(os); +} + +template +UnboundedSizeDenseStore UnboundedSizeDenseStore::deserialize(std::istream &is) { + UnboundedSizeDenseStore store; + UnboundedSizeDenseStore::deserialize_common(store, is); + + return store; +} + +template +int UnboundedSizeDenseStore::get_serialized_size_bytes() const { + return this->get_serialized_size_bytes_common(); +} + + + + + +} +#endif //UNBOUNDED_SIZE_DENSE_STORE_IMPL_HPP diff --git a/ddsketch/test/BinTest.cpp b/ddsketch/test/BinTest.cpp new file mode 100644 index 00000000..77ca758f --- /dev/null +++ b/ddsketch/test/BinTest.cpp @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+*/ + +#include +#include "bin.hpp" +#include +#include +namespace datasketches { + +void TestBinInitialization(const int index, const uint64_t count) { + Bin bin(index, count); + REQUIRE(bin.get_count() == count); + REQUIRE(bin.get_index() == index); +} + +TEST_CASE("bintest", "[bintest]") { + TestBinInitialization(0, 1); + TestBinInitialization(3, 1); + TestBinInitialization(INT_MAX >> 1, 1); + TestBinInitialization(INT_MAX, 1); + TestBinInitialization(-3, 1); + TestBinInitialization(INT_MIN >> 1, 1); + TestBinInitialization(INT_MIN, 1); +} +} /* namespace datasketches */ \ No newline at end of file diff --git a/ddsketch/test/CMakeLists.txt b/ddsketch/test/CMakeLists.txt new file mode 100644 index 00000000..8df82ad2 --- /dev/null +++ b/ddsketch/test/CMakeLists.txt @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +add_executable(ddsketch_test) + +target_link_libraries(ddsketch_test ddsketch common_test_lib) + +set_target_properties(ddsketch_test PROPERTIES + CXX_STANDARD_REQUIRED YES +) + +file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" DDSKETCH_TEST_BINARY_PATH) +string(APPEND DDSKETCH_TEST_BINARY_PATH "/") +target_compile_definitions(ddsketch_test + PRIVATE + TEST_BINARY_INPUT_PATH="${DDSKETCH_TEST_BINARY_PATH}" +) + +add_test( + NAME ddsketch_test + COMMAND ddsketch_test +) + +target_sources(ddsketch_test + PRIVATE + BinTest.cpp + DDSketchTest.cpp + StoreTest.cpp + IndexMappingTest.cpp +) + +if (SERDE_COMPAT) + target_sources(ddsketch_test + PRIVATE + ddsketch_sketch_deserialize_from_java_test.cpp + ) +endif() + +if (GENERATE) + target_sources(ddsketch_test + PRIVATE + ddsketch_sketch_serialize_for_java.cpp + ) +endif() diff --git a/ddsketch/test/DDSketchTest.cpp b/ddsketch/test/DDSketchTest.cpp new file mode 100644 index 00000000..45fc175c --- /dev/null +++ b/ddsketch/test/DDSketchTest.cpp @@ -0,0 +1,588 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ddsketch.hpp" +#include "logarithmic_mapping.hpp" +#include "linearly_interpolated_mapping.hpp" +#include "unbounded_size_dense_store.hpp" +#include "collapsing_highest_dense_store.hpp" +#include "collapsing_lowest_dense_store.hpp" +#include "sparse_store.hpp" +#include "../../tdigest/include/tdigest.hpp" + +namespace datasketches { + +using A = std::allocator; +constexpr double EPSILON = 1e-10; + +void assert_accurate(double min_expected, double max_expected, double actual, double relative_accuracy) { + const double relaxed_min_expected = min_expected > 0 ? min_expected * (1 - relative_accuracy) : min_expected * (1 + relative_accuracy); + const double relaxed_max_expected = max_expected > 0 ? max_expected * (1 + relative_accuracy) : max_expected * (1 - relative_accuracy); + bool failed = (actual < relaxed_min_expected - EPSILON) || (actual > relaxed_max_expected + EPSILON); + REQUIRE(!failed); +} + +// Test helper functions +void assert_quantile_accurate(const std::vector& sorted_values, double quantile, double actual_quantile_value, double relative_accuracy) { + const double lower_quantile_value = sorted_values[static_cast(std::floor(quantile * (sorted_values.size() - 1)))]; + const double upper_quantile_value = sorted_values[static_cast(std::ceil(quantile * (sorted_values.size() - 1)))]; + + assert_accurate(lower_quantile_value, upper_quantile_value, actual_quantile_value, relative_accuracy); +} + + +template +void assert_encodes(const SketchType& sketch, const std::vector& values, double relative_accuracy) { + REQUIRE(sketch.get_count() == Approx(values.size()).margin(EPSILON)); + + if (values.empty()) { + REQUIRE(sketch.is_empty()); + return; + } + + REQUIRE_FALSE(sketch.is_empty()); + + auto sorted_values = values; + std::sort(sorted_values.begin(), sorted_values.end()); + + const double min_value = sketch.get_min(); + const double max_value = sketch.get_max(); 
+ + assert_accurate(sorted_values[0], sorted_values[0], min_value, relative_accuracy); + assert_accurate(sorted_values.back(), sorted_values.back(), max_value, relative_accuracy); + + // Test quantiles + for (double quantile = 0.0; quantile <= 1.0; quantile += 0.01) { + const double value_at_quantile = sketch.get_quantile(quantile); + assert_quantile_accurate(sorted_values, quantile, value_at_quantile, relative_accuracy); + + REQUIRE(value_at_quantile >= min_value); + REQUIRE(value_at_quantile <= max_value); + } + + // Test sum accuracy (if values have same sign) + if (sorted_values[0] >= 0 || sorted_values.back() <= 0) { + const double expected_sum = std::accumulate(values.begin(), values.end(), 0.0); + assert_accurate(expected_sum, expected_sum, sketch.get_sum(), relative_accuracy); + } +} + +template +void test_adding(SketchType& sketch, const std::vector& values, double relative_accuracy) { + // Test individual additions + sketch.clear(); + for (const double& value : values) { + sketch.update(value); + } + assert_encodes(sketch, values, relative_accuracy); + + // Test weighted additions + sketch.clear(); + auto sketch_weighted(sketch); + std::map value_counts; + for (const double& value : values) { + value_counts[value]++; + } + + for (const auto& [value, count] : value_counts) { + sketch_weighted.update(value, count); + } + assert_encodes(sketch_weighted, values, relative_accuracy); +} + +template +void test_merging(SketchType& sketch, const std::vector>& value_arrays, double relative_accuracy) { + sketch.clear(); + for (const auto& values : value_arrays) { + SketchType intermediate_sketch(sketch); + intermediate_sketch.clear(); + for (const double& value : values) { + intermediate_sketch.update(value); + } + sketch.merge(intermediate_sketch); + } + + // Flatten all values + std::vector all_values; + for (const auto& values : value_arrays) { + all_values.insert(all_values.end(), values.begin(), values.end()); + } + + assert_encodes(sketch, all_values, 
relative_accuracy); +} + +using DDSketchUnboundedStoreTestCase = std::pair>, LogarithmicMapping>; + +template +using DDSketchCollapsingHighestStoreTestCase = std::pair>, LogarithmicMapping>; + +template +using DDSketchCollapsingLowestStoreTestCase = std::pair>, LogarithmicMapping>; + +TEMPLATE_TEST_CASE("DDSketch empty test", "[ddsketch]", + DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<128>, + DDSketchCollapsingLowestStoreTestCase<128> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + + constexpr double relative_accuracy = 0.01; + DDSketch sketch(relative_accuracy);; + + REQUIRE(sketch.is_empty()); + REQUIRE(sketch.get_count() == Approx(0.0).margin(EPSILON)); + REQUIRE_THROWS_AS(sketch.get_min(), std::runtime_error); + REQUIRE_THROWS_AS(sketch.get_max(), std::runtime_error); + REQUIRE_THROWS_AS(sketch.get_quantile(0.5), std::runtime_error); +} + +TEMPLATE_TEST_CASE("DDSketch exception test", "[ddsketch]", + DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<128>, + DDSketchCollapsingLowestStoreTestCase<128> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + constexpr double relative_accuracy = 0.01; + DDSketch sketch(relative_accuracy); + + // Test invalid quantile values + sketch.update(1.0); + REQUIRE_THROWS_AS(sketch.get_quantile(-0.1), std::invalid_argument); + REQUIRE_THROWS_AS(sketch.get_quantile(1.1), std::invalid_argument); + + // Test invalid count values + REQUIRE_THROWS_AS(sketch.update(1.0, -1.0), std::invalid_argument); +} + +TEMPLATE_TEST_CASE("DDSketch clear test", "[ddsketch]", + DDSketchUnboundedStoreTestCase, + 
DDSketchCollapsingHighestStoreTestCase<128>, + DDSketchCollapsingLowestStoreTestCase<128> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + constexpr double relative_accuracy = 0.01; + DDSketch sketch(relative_accuracy); + + sketch.update(1.0); + sketch.update(2.0); + sketch.clear(); + + REQUIRE(sketch.is_empty()); + REQUIRE(sketch.get_count() == Approx(0.0).margin(EPSILON)); +} + +TEMPLATE_TEST_CASE("DDSketch constant test", "[ddsketch]", + DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<128>, + DDSketchCollapsingLowestStoreTestCase<128> +){ + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + + for (double relative_accuracy = 1e-1; relative_accuracy >= 1e-3; relative_accuracy *= 1e-1) { + DDSketch sketch(relative_accuracy); + test_adding(sketch, {0.0}, relative_accuracy); + test_adding(sketch, {1.0}, relative_accuracy); + test_adding(sketch, {1.0, 1.0, 1.0}, relative_accuracy); + test_adding(sketch, {10.0, 10.0, 10.0}, relative_accuracy); + + std::vector large_constant(10000, 2.0); + test_adding(sketch, large_constant, relative_accuracy); + } +} + +TEMPLATE_TEST_CASE("DDSketch negative constants test", "[ddsketch]", + DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<128>, + DDSketchCollapsingLowestStoreTestCase<128> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + + for (double relative_accuracy = 1e-1; relative_accuracy >= 1e-3; relative_accuracy *= 1e-1) { + DDSketch sketch(relative_accuracy); + 
test_adding(sketch, {0.0}, relative_accuracy); + test_adding(sketch, {-1.0}, relative_accuracy); + test_adding(sketch, {-1.0, -1.0, -1.0}, relative_accuracy); + test_adding(sketch, {-10.0, -10.0, -10.0}, relative_accuracy); + + // Large negative constant array + std::vector large_negative(10000, -2.0); + test_adding(sketch, large_negative, relative_accuracy); + } +} + +TEMPLATE_TEST_CASE("DDSketch mixed positive negative test", "[ddsketch]", + DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<128>, + DDSketchCollapsingLowestStoreTestCase<128> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + + for (double relative_accuracy = 1e-1; relative_accuracy >= 1e-3; relative_accuracy *= 1e-1) { + DDSketch sketch(relative_accuracy); + test_adding(sketch, {0.0}, relative_accuracy); + test_adding(sketch, {-1.0, 1.0}, relative_accuracy); + test_adding(sketch, {-1.0, -1.0, -1.0, 1.0, 1.0, 1.0}, relative_accuracy); + test_adding(sketch, {-10.0, -10.0, -10.0, 10.0, 10.0, 10.0}, relative_accuracy); + + // Large negative constant array + std::vector mixed_large; + mixed_large.reserve(20000); + for (int i = 0; i < 20000; ++i) { + mixed_large.push_back(i % 2 == 0 ? 
2.0 : -2.0);
+    }
+    // Feed the alternating +2 / -2 data built above; previously it was built but
+    // never used and an all-negative vector was tested instead.
+    test_adding(sketch, mixed_large, relative_accuracy);
+  }
+}
+
+// Checks that zero values are handled, on their own and mixed with the values
+// 0..99, for relative accuracies 1e-1, 1e-2 and 1e-3.
+TEMPLATE_TEST_CASE("DDSketch with zeros test", "[ddsketch]",
+  DDSketchUnboundedStoreTestCase,
+  DDSketchCollapsingHighestStoreTestCase<4096>,
+  DDSketchCollapsingLowestStoreTestCase<4096>
+) {
+  // The stores are only instantiated to recover the concrete store type via decltype.
+  auto positive_store = *TestType::first_type::new_store();
+  auto negative_store = *TestType::first_type::new_store();
+  using StoreType = decltype(positive_store);
+  using MappingType = typename TestType::second_type;
+
+  for (double relative_accuracy = 1e-1; relative_accuracy >= 1e-3 ; relative_accuracy *= 1e-1) {
+    DDSketch sketch(relative_accuracy);
+
+    // All zeros
+    std::vector all_zeros(100, 0.0);
+    test_adding(sketch, all_zeros, relative_accuracy);
+
+    // Zeros at beginning: 10 zeros followed by 0..99
+    std::vector zeros_beginning(110, 0.0);
+    for (int i = 0; i < 100; ++i) {
+      zeros_beginning[10+i] = i;
+    }
+    test_adding(sketch, zeros_beginning, relative_accuracy);
+
+    // Zeros at end: 0..99 followed by 10 zeros. The vector is created with its
+    // final size, so only the first 100 slots need to be overwritten.
+    std::vector zeros_end(110, 0.0);
+    for (int i = 0; i < 100; ++i) {
+      zeros_end[i] = i;
+    }
+    test_adding(sketch, zeros_end, relative_accuracy);
+  }
+}
+
+TEMPLATE_TEST_CASE("DDSketch linear sequences test", "[ddsketch]",
+  DDSketchUnboundedStoreTestCase
+) {
+  auto positive_store = *TestType::first_type::new_store();
+  auto negative_store = *TestType::first_type::new_store();
+  using StoreType = decltype(positive_store);
+  using MappingType = typename TestType::second_type;
+
+  for (double relative_accuracy = 1e-1; relative_accuracy >= 1e-3 ; relative_accuracy *= 1e-1) {
+    DDSketch sketch(relative_accuracy);
+    // Increasing linearly
+    std::vector increasing;
+    increasing.reserve(10000);
+    for (int i = 0; i < 10000; ++i) {
+      increasing.push_back(i);
+    }
+    test_adding(sketch, increasing, relative_accuracy);
+
+    // Decreasing linearly
+    std::vector decreasing;
+    decreasing.reserve(10000);
+    for (int i = 0; i < 10000; ++i) { +
decreasing.push_back(10000 - i); + } + test_adding(sketch, decreasing, relative_accuracy); + + // Negative increasing + std::vector negative_increasing; + negative_increasing.reserve(10000); + for (int i = -10000; i < 0; ++i) { + negative_increasing.push_back(i); + } + test_adding(sketch, negative_increasing, relative_accuracy); + + // Mixed positive/negative increasing + std::vector mixed_increasing; + for (int i = -10000; i < 10000; ++i) { + mixed_increasing.push_back(i); + } + test_adding(sketch, mixed_increasing, relative_accuracy); + } +} + +TEMPLATE_TEST_CASE("DDSketch exponential sequence test", "[ddsketch]", + DDSketchUnboundedStoreTestCase +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + + for (double relative_accuracy = 1e-1; relative_accuracy >= 1e-3 ; relative_accuracy *= 1e-1) { + DDSketch sketch(relative_accuracy); + // Increasing exponentially + std::vector increasing_exp; + increasing_exp.reserve(100); + for (int i = 0; i < 100; ++i) { + increasing_exp.push_back(std::exp(i)); + } + test_adding(sketch, increasing_exp, relative_accuracy); + + // Decreasing exponentially + std::vector decreasing; + decreasing.reserve(100); + for (int i = 0; i < 100; ++i) { + decreasing.push_back(std::exp(- i)); + } + test_adding(sketch, decreasing, relative_accuracy); + + // Negative increasing + std::vector negative_increasing; + negative_increasing.reserve(100); + for (int i = -100; i < 0; ++i) { + negative_increasing.push_back(-std::exp(i)); + } + test_adding(sketch, negative_increasing, relative_accuracy); + } +} + +TEMPLATE_TEST_CASE("DDSketch merging test", "[ddsketch]", + DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<4096>, + DDSketchCollapsingLowestStoreTestCase<4096> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = 
*TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + + for (double relative_accuracy = 1e-1; relative_accuracy >= 1e-1 ; relative_accuracy *= 1e-1) { + DDSketch sketch(relative_accuracy); + // Test merging empty sketches + test_merging(sketch, {{}, {}}, relative_accuracy); + test_merging(sketch, {{}, {0.0}}, relative_accuracy); + test_merging(sketch, {{0.0}, {}}, relative_accuracy); + + // Test merging constants + test_merging(sketch, {{1.0, 1.0}, {1.0, 1.0, 1.0}}, relative_accuracy); + + // Test merging far apart values + test_merging(sketch, {{0.0}, {10000.0}}, relative_accuracy); + test_merging(sketch, {{10000.0}, {20000.0}}, relative_accuracy); + test_merging(sketch, {{20000.0}, {10000.0}}, relative_accuracy); + } +} + +TEMPLATE_TEST_CASE("DDSketch different mappings", "[ddsketch]", + DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<4096>, + DDSketchCollapsingLowestStoreTestCase<4096> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + constexpr double relative_accuracy = 0.01; + std::vector test_values = {0.0, 1.0, -1.0, 10.0, -10.0, 100.0, -100.0}; + DDSketch sketch(relative_accuracy); + test_adding(sketch, test_values, relative_accuracy); +} + +TEMPLATE_TEST_CASE("DDSketch add random test", "[ddsketch][random]", + DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<4096>, + DDSketchCollapsingLowestStoreTestCase<4096> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + constexpr double relative_accuracy = 0.01; + constexpr int num_tests = 100; + constexpr int max_num_values = 1000; + + 
DDSketch sketch(relative_accuracy); + std::random_device rd; + std::mt19937_64 rng(rd()); + std::uniform_int_distribution size_dist(0, max_num_values - 1); + std::uniform_real_distribution value_dist(-1000.0, 1000.0); + + for (int i = 0; i < num_tests; ++i) { + std::vector values; + int num_values = size_dist(rng); + values.reserve(num_values); + + for (int j = 0; j < num_values; ++j) { + values.push_back(value_dist(rng)); + } + + test_adding(sketch, values, relative_accuracy); + } +} + +TEMPLATE_TEST_CASE("DDSketch merging random test", "[ddsketch][random]", + DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<4096>, + DDSketchCollapsingLowestStoreTestCase<4096> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + constexpr double relative_accuracy = 0.01; + constexpr int num_tests = 100; + constexpr int max_num_sketches = 100; + constexpr int max_num_values_per_sketch = 1000; + + DDSketch sketch(relative_accuracy); + std::random_device rd; + std::mt19937_64 rng(rd()); + std::uniform_int_distribution sketch_size_dist(0, max_num_sketches - 1); + std::uniform_int_distribution values_size_dist(0, max_num_values_per_sketch - 1); + std::uniform_real_distribution value_dist(-1000.0, 1000.0); + + for (int i = 0; i < num_tests; ++i) { + std::vector> value_arrays; + int num_sketches = sketch_size_dist(rng); + value_arrays.reserve(num_sketches); + + for (int j = 0; j < num_sketches; ++j) { + std::vector values; + int num_values = values_size_dist(rng); + values.reserve(num_values); + + for (int k = 0; k < num_values; ++k) { + values.push_back(value_dist(rng)); + } + value_arrays.push_back(std::move(values)); + } + + test_merging(sketch, value_arrays, relative_accuracy); + } +} + +TEMPLATE_TEST_CASE("DDSketch serialize - deserialize test", "[ddsketch][random]", + 
DDSketchUnboundedStoreTestCase, + DDSketchCollapsingHighestStoreTestCase<4096>, + DDSketchCollapsingLowestStoreTestCase<4096> +) { + auto positive_store = *TestType::first_type::new_store(); + auto negative_store = *TestType::first_type::new_store(); + using StoreType = decltype(positive_store); + using MappingType = typename TestType::second_type; + constexpr double relative_accuracy = 0.01; + constexpr int num_tests = 100; + constexpr int max_num_values = 1000; + + DDSketch sketch(relative_accuracy); + std::random_device rd; + std::mt19937_64 rng(rd()); + std::uniform_int_distribution size_dist(0, max_num_values - 1); + std::uniform_real_distribution value_dist(-1000.0, 1000.0); + + std::stringstream ss; + sketch.serialize(ss); + DDSketch deserialized_empty_sketch = DDSketch::deserialize(ss); + REQUIRE(sketch.is_empty()); + REQUIRE(deserialized_empty_sketch.is_empty()); + REQUIRE(ss.peek() == std::istream::traits_type::eof()); + REQUIRE(sketch == deserialized_empty_sketch); + ss.clear(); + + for (int i = 0; i < num_tests; ++i) { + std::vector values; + int num_values = size_dist(rng); + + for (int j = 0; j < num_values; ++j) { + sketch.update(value_dist(rng)); + } + + sketch.serialize(ss); + auto deserialized_sketch = DDSketch::deserialize(ss); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(deserialized_sketch.is_empty()); + REQUIRE(ss.peek() == std::istream::traits_type::eof()); + REQUIRE(sketch == deserialized_sketch); + ss.clear(); + + } + + +} + +TEST_CASE("quantile", "[ddsketch]") { + DDSketch, LinearlyInterpolatedMapping> sk(0.01); + std::random_device rd{}; + std::mt19937_64 gen{rd()}; + std::normal_distribution d(0.0, 1.0); + + std::vector values; + + tdigest td(100); + DDSketch, LogarithmicMapping> ddsketch(0.01); + DDSketch, LogarithmicMapping> sparse_sk(0.01); + for (size_t i = 0; i < 10000000; ++i) { + double val = d(gen); + ddsketch.update(val); + sparse_sk.update(val); + td.update(val); + } + + + + std::cout << ddsketch.to_string(); + std::cout 
<< std::endl; + std::cout << sparse_sk.to_string(); + std::cout << std::endl; + std::cout << td.to_string(); + + std::cout << std::setprecision(20) << std::fixed; + for (double q = 0.0; q <= 1.00; q += 0.01) { + std::cout << std::setw(4) << q << " " << std::setw(15) << ddsketch.get_quantile(q) << " " << std::setw(15) << sparse_sk.get_quantile(q) << " " << std::setw(15) << td.get_quantile(q) << " " << std::endl; + } +} +} /* namespace datasketches */ \ No newline at end of file diff --git a/ddsketch/test/IndexMappingTest.cpp b/ddsketch/test/IndexMappingTest.cpp new file mode 100644 index 00000000..49be1903 --- /dev/null +++ b/ddsketch/test/IndexMappingTest.cpp @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+*/ + +#include +#include +#include "index_mapping_factory.hpp" +#include "linearly_interpolated_mapping.hpp" +#include "logarithmic_mapping.hpp" +#include "quadratically_interpolated_mapping.hpp" + +namespace datasketches { + +constexpr double min_tested_relative_accuracy = 1e-8; +constexpr double max_tested_relative_accuracy = 1 - 1e-3; +// For more precise testing +//constexpr double multiplier = 1 + std::numbers::sqrt2 * 1e-1; +// For faster testing +constexpr double multiplier = 1 + 1.4142135623730950 * 1e6; +constexpr double floating_point_acceptable_error = 1e-10; + +void assert_relative_accuracy(const double& expected, const double& actual, const double& relative_accuracy) { + REQUIRE(expected >= 0); + REQUIRE(actual >= 0); + if (expected == 0) { + REQUIRE(actual == Approx(0.).margin(1e-12)); + } else { + REQUIRE(std::abs(expected - actual) / expected <= relative_accuracy + 1e-12); + } +} + +template +void test_accuracy(const M& mapping, const double& relative_accuracy) { + REQUIRE( mapping.get_relative_accuracy() <= relative_accuracy + floating_point_acceptable_error); + for (double value = mapping.min_indexable_value(); value < mapping.max_indexable_value(); value *= multiplier) { + const double mapped_value = mapping.value(mapping.index(value)); + assert_relative_accuracy(value, mapped_value, relative_accuracy); + } + const double value = mapping.max_indexable_value(); + const double mapped_value = mapping.value(mapping.index(value)); + assert_relative_accuracy(value, mapped_value, relative_accuracy); + REQUIRE(relative_accuracy <= mapping.get_relative_accuracy() + 1e-10); +} + +TEMPLATE_TEST_CASE("test index mapping accuracy", "[indexmappingtest]", + LinearlyInterpolatedMapping, + LogarithmicMapping, + QuadraticallyInterpolatedMapping + ) { + for (double relative_accuracy = max_tested_relative_accuracy; relative_accuracy >= min_tested_relative_accuracy; relative_accuracy *= max_tested_relative_accuracy) { + auto mapping = 
index_mapping_factory::new_mapping(relative_accuracy); + test_accuracy(*mapping, relative_accuracy); + } +} + +TEMPLATE_TEST_CASE("test index mapping validity", "[indexmappingtest}", + LinearlyInterpolatedMapping, + LogarithmicMapping, + QuadraticallyInterpolatedMapping + ) { + constexpr double relative_accuracy = 1e-2; + constexpr int min_index = -50; + constexpr int max_index = 50; + + auto mapping = index_mapping_factory::new_mapping(relative_accuracy); + int index = min_index; + double bound = mapping->upper_bound(index - 1); + for (; index <= max_index; index++) { + REQUIRE(mapping->lower_bound(index) == Approx(bound).margin(floating_point_acceptable_error)); + REQUIRE(mapping->lower_bound(index) <= mapping->value(index)); + REQUIRE(mapping->upper_bound(index) >= mapping->value(index)); + REQUIRE(mapping->index(mapping->lower_bound(index) - floating_point_acceptable_error) <= index); + REQUIRE(mapping->index(mapping->lower_bound(index) + floating_point_acceptable_error) >= index); + REQUIRE(mapping->index(mapping->upper_bound(index) - floating_point_acceptable_error) <= index); + REQUIRE(mapping->index(mapping->upper_bound(index) + floating_point_acceptable_error) >= index); + bound = mapping->upper_bound(index); + } +} + +TEMPLATE_TEST_CASE("encode - decode", "[indexmappingtest", + LinearlyInterpolatedMapping, + LogarithmicMapping, + QuadraticallyInterpolatedMapping +) { + TestType mapping(0.01); + + std::stringstream ss; + mapping.serialize(ss); + + const TestType decoded_mapping = IndexMapping::deserialize(ss); + + REQUIRE(mapping.get_relative_accuracy() == decoded_mapping.get_relative_accuracy()); +} +} diff --git a/ddsketch/test/StoreTest.cpp b/ddsketch/test/StoreTest.cpp new file mode 100644 index 00000000..74bef8fb --- /dev/null +++ b/ddsketch/test/StoreTest.cpp @@ -0,0 +1,502 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// #include +// #include +// #include +// #include +// #include +// #include +// +// #include "collapsing_highest_dense_store.hpp" +// #include "store_factory.hpp" +// +// #include "collapsing_lowest_dense_store.hpp" +// #include "ddsketch.hpp" +// #include "linearly_interpolated_mapping.hpp" +// #include "unbounded_size_dense_store.hpp" +// +// namespace datasketches { +// +// static constexpr double eps = 1e-10; +// static constexpr int numTests = 30; +// +// using A = std::allocator; +// +// template +// class bins_transformer_factory { +// public: +// static std::unique_ptr new_bins_transformer() { +// return std::unique_ptr(); +// } +// }; +// +// template +// class collapsing_lowest_bins { +// public: +// static std::vector collapse(std::vector& bins) { +// int max_index = INT_MIN; +// for (const Bin& bin : bins) { +// max_index = std::max(max_index, bin.get_index()); +// } +// if (max_index < INT_MIN + max_num_bins) { +// return bins; +// } +// int min_collapsed_index = max_index - max_num_bins + 1; +// std::vector collapsed_bins; +// collapsed_bins.reserve(bins.size()); +// for (const Bin& bin : bins) { +// collapsed_bins.emplace_back(std::max(bin.get_index(), min_collapsed_index), bin.get_count()); +// } +// return collapsed_bins; +// } +// }; +// +// 
template +// class collapsing_highest_bins { +// public: +// static std::vector collapse(std::vector& bins) { +// int min_index = INT_MAX; +// for (const Bin& bin : bins) { +// min_index = std::min(min_index, bin.get_index()); +// } +// if (min_index > INT_MAX - max_num_bins) { +// return bins; +// } +// int max_collapsed_index = min_index + max_num_bins - 1; +// std::vector collapsed_bins; +// collapsed_bins.reserve(bins.size()); +// for (const Bin& bin : bins) { +// collapsed_bins.emplace_back(std::min(bin.get_index(), max_collapsed_index), bin.get_count()); +// } +// return collapsed_bins; +// } +// }; +// +// class noops_collapsing_bins { +// public: +// static std::vector collapse(std::vector& bins) { +// return bins; +// } +// +// }; +// +// std::vector normalize_bins(const std::vector& bins) { +// std::map bins_by_index; +// for (const Bin& bin : bins) { +// if (bin.get_count() <= 0) { +// continue; +// } +// bins_by_index[bin.get_index()] += bin.get_count(); +// } +// +// std::vector normalized_bins; +// normalized_bins.reserve(bins_by_index.size()); +// for (auto & it : bins_by_index) { +// normalized_bins.emplace_back(it.first, it.second); +// } +// +// std::sort(normalized_bins.begin(), normalized_bins.end(), [](const Bin& lhs, const Bin& rhs) { +// return lhs.get_index() < rhs.get_index(); +// }); +// +// return normalized_bins; +// } +// +// int random_index() { +// std::random_device rd; +// std::mt19937_64 rng(rd()); +// std::uniform_int_distribution distribution(-1000, 1000); +// return distribution(rng); +// } +// +// double random_count() { +// std::random_device rd; +// std::mt19937_64 rng(rd()); +// std::uniform_real_distribution distribution(0., 1.); +// double count= 0.; +// do { +// count = distribution(rng); +// } while (count < eps * 10); +// return count; +// } +// +// template +// void assert_encode_bins(StoreType& store, const std::vector& normalized_bins) { +// double expected_total_count = 0; +// for (const Bin& bin : normalized_bins) 
{ +// expected_total_count += bin.get_count(); +// } +// +// if (expected_total_count == 0) { +// REQUIRE(store->is_empty()); +// REQUIRE(store->get_total_count() == 0); +// REQUIRE_THROWS_AS(store->get_min_index(), std::runtime_error); +// REQUIRE_THROWS_AS(store->get_max_index(), std::runtime_error); +// } else { +// REQUIRE_FALSE(store->is_empty()); +// REQUIRE(store->get_total_count() - expected_total_count < eps); +// +// REQUIRE(store->get_min_index() == normalized_bins[0].get_index()); +// REQUIRE(store->get_max_index() == normalized_bins[normalized_bins.size() - 1].get_index()); +// +// std::vector bins; +// for (const Bin& bin : *store) { +// bins.push_back(bin); +// } +// +// std::sort(bins.begin(), bins.end(), [](const Bin& lhs, const Bin& rhs) { +// return lhs.get_index() < rhs.get_index(); +// }); +// REQUIRE(bins.size() == normalized_bins.size()); +// for (size_t i = 0; i < bins.size(); ++i) { +// REQUIRE(bins[i].get_index() == normalized_bins[i].get_index()); +// REQUIRE_THAT(bins[i].get_count(), Catch::Matchers::WithinAbs(normalized_bins[i].get_count(), 1e-3)); +// } +// } +// } +// +// template +// void test_copy(StoreType& store, const std::vector& normalized_bins) { +// auto store_copy = store->copy(); +// store->merge(*store_copy); +// assert_encode_bins(store_copy, normalized_bins); +// store->clear(); +// assert_encode_bins(store_copy, normalized_bins); +// std::vector empty_bins; +// assert_encode_bins(store, empty_bins); +// +// std::vector permutated_bins = normalized_bins; +// +// std::shuffle(permutated_bins.begin(), permutated_bins.end(), std::mt19937(42)); +// +// for (const Bin& bin : permutated_bins) { +// store->add(bin); +// } +// +// assert_encode_bins(store, normalized_bins); +// } +// +// template +// void test_store(StoreType& store, const std::vector& normalized_bins) { +// assert_encode_bins(store, normalized_bins); +// test_copy(store, normalized_bins); +// } +// +// template +// using CollapsingLowestStoreTestCase = 
store_factory>; +// +// template +// using CollapsingHighestStoreTestCase = store_factory>; +// +// using UnboundedStoreSizeTestCase = store_factory>; +// +// TEMPLATE_TEST_CASE("store test empty", "[storetest]", +// CollapsingLowestStoreTestCase<8>, +// CollapsingLowestStoreTestCase<128>, +// CollapsingLowestStoreTestCase<1024>, +// CollapsingHighestStoreTestCase<8>, +// CollapsingHighestStoreTestCase<128>, +// CollapsingHighestStoreTestCase<1024>, +// UnboundedStoreSizeTestCase +// ) { +// auto store = TestType::new_store(); +// std::vector empty_bins{}; +// test_store(store, empty_bins); +// } +// +// TEMPLATE_TEST_CASE("store test add datasets", "[storetest]", +// (std::pair>, collapsing_lowest_bins<8>>), +// (std::pair>, collapsing_lowest_bins<128>>), +// (std::pair>, collapsing_lowest_bins<1024>>), +// (std::pair>, collapsing_highest_bins<8>>), +// (std::pair>, collapsing_highest_bins<128>>), +// (std::pair>, collapsing_highest_bins<1024>>), +// (std::pair>, noops_collapsing_bins>) +// ) { +// std::vector> datasets{ +// {-1000}, +// {-1}, +// {0}, +// {1}, +// {1000}, +// {1000, 1000}, +// {1000, -1000}, +// {-1000, 1000}, +// {-1000, -1000}, +// {0, 0, 0, 0} +// }; +// std::vector counts{0.1, 1, 100}; +// +// for (const std::vector& dataset : datasets) { +// std::vector bins; +// bins.reserve(dataset.size()); +// auto storeAdd = TestType::first_type::new_store(); +// for (const int& index : dataset) { +// Bin bin(index, 1); +// bins.push_back(bin); +// storeAdd->add(index); +// } +// std::vector normalized_bins = normalize_bins(TestType::second_type::collapse(bins)); +// test_store(storeAdd, normalized_bins); +// for (const double& count : counts) { +// bins.clear(); +// auto storeAddBin = TestType::first_type::new_store(); +// auto storeAddWithCount = TestType::first_type::new_store(); +// for (const int& index : dataset) { +// Bin bin(index, count); +// bins.push_back(bin); +// storeAddBin->add(bin); +// storeAddWithCount->add(index, count); +// } +// 
normalized_bins = normalize_bins(TestType::second_type::collapse(bins)); +// test_store(storeAddBin, normalized_bins); +// test_store(storeAddWithCount, normalized_bins); +// } +// } +// +// } +// +// TEMPLATE_TEST_CASE("store test add constant", "[storetest]", +// (std::pair>, collapsing_lowest_bins<8>>), +// (std::pair>, collapsing_lowest_bins<128>>), +// (std::pair>, collapsing_lowest_bins<1024>>), +// (std::pair>, collapsing_highest_bins<8>>), +// (std::pair>, collapsing_highest_bins<128>>), +// (std::pair>, collapsing_highest_bins<1024>>), +// (std::pair>, noops_collapsing_bins>) +// ) { +// std::vector indexes{-1000, -1, 0, 1, 1000}; +// std::vector counts{0, 1, 2, 4, 5, 10, 20, 100, 1000, 10000}; +// +// for (int idx: indexes) { +// for (double count: counts) { +// auto storeAdd = TestType::first_type::new_store(); +// auto storeAddBin = TestType::first_type::new_store(); +// auto storeAddWithCount = TestType::first_type::new_store(); +// for (int i = 0; i < count; ++i) { +// storeAdd->add(idx); +// storeAddBin->add(Bin(idx, 1)); +// storeAddWithCount->add(idx, 1); +// } +// std::vector bins{Bin(idx, count)}; +// std::vector normalized_bins = normalize_bins(TestType::second_type::collapse(bins)); +// test_store(storeAdd, normalized_bins); +// test_store(storeAddBin, normalized_bins); +// test_store(storeAddWithCount, normalized_bins); +// } +// } +// } +// +// TEMPLATE_TEST_CASE("test add monotonous", "[storetest]", +// (std::pair>, collapsing_lowest_bins<8>>), +// (std::pair>, collapsing_lowest_bins<128>>), +// (std::pair>, collapsing_lowest_bins<1024>>), +// (std::pair>, collapsing_highest_bins<8>>), +// (std::pair>, collapsing_highest_bins<128>>), +// (std::pair>, collapsing_highest_bins<1024>>), +// (std::pair>, noops_collapsing_bins>) +// ) { +// std::vector increments{2, 10, 100, -2, -10, -100}; +// std::vector spreads{2, 10, 10000}; +// +// for (const int& incr: increments) { +// for (const int& spread: spreads) { +// std::vector bins; +// auto 
storeAdd = TestType::first_type::new_store(); +// auto storeAddBin = TestType::first_type::new_store(); +// auto storeAddWithCount = TestType::first_type::new_store(); +// for (int index = 0; std::abs(index) <= spread; index += incr) { +// Bin bin(index, 1); +// bins.push_back(bin); +// storeAdd->add(index); +// storeAddBin->add(bin); +// storeAddWithCount->add(index, 1); +// } +// std::vector normalized_bins = normalize_bins(TestType::second_type::collapse(bins)); +// test_store(storeAdd, normalized_bins); +// test_store(storeAddBin, normalized_bins); +// test_store(storeAddWithCount, normalized_bins); +// } +// } +// } +// +// TEMPLATE_TEST_CASE("test add fuzzy", "[storetest]", +// (std::pair>, collapsing_lowest_bins<8>>), +// (std::pair>, collapsing_lowest_bins<128>>), +// (std::pair>, collapsing_lowest_bins<1024>>), +// (std::pair>, collapsing_highest_bins<8>>), +// (std::pair>, collapsing_highest_bins<128>>), +// (std::pair>, collapsing_highest_bins<1024>>), +// (std::pair>, noops_collapsing_bins>) +// ) { +// const int maxNumValues = 1000; +// std::random_device r; +// std::mt19937_64 rng(r()); +// std::uniform_int_distribution dist(0, maxNumValues - 1); +// +// for (int i = 0; i < numTests; i++) { +// std::vector bins; +// auto storeAdd = TestType::first_type::new_store(); +// auto storeAddBin = TestType::first_type::new_store(); +// auto storeAddWithCount = TestType::first_type::new_store(); +// int numValues = dist(rng); +// for (int j = 0; j < numValues; j++) { +// Bin bin(random_index(), random_count()); +// bins.push_back(bin); +// storeAddBin->add(bin); +// storeAddWithCount->add(bin.get_index(), bin.get_count()); +// } +// std::vector normalized_bins = normalize_bins(TestType::second_type::collapse(bins)); +// test_store(storeAddBin, normalized_bins); +// test_store(storeAddWithCount, normalized_bins); +// } +// } +// +// TEMPLATE_TEST_CASE("test merge fuzzy", "[storetest]", +// (std::pair>, collapsing_lowest_bins<8>>), +// (std::pair>, 
collapsing_lowest_bins<128>>), +// (std::pair>, collapsing_lowest_bins<1024>>), +// (std::pair>, collapsing_highest_bins<8>>), +// (std::pair>, collapsing_highest_bins<128>>), +// (std::pair>, collapsing_highest_bins<1024>>), +// (std::pair>, noops_collapsing_bins>) +// ) { +// const int numMerges = 3; +// const int maxNumAdds = 1000; +// +// std::random_device r; +// std::mt19937_64 rng(r()); +// std::uniform_int_distribution dist(0, maxNumAdds - 1); +// +// for (int i = 0; i < numTests; i++) { +// std::vector bins; +// auto store = TestType::first_type::new_store(); +// for (int j = 0; j < numMerges; j++) { +// int numValues = dist(rng); +// auto tmpStore = TestType::first_type::new_store(); +// for (int k = 0; k < numValues; k++) { +// Bin bin(random_index(), random_count()); +// bins.push_back(bin); +// tmpStore->add(bin); +// } +// store->merge(*tmpStore); +// } +// std::vector normalized_bins = normalize_bins(TestType::second_type::collapse(bins)); +// test_store(store, normalized_bins); +// } +// } +// +// template +// void test_cross_merge(StoreType& s, OtherType& o, std::function(std::vector&)> c1, std::function(std::vector&)> c2) { +// std::vector indexes{-1000, -1, 0, 1, 1000}; +// std::vector counts{0, 1, 2, 4, 5, 10, 20, 100, 1000, 10000}; +// +// std::vector bins; +// for (const int& index: indexes) { +// double total_count = 0; +// for (const double& count: counts) { +// s.add(index, count); +// o.add(index, count); +// total_count += count; +// } +// bins.emplace_back(index, total_count); +// } +// std::vector normalized_bins = normalize_bins(c1(bins)); +// std::vector normalized_other_bins = normalize_bins(c2(bins)); +// test_store(s, normalized_bins); +// test_store(o, normalized_other_bins); +// +// +// std::vector merged_bins(normalized_bins); +// merged_bins.insert(merged_bins.end(), normalized_other_bins.begin(), normalized_other_bins.end()); +// std::vector normalized_merged_bins = normalize_bins(c1(merged_bins)); +// s.merge(o); +// 
test_store(s, normalized_merged_bins); +// +// std::vector merged_other_bins(normalized_other_bins); +// merged_other_bins.insert(merged_other_bins.end(), normalized_merged_bins.begin(), normalized_merged_bins.end()); +// std::vector normalized_merged_other_bins = normalize_bins(c2(merged_other_bins)); +// o.merge(s); +// test_store(o, normalized_merged_other_bins); +// } +// +// TEMPLATE_TEST_CASE("test cross merge", "[storetest]", +// (std::pair>, collapsing_lowest_bins<8>>), +// (std::pair>, collapsing_lowest_bins<128>>), +// (std::pair>, collapsing_lowest_bins<1024>>), +// (std::pair>, collapsing_highest_bins<8>>), +// (std::pair>, collapsing_highest_bins<128>>), +// (std::pair>, collapsing_highest_bins<1024>>), +// (std::pair>, noops_collapsing_bins>) +// ) { +// std::vector indexes{-1000, -1, 0, 1, 1000}; +// std::vector counts{0, 1, 2, 4, 5, 10, 20, 100, 1000, 10000}; +// +// auto dense_store_1 = *store_factory>::new_store(); +// auto collapse_1 = collapsing_lowest_bins<1024>::collapse; +// auto store_1 = *TestType::first_type::new_store(); +// +// test_cross_merge(store_1, dense_store_1, collapse_1, collapse_1); +// auto dense_store_2 = store_factory>::new_store(); +// auto collapse_2 = collapsing_highest_bins<1024>::collapse; +// auto dense_store_3 = store_factory>::new_store(); +// auto collapse_3 = noops_collapsing_bins::collapse; +// } +// +// TEMPLATE_TEST_CASE("test store serialize - deserialize", "[serialization]", +// CollapsingLowestStoreTestCase<8>, +// CollapsingLowestStoreTestCase<128>, +// CollapsingLowestStoreTestCase<1024>, +// CollapsingHighestStoreTestCase<8>, +// CollapsingHighestStoreTestCase<128>, +// CollapsingHighestStoreTestCase<1024>, +// UnboundedStoreSizeTestCase +// ) { +// // Test empty store serialization +// auto store = *TestType::new_store(); +// using StoreType = decltype(store); +// std::stringstream stream; +// +// store.serialize(stream); +// StoreType deserialized_empty_store = StoreType::deserialize(stream); +// 
REQUIRE(store.is_empty()); +// REQUIRE(deserialized_empty_store.is_empty()); +// REQUIRE(stream.peek() == std::istream::traits_type::eof()); +// REQUIRE(store == deserialized_empty_store); +// stream.clear(); +// +// std::vector indexes{-1000, -1, 0, 1, 1000}; +// std::vector counts{0, 1, 2, 4, 5, 10, 20, 100, 1000, 10000}; +// for (int idx: indexes) { +// for (double count: counts) { +// store.add(idx, count); +// } +// } +// +// store.serialize(stream); +// auto deserialized_store = StoreType::deserialize(stream); +// REQUIRE_FALSE(store.is_empty()); +// REQUIRE_FALSE(deserialized_store.is_empty()); +// REQUIRE(stream.peek() == std::istream::traits_type::eof()); +// REQUIRE(store == deserialized_store); +// +// } +// } +