From ed4886505be0acfb8a211e9c6f8b75d97e1a9e25 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 11:45:31 +0800 Subject: [PATCH 01/26] feat: implement T-Digest Signed-off-by: tison --- src/lib.rs | 1 + src/tdigest/mod.rs | 79 +++++++++++++++++++++++++ src/tdigest/sketch.rs | 133 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 213 insertions(+) create mode 100644 src/tdigest/mod.rs create mode 100644 src/tdigest/sketch.rs diff --git a/src/lib.rs b/src/lib.rs index 07ace55..20f5940 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,3 +28,4 @@ pub mod error; pub mod hll; +pub mod tdigest; diff --git a/src/tdigest/mod.rs b/src/tdigest/mod.rs new file mode 100644 index 0000000..1e1d74a --- /dev/null +++ b/src/tdigest/mod.rs @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! T-Digest implementation for estimating quantiles and ranks. +//! +//! The implementation in this library is based on the MergingDigest described in +//! [Computing Extremely Accurate Quantiles Using t-Digests][paper] by Ted Dunning and Otmar Ertl. +//! +//! The implementation in this library has a few differences from the reference implementation +//! associated with that paper: +//! +//! 
* Merge does not modify the input +//! * Deserialization similar to other sketches in this library, although reading the reference +//! implementation format is supported +//! +//! Unlike all other algorithms in the library, t-digest is empirical and has no mathematical +//! basis for estimating its error and its results are dependent on the input data. However, +//! for many common data distributions, it can produce excellent results. t-digest also operates +//! only on numeric data and, unlike the quantiles family algorithms in the library which return +//! quantile approximations from the input domain, t-digest interpolates values and will hold and +//! return data points not seen in the input. +//! +//! The closest alternative to t-digest in this library is REQ sketch. It prioritizes one chosen +//! side of the rank domain: either low rank accuracy or high rank accuracy. t-digest (in this +//! implementation) prioritizes both ends of the rank domain and has lower accuracy towards the +//! middle of the rank domain (median). +//! +//! Measurements show that t-digest is slightly biased (tends to underestimate low ranks and +//! overestimate high ranks), while still doing very well close to the extremes. The effect seems +//! to be more pronounced with more input values. +//! +//! For more information on the performance characteristics, see the +//! [Datasketches page on t-digest](https://datasketches.apache.org/docs/tdigest/tdigest.html). +//! +//! [paper]: https://arxiv.org/abs/1902.04023 + +mod sketch; +pub use self::sketch::TDigest; + +/// The default value of K if one is not specified. +const DEFAULT_K: usize = 200; + +/// Generates cluster sizes proportional to `q*(1-q)`. +/// +/// The use of a normalizing function results in a strictly bounded number of clusters no matter +/// how many samples. +/// +/// Corresponds to K_2 in the reference implementation +mod scale_function { + pub(super) fn max(q: f64, normalizer: f64) -> f64 { + q * (1. 
- q) / normalizer + } + + pub(super) fn normalizer(compression: f64, n: f64) -> f64 { + compression / z(compression, n) + } + + pub(super) fn z(compression: f64, n: f64) -> f64 { + 4. * (n / compression).ln() + 24. + } +} + +const fn weighted_average(x1: f64, w1: f64, x2: f64, w2: f64) -> f64 { + (x1 * w1 + x2 * w2) / (w1 + w2) +} diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs new file mode 100644 index 0000000..3b1c6ba --- /dev/null +++ b/src/tdigest/sketch.rs @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::tdigest::DEFAULT_K; + +const BUFFER_MULTIPLIER: usize = 4; + +/// T-Digest sketch for estimating quantiles and ranks. +/// +/// See the [module documentation](super) for more details. +#[derive(Debug, Clone, PartialEq)] +pub struct TDigest { + k: usize, + + reverse_merge: bool, + min: f64, + max: f64, + + centroids: Vec, + centroids_weight: u64, + centroids_capacity: usize, + buffer: Vec, +} + +impl Default for TDigest { + fn default() -> Self { + TDigest::new(DEFAULT_K) + } +} + +impl TDigest { + /// Creates a tdigest instance with the given value of k. 
+ /// + /// # Panics + /// + /// If k is less than 10 + pub fn new(k: usize) -> Self { + assert!(k >= 10, "k must be at least 10"); + + let fudge = if k < 30 { 30 } else { 10 }; + let centroids_capacity = (k * 2) + fudge; + + let centroids = Vec::with_capacity(centroids_capacity); + let buffer = Vec::with_capacity(centroids_capacity * BUFFER_MULTIPLIER); + + TDigest { + k, + reverse_merge: false, + min: f64::INFINITY, + max: f64::NEG_INFINITY, + centroids, + centroids_weight: 0, + centroids_capacity, + buffer, + } + } + + /// Update this TDigest with the given value. + pub fn update(&mut self, value: f64) { + if value.is_nan() { + return; + } + + if self.buffer.len() == self.centroids_capacity * BUFFER_MULTIPLIER { + todo!("implement compress()"); + } + + self.buffer.push(value); + self.min = self.min.min(value); + self.max = self.max.max(value); + } + + /// Returns true if TDigest has not seen any data. + pub fn is_empty(&self) -> bool { + self.centroids.is_empty() && self.buffer.is_empty() + } + + /// Returns minimum value seen by TDigest. + pub fn min_value(&self) -> Option { + if self.is_empty() { + None + } else { + Some(self.min) + } + } + + /// Returns maximum value seen by TDigest. + pub fn max_value(&self) -> Option { + if self.is_empty() { + None + } else { + Some(self.max) + } + } + + /// Returns total weight. 
+ pub fn total_weight(&self) -> u64 { + self.centroids_weight + (self.buffer.len() as u64) + } +} + +#[derive(Debug, Clone, PartialEq)] +struct Centroid { + mean: f64, + weight: u64, +} + +impl Centroid { + fn add(&mut self, other: &Centroid) { + if self.weight != 0 { + let total_weight = self.weight + other.weight; + self.mean += (other.weight as f64) * (other.mean - self.mean) / (total_weight as f64); + self.weight = total_weight; + } else { + self.mean = other.mean; + self.weight = other.weight; + } + } +} From 26ee9550941afef4f2d6f753a598cf63b717bc86 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 13:04:29 +0800 Subject: [PATCH 02/26] impl merge and compress Signed-off-by: tison --- src/tdigest/mod.rs | 27 ------ src/tdigest/sketch.rs | 190 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 183 insertions(+), 34 deletions(-) diff --git a/src/tdigest/mod.rs b/src/tdigest/mod.rs index 1e1d74a..c0818fc 100644 --- a/src/tdigest/mod.rs +++ b/src/tdigest/mod.rs @@ -50,30 +50,3 @@ mod sketch; pub use self::sketch::TDigest; - -/// The default value of K if one is not specified. -const DEFAULT_K: usize = 200; - -/// Generates cluster sizes proportional to `q*(1-q)`. -/// -/// The use of a normalizing function results in a strictly bounded number of clusters no matter -/// how many samples. -/// -/// Corresponds to K_2 in the reference implementation -mod scale_function { - pub(super) fn max(q: f64, normalizer: f64) -> f64 { - q * (1. - q) / normalizer - } - - pub(super) fn normalizer(compression: f64, n: f64) -> f64 { - compression / z(compression, n) - } - - pub(super) fn z(compression: f64, n: f64) -> f64 { - 4. * (n / compression).ln() + 24. 
- } -} - -const fn weighted_average(x1: f64, w1: f64, x2: f64, w2: f64) -> f64 { - (x1 * w1 + x2 * w2) / (w1 + w2) -} diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 3b1c6ba..e6799d7 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -use crate::tdigest::DEFAULT_K; - const BUFFER_MULTIPLIER: usize = 4; /// T-Digest sketch for estimating quantiles and ranks. @@ -38,11 +36,14 @@ pub struct TDigest { impl Default for TDigest { fn default() -> Self { - TDigest::new(DEFAULT_K) + TDigest::new(Self::DEFAULT_K) } } impl TDigest { + /// The default value of K if one is not specified. + pub const DEFAULT_K: usize = 200; + /// Creates a tdigest instance with the given value of k. /// /// # Panics @@ -69,14 +70,14 @@ impl TDigest { } } - /// Update this TDigest with the given value. + /// Update this TDigest with the given value (`NaN` values are ignored). pub fn update(&mut self, value: f64) { if value.is_nan() { return; } if self.buffer.len() == self.centroids_capacity * BUFFER_MULTIPLIER { - todo!("implement compress()"); + self.compress(); } self.buffer.push(value); @@ -111,16 +112,167 @@ impl TDigest { pub fn total_weight(&self) -> u64 { self.centroids_weight + (self.buffer.len() as u64) } + + /// Compute approximate quantile value corresponding to the given normalized rank + /// + /// # Panics + /// + /// If rank is not in [0.0, 1.0], or if TDigest is empty. + pub fn get_quantile(&mut self, rank: f64) -> f64 { + assert!(rank >= 0.0 && rank <= 1.0, "rank must be in [0.0, 1.0]"); + assert!(!self.is_empty(), "TDigest is empty"); + self.compress(); + if self.centroids.len() == 1 { + return self.centroids[0].mean; + } + + // at least 2 centroids + let centroids_weight = self.centroids_weight as f64; + let num_centroids = self.centroids.len(); + let weight = rank * centroids_weight; + if weight < 1. 
{ + return self.min; + } + if weight > centroids_weight - 1. { + return self.max; + } + let first_weight = self.centroids[0].weight as f64; + if first_weight > 1. && weight < first_weight / 2. { + return self.min + + (((weight - 1.) / ((first_weight / 2.) - 1.)) + * (self.centroids[0].mean - self.min)); + } + let last_weight = self.centroids[num_centroids - 1].weight as f64; + if last_weight > 1. && (centroids_weight - weight <= last_weight / 2.) { + return self.max + + (((centroids_weight - weight - 1.) / ((last_weight / 2.) - 1.)) + * (self.max - self.centroids[num_centroids - 1].mean)); + } + + // interpolate between extremes + let mut weight_so_far = first_weight / 2.; + for i in 0..(num_centroids - 1) { + let dw = (self.centroids[i].weight + self.centroids[i + 1].weight) as f64 / 2.; + if weight_so_far + dw > weight { + // the target weight is between centroids i and i+1 + let mut left_weight = 0.; + if self.centroids[i].weight == 1 { + if weight - weight_so_far < 0.5 { + return self.centroids[i].mean; + } + left_weight = 0.5; + } + let mut right_weight = 0.; + if self.centroids[i + 1].weight == 1 { + if weight_so_far + dw - weight < 0.5 { + return self.centroids[i + 1].mean; + } + right_weight = 0.5; + } + let w1 = weight - weight_so_far - left_weight; + let w2 = weight_so_far + dw - weight - right_weight; + return weighted_average( + self.centroids[i].mean, + w1, + self.centroids[i + 1].mean, + w2, + ); + } + weight_so_far += dw; + } + + let w1 = weight + - (self.centroids_weight as f64) + - ((self.centroids[num_centroids - 1].weight as f64) / 2.); + let w2 = (self.centroids[num_centroids - 1].weight as f64 / 2.) - w1; + weighted_average(self.centroids[num_centroids - 1].mean, w1, self.max, w2) + } + + /// Process buffered values and merge centroids if needed. 
+ fn compress(&mut self) { + if self.buffer.is_empty() { + return; + } + let mut tmp = Vec::with_capacity(self.buffer.len() + self.centroids.len()); + for &v in &self.buffer { + tmp.push(Centroid { mean: v, weight: 1 }); + } + self.merge(tmp, self.buffer.len() as u64) + } + + /// Merges the given buffer of centroids into this TDigest. + /// + /// # Contract + /// + /// * `buffer` must have at least one centroid. + /// * `buffer` is generated from `self.buffer`, and thus: + /// * No `NAN` values are present in `buffer`. + /// * We should clear `self.buffer` after merging. + fn merge(&mut self, mut buffer: Vec, weight: u64) { + buffer.extend(std::mem::take(&mut self.centroids)); + buffer.sort_by(centroid_cmp); + if self.reverse_merge { + buffer.reverse(); + } + self.centroids_weight += weight; + + let mut num_centroids = 0; + let len = buffer.len(); + self.centroids.push(buffer[0]); + num_centroids += 1; + let mut current = 1; + let mut weight_so_far = 0.; + while current < len { + let c = buffer[current]; + let proposed_weight = (self.centroids[num_centroids - 1].weight + c.weight) as f64; + let mut add_this = false; + if (current != 1) && (current != (len - 1)) { + let centroids_weight = self.centroids_weight as f64; + let q0 = weight_so_far / centroids_weight; + let q2 = (weight_so_far + proposed_weight) / centroids_weight; + let normalizer = scale_function::normalizer((2 * self.k) as f64, centroids_weight); + add_this = proposed_weight + <= (centroids_weight + * scale_function::max(q0, normalizer) + .min(scale_function::max(q2, normalizer))); + } + if add_this { + // merge into existing centroid + self.centroids[num_centroids - 1].add(c); + } else { + // copy to a new centroid + weight_so_far += self.centroids[num_centroids - 1].weight as f64; + self.centroids.push(c); + num_centroids += 1; + } + current += 1; + } + + if self.reverse_merge { + self.centroids.reverse(); + } + self.min = self.min.min(self.centroids[0].mean); + self.max = 
self.max.max(self.centroids[num_centroids - 1].mean); + self.reverse_merge = !self.reverse_merge; + self.buffer.clear(); + } } -#[derive(Debug, Clone, PartialEq)] +fn centroid_cmp(a: &Centroid, b: &Centroid) -> std::cmp::Ordering { + match a.mean.partial_cmp(&b.mean) { + Some(order) => order, + None => unreachable!("NaN values should never be present in centroids"), + } +} + +#[derive(Debug, Clone, Copy, PartialEq)] struct Centroid { mean: f64, weight: u64, } impl Centroid { - fn add(&mut self, other: &Centroid) { + fn add(&mut self, other: Centroid) { if self.weight != 0 { let total_weight = self.weight + other.weight; self.mean += (other.weight as f64) * (other.mean - self.mean) / (total_weight as f64); @@ -131,3 +283,27 @@ impl Centroid { } } } + +/// Generates cluster sizes proportional to `q*(1-q)`. +/// +/// The use of a normalizing function results in a strictly bounded number of clusters no matter +/// how many samples. +/// +/// Corresponds to K_2 in the reference implementation +mod scale_function { + pub(super) fn max(q: f64, normalizer: f64) -> f64 { + q * (1. - q) / normalizer + } + + pub(super) fn normalizer(compression: f64, n: f64) -> f64 { + compression / z(compression, n) + } + + pub(super) fn z(compression: f64, n: f64) -> f64 { + 4. * (n / compression).ln() + 24. 
+ } +} + +const fn weighted_average(x1: f64, w1: f64, x2: f64, w2: f64) -> f64 { + (x1 * w1 + x2 * w2) / (w1 + w2) +} From 88ac87e8dec133e785b694f3bfe86b9919290251 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 14:56:03 +0800 Subject: [PATCH 03/26] impl get_rank Signed-off-by: tison --- src/tdigest/sketch.rs | 180 ++++++++++++++++++++++++++++++++++++------ tests/tdigest_test.rs | 122 ++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 22 deletions(-) create mode 100644 tests/tdigest_test.rs diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index e6799d7..b888736 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +use std::convert::identity; + const BUFFER_MULTIPLIER: usize = 4; /// T-Digest sketch for estimating quantiles and ranks. @@ -85,12 +87,17 @@ impl TDigest { self.max = self.max.max(value); } + /// Returns parameter k (compression) that was used to configure this TDigest. + pub fn k(&self) -> usize { + self.k + } + /// Returns true if TDigest has not seen any data. pub fn is_empty(&self) -> bool { self.centroids.is_empty() && self.buffer.is_empty() } - /// Returns minimum value seen by TDigest. + /// Returns minimum value seen by TDigest; `None` if TDigest is empty. pub fn min_value(&self) -> Option { if self.is_empty() { None @@ -99,7 +106,7 @@ impl TDigest { } } - /// Returns maximum value seen by TDigest. + /// Returns maximum value seen by TDigest; `None` if TDigest is empty. pub fn max_value(&self) -> Option { if self.is_empty() { None @@ -113,17 +120,137 @@ impl TDigest { self.centroids_weight + (self.buffer.len() as u64) } - /// Compute approximate quantile value corresponding to the given normalized rank + /// Compute approximate normalized rank (from 0 to 1 inclusive) of the given value. + /// + /// Returns `None` if TDigest is empty. 
/// /// # Panics /// - /// If rank is not in [0.0, 1.0], or if TDigest is empty. - pub fn get_quantile(&mut self, rank: f64) -> f64 { - assert!(rank >= 0.0 && rank <= 1.0, "rank must be in [0.0, 1.0]"); - assert!(!self.is_empty(), "TDigest is empty"); - self.compress(); + /// If the value is `NaN`. + pub fn get_rank(&mut self, value: f64) -> Option { + assert!(!value.is_nan(), "value must not be NaN"); + + if self.is_empty() { + return None; + } + if value < self.min { + return Some(0.0); + } + if value > self.max { + return Some(1.0); + } + // one centroid and value == min == max + if self.centroids.len() + self.buffer.len() == 1 { + return Some(0.5); + } + + self.compress(); // side effect + let centroids_weight = self.centroids_weight as f64; + let num_centroids = self.centroids.len(); + + // left tail + let first_mean = self.centroids[0].mean; + if value < first_mean { + if first_mean - self.min > 0. { + return Some(if value == self.min { + 0.5 / centroids_weight + } else { + 1. + (((value - self.min) / (first_mean - self.min)) + * ((self.centroids[0].weight as f64 / 2.) - 1.)) + }); + } + return Some(0.); // should never happen + } + + // right tail + let last_mean = self.centroids[num_centroids - 1].mean; + if value > last_mean { + if self.max - last_mean > 0. { + return Some(if value == self.max { + 1. - (0.5 / centroids_weight) + } else { + 1.0 - ((1.0 + + (((self.max - value) / (self.max - last_mean)) + * ((self.centroids[num_centroids - 1].weight as f64 / 2.) 
- 1.))) + / centroids_weight) + }); + } + return Some(1.); // should never happen + } + + let mut lower = self + .centroids + .binary_search_by(|c| { + if c.mean < value { + std::cmp::Ordering::Less + } else { + std::cmp::Ordering::Greater + } + }) + .unwrap_or_else(identity); + debug_assert_ne!(lower, num_centroids, "get_rank: lower == end"); + let mut upper = self + .centroids + .binary_search_by(|c| { + if c.mean > value { + std::cmp::Ordering::Greater + } else { + std::cmp::Ordering::Less + } + }) + .unwrap_or_else(identity); + debug_assert_ne!(upper, 0, "get_rank: upper == begin"); + if value < self.centroids[lower].mean { + lower -= 1; + } + if (upper == num_centroids) || (self.centroids[upper - 1].mean >= value) { + upper -= 1; + } + + let mut weight_below = 0.; + let mut i = 0; + while i < lower { + weight_below += self.centroids[i].weight as f64; + i += 1; + } + weight_below += self.centroids[lower].weight as f64 / 2.; + + let mut weight_delta = 0.; + while i < upper { + weight_delta += self.centroids[i].weight as f64; + i += 1; + } + weight_delta -= self.centroids[lower].weight as f64 / 2.; + weight_delta += self.centroids[upper].weight as f64 / 2.; + Some( + if self.centroids[upper].mean - self.centroids[lower].mean > 0. { + (weight_below + + (weight_delta * (value - self.centroids[lower].mean) + / (self.centroids[upper].mean - self.centroids[lower].mean))) + / centroids_weight + } else { + (weight_below + weight_delta / 2.) / centroids_weight + }, + ) + } + + /// Compute approximate quantile value corresponding to the given normalized rank. + /// + /// Returns `None` if TDigest is empty. + /// + /// # Panics + /// + /// If rank is not in [0.0, 1.0]. 
+ pub fn get_quantile(&mut self, rank: f64) -> Option { + assert!((0.0..=1.0).contains(&rank), "rank must be in [0.0, 1.0]"); + + if self.is_empty() { + return None; + } + + self.compress(); // side effect if self.centroids.len() == 1 { - return self.centroids[0].mean; + return Some(self.centroids[0].mean); } // at least 2 centroids @@ -131,22 +258,26 @@ impl TDigest { let num_centroids = self.centroids.len(); let weight = rank * centroids_weight; if weight < 1. { - return self.min; + return Some(self.min); } if weight > centroids_weight - 1. { - return self.max; + return Some(self.max); } let first_weight = self.centroids[0].weight as f64; if first_weight > 1. && weight < first_weight / 2. { - return self.min - + (((weight - 1.) / ((first_weight / 2.) - 1.)) - * (self.centroids[0].mean - self.min)); + return Some( + self.min + + (((weight - 1.) / ((first_weight / 2.) - 1.)) + * (self.centroids[0].mean - self.min)), + ); } let last_weight = self.centroids[num_centroids - 1].weight as f64; if last_weight > 1. && (centroids_weight - weight <= last_weight / 2.) { - return self.max - + (((centroids_weight - weight - 1.) / ((last_weight / 2.) - 1.)) - * (self.max - self.centroids[num_centroids - 1].mean)); + return Some( + self.max + + (((centroids_weight - weight - 1.) / ((last_weight / 2.) 
- 1.)) + * (self.max - self.centroids[num_centroids - 1].mean)), + ); } // interpolate between extremes @@ -158,25 +289,25 @@ impl TDigest { let mut left_weight = 0.; if self.centroids[i].weight == 1 { if weight - weight_so_far < 0.5 { - return self.centroids[i].mean; + return Some(self.centroids[i].mean); } left_weight = 0.5; } let mut right_weight = 0.; if self.centroids[i + 1].weight == 1 { if weight_so_far + dw - weight < 0.5 { - return self.centroids[i + 1].mean; + return Some(self.centroids[i + 1].mean); } right_weight = 0.5; } let w1 = weight - weight_so_far - left_weight; let w2 = weight_so_far + dw - weight - right_weight; - return weighted_average( + return Some(weighted_average( self.centroids[i].mean, w1, self.centroids[i + 1].mean, w2, - ); + )); } weight_so_far += dw; } @@ -185,7 +316,12 @@ impl TDigest { - (self.centroids_weight as f64) - ((self.centroids[num_centroids - 1].weight as f64) / 2.); let w2 = (self.centroids[num_centroids - 1].weight as f64 / 2.) - w1; - weighted_average(self.centroids[num_centroids - 1].mean, w1, self.max, w2) + Some(weighted_average( + self.centroids[num_centroids - 1].mean, + w1, + self.max, + w2, + )) } /// Process buffered values and merge centroids if needed. diff --git a/tests/tdigest_test.rs b/tests/tdigest_test.rs new file mode 100644 index 0000000..9d50e10 --- /dev/null +++ b/tests/tdigest_test.rs @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datasketches::tdigest::TDigest; + +#[test] +fn test_empty() { + let mut tdigest = TDigest::new(10); + assert!(tdigest.is_empty()); + assert_eq!(tdigest.k(), 10); + assert_eq!(tdigest.total_weight(), 0); + assert_eq!(tdigest.min_value(), None); + assert_eq!(tdigest.max_value(), None); + assert_eq!(tdigest.get_rank(0.0), None); + assert_eq!(tdigest.get_quantile(0.5), None); + + // TODO: Support PMF and CDF + // const double split_points[1] {0}; + // REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error); + // REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error); +} + +#[test] +fn test_one_value() { + let mut tdigest = TDigest::new(100); + tdigest.update(1.0); + assert_eq!(tdigest.k(), 100); + assert_eq!(tdigest.total_weight(), 1); + assert_eq!(tdigest.min_value(), Some(1.0)); + assert_eq!(tdigest.max_value(), Some(1.0)); + assert_eq!(tdigest.get_rank(0.99), Some(0.0)); + assert_eq!(tdigest.get_rank(1.0), Some(0.5)); + assert_eq!(tdigest.get_rank(1.01), Some(1.0)); + assert_eq!(tdigest.get_quantile(0.0), Some(1.0)); + assert_eq!(tdigest.get_quantile(0.5), Some(1.0)); + assert_eq!(tdigest.get_quantile(1.0), Some(1.0)); +} + +#[test] +fn test_many_values() { + // TODO: Later until PMF and CDF are supported + // const size_t n = 10000; + // tdigest_double td; + // for (size_t i = 0; i < n; ++i) td.update(i); + // REQUIRE_FALSE(td.is_empty()); + // REQUIRE(td.get_total_weight() == n); + // REQUIRE(td.get_min_value() == 0); + // REQUIRE(td.get_max_value() == n - 1); + // REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001)); 
+ // REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001)); + // REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001)); + // REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001)); + // REQUIRE(td.get_rank(n) == 1); + // REQUIRE(td.get_quantile(0) == 0); + // REQUIRE(td.get_quantile(0.5) == Approx(n / 2).epsilon(0.03)); + // REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01)); + // REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01)); + // REQUIRE(td.get_quantile(1) == n - 1); + // const double split_points[1] {n / 2}; + // const auto pmf = td.get_PMF(split_points, 1); + // REQUIRE(pmf.size() == 2); + // REQUIRE(pmf[0] == Approx(0.5).margin(0.0001)); + // REQUIRE(pmf[1] == Approx(0.5).margin(0.0001)); + // const auto cdf = td.get_CDF(split_points, 1); + // REQUIRE(cdf.size() == 2); + // REQUIRE(cdf[0] == Approx(0.5).margin(0.0001)); + // REQUIRE(cdf[1] == 1); +} + +#[test] +fn test_rank_two_values() { + let mut tdigest = TDigest::new(100); + tdigest.update(1.0); + tdigest.update(2.0); + assert_eq!(tdigest.get_rank(0.99), Some(0.0)); + assert_eq!(tdigest.get_rank(1.0), Some(0.25)); + assert_eq!(tdigest.get_rank(1.25), Some(0.375)); + assert_eq!(tdigest.get_rank(1.5), Some(0.5)); + assert_eq!(tdigest.get_rank(1.75), Some(0.625)); + assert_eq!(tdigest.get_rank(2.0), Some(0.75)); + assert_eq!(tdigest.get_rank(2.01), Some(1.0)); +} + +#[test] +fn test_rank_repeated_values() { + let mut tdigest = TDigest::new(100); + tdigest.update(1.0); + tdigest.update(1.0); + tdigest.update(1.0); + tdigest.update(1.0); + assert_eq!(tdigest.get_rank(0.99), Some(0.0)); + assert_eq!(tdigest.get_rank(1.0), Some(0.5)); + assert_eq!(tdigest.get_rank(1.01), Some(1.0)); +} + +#[test] +fn test_repeated_blocks() { + let mut tdigest = TDigest::new(100); + tdigest.update(1.0); + tdigest.update(2.0); + tdigest.update(2.0); + tdigest.update(3.0); + assert_eq!(tdigest.get_rank(0.99), Some(0.0)); + assert_eq!(tdigest.get_rank(1.0), Some(0.125)); + 
assert_eq!(tdigest.get_rank(2.0), Some(0.5)); + assert_eq!(tdigest.get_rank(3.0), Some(0.875)); + assert_eq!(tdigest.get_rank(3.01), Some(1.0)); +} From 09afcc9a7a3be24547efd050134a766fae8f6f27 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 15:23:08 +0800 Subject: [PATCH 04/26] impl merge and add tests Signed-off-by: tison --- Cargo.lock | 124 ++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 3 + src/tdigest/sketch.rs | 25 ++++++++- tests/tdigest_test.rs | 103 +++++++++++++++++++++++++++++------ 4 files changed, 236 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1a5fdbd..aa760e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,15 +2,139 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + [[package]] name = "datasketches" version = "0.1.0" dependencies = [ + "googletest", "mur3", ] +[[package]] +name = "googletest" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06597b7d02ee58b9a37f522785ac15b9e18c6b178747c4439a6c03fbb35ea753" +dependencies = [ + "googletest_macro", + "num-traits", + "regex", + "rustversion", +] + +[[package]] +name = "googletest_macro" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31d9f07c9c19b855faebf71637be3b43f8e13a518aece5d61a3beee7710b4ef" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + [[package]] name = "mur3" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "proc-macro2" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "syn" +version = "2.0.111" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" diff --git a/Cargo.toml b/Cargo.toml index ad4eeb6..897e9bc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,9 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] mur3 = { version = "0.1.0" } +[dev-dependencies] +googletest = { version = "0.14.2" } + [lints.rust] unknown_lints = "deny" unsafe_code = "deny" diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index b888736..58560ba 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -120,6 +120,27 @@ impl TDigest { self.centroids_weight + (self.buffer.len() as u64) } + /// Merge the given t-Digest into this one + pub fn merge(&mut self, other: &TDigest) { + if other.is_empty() { + return; + } + + let mut tmp = Vec::with_capacity( + self.centroids.len() + self.buffer.len() + other.centroids.len() + other.buffer.len(), + ); + for &v in &self.buffer { + tmp.push(Centroid { mean: v, weight: 1 }); + } + for &v in &other.buffer { + tmp.push(Centroid { mean: v, weight: 1 }); + } + for &c in &other.centroids { + tmp.push(c); + } + self.do_merge(tmp, self.buffer.len() as u64 + other.total_weight()) + } + /// Compute approximate normalized rank (from 0 to 1 inclusive) of the given value. /// /// Returns `None` if TDigest is empty. @@ -333,7 +354,7 @@ impl TDigest { for &v in &self.buffer { tmp.push(Centroid { mean: v, weight: 1 }); } - self.merge(tmp, self.buffer.len() as u64) + self.do_merge(tmp, self.buffer.len() as u64) } /// Merges the given buffer of centroids into this TDigest. 
@@ -344,7 +365,7 @@ impl TDigest { /// * `buffer` is generated from `self.buffer`, and thus: /// * No `NAN` values are present in `buffer`. /// * We should clear `self.buffer` after merging. - fn merge(&mut self, mut buffer: Vec, weight: u64) { + fn do_merge(&mut self, mut buffer: Vec, weight: u64) { buffer.extend(std::mem::take(&mut self.centroids)); buffer.sort_by(centroid_cmp); if self.reverse_merge { diff --git a/tests/tdigest_test.rs b/tests/tdigest_test.rs index 9d50e10..0847094 100644 --- a/tests/tdigest_test.rs +++ b/tests/tdigest_test.rs @@ -16,6 +16,8 @@ // under the License. use datasketches::tdigest::TDigest; +use googletest::assert_that; +use googletest::prelude::{eq, near}; #[test] fn test_empty() { @@ -52,24 +54,45 @@ fn test_one_value() { #[test] fn test_many_values() { + let n = 10000; + + let mut tdigest = TDigest::default(); + for i in 0..n { + tdigest.update(i as f64); + } + + assert!(!tdigest.is_empty()); + assert_eq!(tdigest.total_weight(), n); + assert_eq!(tdigest.min_value(), Some(0.0)); + assert_eq!(tdigest.max_value(), Some((n - 1) as f64)); + + assert_that!(tdigest.get_rank(0.0).unwrap(), near(0.0, 0.0001)); + assert_that!( + tdigest.get_rank((n / 4) as f64).unwrap(), + near(0.25, 0.0001) + ); + assert_that!(tdigest.get_rank((n / 2) as f64).unwrap(), near(0.5, 0.0001)); + assert_that!( + tdigest.get_rank((n * 3 / 4) as f64).unwrap(), + near(0.75, 0.0001) + ); + assert_that!(tdigest.get_rank(n as f64).unwrap(), eq(1.0)); + assert_that!(tdigest.get_quantile(0.0).unwrap(), eq(0.0)); + assert_that!( + tdigest.get_quantile(0.5).unwrap(), + near((n / 2) as f64, 0.03 * (n / 2) as f64) + ); + assert_that!( + tdigest.get_quantile(0.9).unwrap(), + near((n as f64) * 0.9, 0.01 * (n as f64) * 0.9) + ); + assert_that!( + tdigest.get_quantile(0.95).unwrap(), + near((n as f64) * 0.95, 0.01 * (n as f64) * 0.95) + ); + assert_that!(tdigest.get_quantile(1.0).unwrap(), eq((n - 1) as f64)); + // TODO: Later until PMF and CDF are supported - // const size_t n 
= 10000; - // tdigest_double td; - // for (size_t i = 0; i < n; ++i) td.update(i); - // REQUIRE_FALSE(td.is_empty()); - // REQUIRE(td.get_total_weight() == n); - // REQUIRE(td.get_min_value() == 0); - // REQUIRE(td.get_max_value() == n - 1); - // REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001)); - // REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001)); - // REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001)); - // REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001)); - // REQUIRE(td.get_rank(n) == 1); - // REQUIRE(td.get_quantile(0) == 0); - // REQUIRE(td.get_quantile(0.5) == Approx(n / 2).epsilon(0.03)); - // REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01)); - // REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01)); - // REQUIRE(td.get_quantile(1) == n - 1); // const double split_points[1] {n / 2}; // const auto pmf = td.get_PMF(split_points, 1); // REQUIRE(pmf.size() == 2); @@ -120,3 +143,49 @@ fn test_repeated_blocks() { assert_eq!(tdigest.get_rank(3.0), Some(0.875)); assert_eq!(tdigest.get_rank(3.01), Some(1.0)); } + +#[test] +fn test_merge_small() { + let mut td1 = TDigest::new(10); + td1.update(1.0); + td1.update(2.0); + let mut td2 = TDigest::new(10); + td2.update(2.0); + td2.update(3.0); + td1.merge(&td2); + assert_eq!(td1.min_value(), Some(1.0)); + assert_eq!(td1.max_value(), Some(3.0)); + assert_eq!(td1.total_weight(), 4); + assert_eq!(td1.get_rank(0.99), Some(0.0)); + assert_eq!(td1.get_rank(1.0), Some(0.125)); + assert_eq!(td1.get_rank(2.0), Some(0.5)); + assert_eq!(td1.get_rank(3.0), Some(0.875)); + assert_eq!(td1.get_rank(3.01), Some(1.0)); +} + +#[test] +fn test_merge_large() { + let n = 10000; + + let mut td1 = TDigest::new(10); + let mut td2 = TDigest::new(10); + let sup = n / 2; + for i in 0..sup { + td1.update(i as f64); + td2.update((sup + i) as f64); + } + td1.merge(&td2); + + assert_eq!(td1.total_weight(), n); + assert_eq!(td1.min_value(), Some(0.0)); + assert_eq!(td1.max_value(), Some((n - 
1) as f64)); + + assert_that!(td1.get_rank(0.0).unwrap(), near(0.0, 0.0001)); + assert_that!(td1.get_rank((n / 4) as f64).unwrap(), near(0.25, 0.0001)); + assert_that!(td1.get_rank((n / 2) as f64).unwrap(), near(0.5, 0.0001)); + assert_that!( + td1.get_rank((n * 3 / 4) as f64).unwrap(), + near(0.75, 0.0001) + ); + assert_that!(td1.get_rank(n as f64).unwrap(), eq(1.0)); +} From a3271d718c6211eec6d115a50c9cbf763134a0e8 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 15:55:54 +0800 Subject: [PATCH 05/26] demo iter Signed-off-by: tison --- src/tdigest/iter.rs | 50 ++++++++++++++++++++++++++++++++++++ src/tdigest/mod.rs | 39 +++++++++++++++++++++++++++- src/tdigest/serialization.rs | 22 ++++++++++++++++ src/tdigest/sketch.rs | 42 +++--------------------------- 4 files changed, 114 insertions(+), 39 deletions(-) create mode 100644 src/tdigest/iter.rs create mode 100644 src/tdigest/serialization.rs diff --git a/src/tdigest/iter.rs b/src/tdigest/iter.rs new file mode 100644 index 0000000..7d12381 --- /dev/null +++ b/src/tdigest/iter.rs @@ -0,0 +1,50 @@ +use crate::tdigest::{Centroid, TDigest}; + +impl IntoIterator for TDigest { + type Item = (f64, u64); + type IntoIter = TDigestIntoIter; + + fn into_iter(self) -> Self::IntoIter { + TDigestIntoIter { + centroids: self.centroids, + index: 0, + } + } +} + +/// Iterator over the centroids of a TDigest. 
+pub struct TDigestIntoIter { + centroids: Vec, + index: usize, +} + +impl Iterator for TDigestIntoIter { + type Item = (f64, u64); + + fn next(&mut self) -> Option { + if self.index < self.centroids.len() { + let centroid = self.centroids[self.index]; + self.index += 1; + Some((centroid.mean, centroid.weight)) + } else { + None + } + } +} + +impl FromIterator<(f64, u64)> for TDigest { + fn from_iter>(iter: I) -> Self { + let iter = iter.into_iter(); + + let mut tmp = Vec::with_capacity(iter.size_hint().0); + let mut total_weight = 0; + for (mean, weight) in iter { + tmp.push(Centroid { mean, weight }); + total_weight += weight; + } + + let mut tdigest = TDigest::default(); + tdigest.do_merge(tmp, total_weight); + tdigest + } +} diff --git a/src/tdigest/mod.rs b/src/tdigest/mod.rs index c0818fc..32b55b5 100644 --- a/src/tdigest/mod.rs +++ b/src/tdigest/mod.rs @@ -48,5 +48,42 @@ //! //! [paper]: https://arxiv.org/abs/1902.04023 +mod iter; +mod serialization; mod sketch; -pub use self::sketch::TDigest; + +/// T-Digest sketch for estimating quantiles and ranks. +/// +/// See the [module documentation](self) for more details. 
+#[derive(Debug, Clone, PartialEq)] +pub struct TDigest { + k: usize, + + reverse_merge: bool, + min: f64, + max: f64, + + centroids: Vec, + centroids_weight: u64, + centroids_capacity: usize, + buffer: Vec, +} + +#[derive(Debug, Clone, Copy, PartialEq)] +struct Centroid { + mean: f64, + weight: u64, +} + +impl Centroid { + fn add(&mut self, other: Centroid) { + if self.weight != 0 { + let total_weight = self.weight + other.weight; + self.mean += (other.weight as f64) * (other.mean - self.mean) / (total_weight as f64); + self.weight = total_weight; + } else { + self.mean = other.mean; + self.weight = other.weight; + } + } +} diff --git a/src/tdigest/serialization.rs b/src/tdigest/serialization.rs new file mode 100644 index 0000000..5597ddd --- /dev/null +++ b/src/tdigest/serialization.rs @@ -0,0 +1,22 @@ +use crate::tdigest::TDigest; + +const PREAMBLE_LONGS_EMPTY_OR_SINGLE: u8 = 1; +const PREAMBLE_LONGS_MULTIPLE: u8 = 2; +const SERIAL_VERSION: u8 = 1; +const TDIGEST_FAMILY_ID: u8 = 20; + +impl TDigest { + /// Serializes this TDigest to bytes. + pub fn serialize(&mut self) -> Vec { + self.compress(); + let mut bytes = vec![]; + bytes.push(match self.total_weight() { + 0 => PREAMBLE_LONGS_EMPTY_OR_SINGLE, + 1 => PREAMBLE_LONGS_EMPTY_OR_SINGLE, + _ => PREAMBLE_LONGS_MULTIPLE, + }); + bytes.push(SERIAL_VERSION); + bytes.push(TDIGEST_FAMILY_ID); + bytes + } +} diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 58560ba..e33a17a 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -17,24 +17,9 @@ use std::convert::identity; -const BUFFER_MULTIPLIER: usize = 4; +use crate::tdigest::{Centroid, TDigest}; -/// T-Digest sketch for estimating quantiles and ranks. -/// -/// See the [module documentation](super) for more details. 
-#[derive(Debug, Clone, PartialEq)] -pub struct TDigest { - k: usize, - - reverse_merge: bool, - min: f64, - max: f64, - - centroids: Vec, - centroids_weight: u64, - centroids_capacity: usize, - buffer: Vec, -} +const BUFFER_MULTIPLIER: usize = 4; impl Default for TDigest { fn default() -> Self { @@ -346,7 +331,7 @@ impl TDigest { } /// Process buffered values and merge centroids if needed. - fn compress(&mut self) { + pub(super) fn compress(&mut self) { if self.buffer.is_empty() { return; } @@ -365,7 +350,7 @@ impl TDigest { /// * `buffer` is generated from `self.buffer`, and thus: /// * No `NAN` values are present in `buffer`. /// * We should clear `self.buffer` after merging. - fn do_merge(&mut self, mut buffer: Vec, weight: u64) { + pub(super) fn do_merge(&mut self, mut buffer: Vec, weight: u64) { buffer.extend(std::mem::take(&mut self.centroids)); buffer.sort_by(centroid_cmp); if self.reverse_merge { @@ -422,25 +407,6 @@ fn centroid_cmp(a: &Centroid, b: &Centroid) -> std::cmp::Ordering { } } -#[derive(Debug, Clone, Copy, PartialEq)] -struct Centroid { - mean: f64, - weight: u64, -} - -impl Centroid { - fn add(&mut self, other: Centroid) { - if self.weight != 0 { - let total_weight = self.weight + other.weight; - self.mean += (other.weight as f64) * (other.mean - self.mean) / (total_weight as f64); - self.weight = total_weight; - } else { - self.mean = other.mean; - self.weight = other.weight; - } - } -} - /// Generates cluster sizes proportional to `q*(1-q)`. 
/// /// The use of a normalizing function results in a strictly bounded number of clusters no matter From 81ba5af24af430b8cb37333cb654012fb524e5c2 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 16:12:36 +0800 Subject: [PATCH 06/26] impl ser Signed-off-by: tison --- Cargo.lock | 7 ++++ Cargo.toml | 1 + src/tdigest/iter.rs | 50 ---------------------------- src/tdigest/mod.rs | 3 +- src/tdigest/serialization.rs | 63 ++++++++++++++++++++++++++++++++++++ src/tdigest/sketch.rs | 8 ++--- 6 files changed, 76 insertions(+), 56 deletions(-) delete mode 100644 src/tdigest/iter.rs diff --git a/Cargo.lock b/Cargo.lock index aa760e8..a2f2c69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,10 +17,17 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "datasketches" version = "0.1.0" dependencies = [ + "byteorder", "googletest", "mur3", ] diff --git a/Cargo.toml b/Cargo.toml index 897e9bc..90961f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [dependencies] +byteorder = { version = "1.5.0" } mur3 = { version = "0.1.0" } [dev-dependencies] diff --git a/src/tdigest/iter.rs b/src/tdigest/iter.rs deleted file mode 100644 index 7d12381..0000000 --- a/src/tdigest/iter.rs +++ /dev/null @@ -1,50 +0,0 @@ -use crate::tdigest::{Centroid, TDigest}; - -impl IntoIterator for TDigest { - type Item = (f64, u64); - type IntoIter = TDigestIntoIter; - - fn into_iter(self) -> Self::IntoIter { - TDigestIntoIter { - centroids: self.centroids, - index: 0, - } - } -} - -/// Iterator over the centroids of a TDigest. 
-pub struct TDigestIntoIter { - centroids: Vec, - index: usize, -} - -impl Iterator for TDigestIntoIter { - type Item = (f64, u64); - - fn next(&mut self) -> Option { - if self.index < self.centroids.len() { - let centroid = self.centroids[self.index]; - self.index += 1; - Some((centroid.mean, centroid.weight)) - } else { - None - } - } -} - -impl FromIterator<(f64, u64)> for TDigest { - fn from_iter>(iter: I) -> Self { - let iter = iter.into_iter(); - - let mut tmp = Vec::with_capacity(iter.size_hint().0); - let mut total_weight = 0; - for (mean, weight) in iter { - tmp.push(Centroid { mean, weight }); - total_weight += weight; - } - - let mut tdigest = TDigest::default(); - tdigest.do_merge(tmp, total_weight); - tdigest - } -} diff --git a/src/tdigest/mod.rs b/src/tdigest/mod.rs index 32b55b5..26c9342 100644 --- a/src/tdigest/mod.rs +++ b/src/tdigest/mod.rs @@ -48,7 +48,6 @@ //! //! [paper]: https://arxiv.org/abs/1902.04023 -mod iter; mod serialization; mod sketch; @@ -57,7 +56,7 @@ mod sketch; /// See the [module documentation](self) for more details. #[derive(Debug, Clone, PartialEq)] pub struct TDigest { - k: usize, + k: u16, reverse_merge: bool, min: f64, diff --git a/src/tdigest/serialization.rs b/src/tdigest/serialization.rs index 5597ddd..5092fbb 100644 --- a/src/tdigest/serialization.rs +++ b/src/tdigest/serialization.rs @@ -1,14 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use byteorder::{ByteOrder, LE}; + +use crate::error::SerdeError; use crate::tdigest::TDigest; const PREAMBLE_LONGS_EMPTY_OR_SINGLE: u8 = 1; const PREAMBLE_LONGS_MULTIPLE: u8 = 2; const SERIAL_VERSION: u8 = 1; const TDIGEST_FAMILY_ID: u8 = 20; +const FLAGS_IS_EMPTY: u8 = 1 << 0; +const FLAGS_IS_SINGLE_VALUE: u8 = 1 << 1; +const FLAGS_REVERSE_MERGE: u8 = 1 << 2; impl TDigest { /// Serializes this TDigest to bytes. pub fn serialize(&mut self) -> Vec { self.compress(); + let mut bytes = vec![]; bytes.push(match self.total_weight() { 0 => PREAMBLE_LONGS_EMPTY_OR_SINGLE, @@ -17,6 +41,45 @@ impl TDigest { }); bytes.push(SERIAL_VERSION); bytes.push(TDIGEST_FAMILY_ID); + LE::write_u16(&mut bytes, self.k); + bytes.push({ + let mut flags = 0; + if self.is_empty() { + flags |= FLAGS_IS_EMPTY; + } + if self.is_single_value() { + flags |= FLAGS_IS_SINGLE_VALUE; + } + if self.reverse_merge { + flags |= FLAGS_REVERSE_MERGE; + } + flags + }); + LE::write_u16(&mut bytes, 0); // unused + if self.is_empty() { + return bytes; + } + if self.is_single_value() { + LE::write_f64(&mut bytes, self.min); + return bytes; + } + LE::write_u32(&mut bytes, self.centroids.len() as u32); + LE::write_u32(&mut bytes, 0); // unused + LE::write_f64(&mut bytes, self.min); + LE::write_f64(&mut bytes, self.max); + for centroid in &self.centroids { + LE::write_f64(&mut bytes, centroid.mean); + LE::write_u64(&mut bytes, centroid.weight); + } bytes } + + /// Deserializes a TDigest from bytes. 
+ pub fn deserialize(_bytes: &[u8]) -> Result { + unimplemented!("Deserialization is not yet implemented"); + } + + fn is_single_value(&self) -> bool { + self.total_weight() == 1 + } } diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index e33a17a..bfeef1d 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -29,18 +29,18 @@ impl Default for TDigest { impl TDigest { /// The default value of K if one is not specified. - pub const DEFAULT_K: usize = 200; + pub const DEFAULT_K: u16 = 200; /// Creates a tdigest instance with the given value of k. /// /// # Panics /// /// If k is less than 10 - pub fn new(k: usize) -> Self { + pub fn new(k: u16) -> Self { assert!(k >= 10, "k must be at least 10"); let fudge = if k < 30 { 30 } else { 10 }; - let centroids_capacity = (k * 2) + fudge; + let centroids_capacity = (k as usize * 2) + fudge; let centroids = Vec::with_capacity(centroids_capacity); let buffer = Vec::with_capacity(centroids_capacity * BUFFER_MULTIPLIER); @@ -73,7 +73,7 @@ impl TDigest { } /// Returns parameter k (compression) that was used to configure this TDigest. - pub fn k(&self) -> usize { + pub fn k(&self) -> u16 { self.k } From a213242a33c7f0610d3b2e75ad81cc831bb7c116 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 16:50:45 +0800 Subject: [PATCH 07/26] impl de Signed-off-by: tison --- src/tdigest/serialization.rs | 118 +++++++++++++++++++++++++++++++++-- src/tdigest/sketch.rs | 33 ++++++++-- 2 files changed, 141 insertions(+), 10 deletions(-) diff --git a/src/tdigest/serialization.rs b/src/tdigest/serialization.rs index 5092fbb..b0dac3d 100644 --- a/src/tdigest/serialization.rs +++ b/src/tdigest/serialization.rs @@ -15,10 +15,11 @@ // specific language governing permissions and limitations // under the License. 
-use byteorder::{ByteOrder, LE}; +use byteorder::{ByteOrder, LE, ReadBytesExt}; +use std::io::Cursor; use crate::error::SerdeError; -use crate::tdigest::TDigest; +use crate::tdigest::{Centroid, TDigest}; const PREAMBLE_LONGS_EMPTY_OR_SINGLE: u8 = 1; const PREAMBLE_LONGS_MULTIPLE: u8 = 2; @@ -75,8 +76,117 @@ impl TDigest { } /// Deserializes a TDigest from bytes. - pub fn deserialize(_bytes: &[u8]) -> Result { - unimplemented!("Deserialization is not yet implemented"); + /// + /// Supports reading compact format with (float, int) centroids as opposed to (double, long) to + /// represent (mean, weight). [^1] + /// + /// [^1]: This is to support reading the `tdigest` format from the C++ implementation. + pub fn deserialize(bytes: &[u8], is_float: bool) -> Result { + let make_error = |_| SerdeError::InsufficientData("tdigest".to_string()); + let mut cursor = Cursor::new(bytes); + + let preamble_longs = cursor.read_u8().map_err(make_error)?; + let serial_version = cursor.read_u8().map_err(make_error)?; + let family_id = cursor.read_u8().map_err(make_error)?; + if family_id != TDIGEST_FAMILY_ID { + // TODO: Support reading format of the reference implementation + return Err(SerdeError::InvalidFamily(format!( + "expected {} (TDigest), got {}", + TDIGEST_FAMILY_ID, family_id + ))); + } + if serial_version != SERIAL_VERSION { + return Err(SerdeError::UnsupportedVersion(format!( + "expected {}, got {}", + SERIAL_VERSION, serial_version + ))); + } + let k = cursor.read_u16::().map_err(make_error)?; + let flags = cursor.read_u8().map_err(make_error)?; + let is_empty = (flags & FLAGS_IS_EMPTY) != 0; + let is_single_value = (flags & FLAGS_IS_SINGLE_VALUE) != 0; + let expected_preamble_longs = if is_empty || is_single_value { + PREAMBLE_LONGS_EMPTY_OR_SINGLE + } else { + PREAMBLE_LONGS_MULTIPLE + }; + if preamble_longs != expected_preamble_longs { + return Err(SerdeError::MalformedData(format!( + "expected preamble_longs to be {}, got {}", + expected_preamble_longs, 
preamble_longs + ))); + } + cursor.read_u16::().map_err(make_error)?; // unused + if is_empty { + return Ok(TDigest::new(k)); + } + let reverse_merge = (flags & FLAGS_REVERSE_MERGE) != 0; + if is_single_value { + let value = if is_float { + cursor.read_f32::().map_err(make_error)? as f64 + } else { + cursor.read_f64::().map_err(make_error)? + }; + return Ok(TDigest::make( + k, + reverse_merge, + value, + value, + vec![Centroid { + mean: value, + weight: 1, + }], + 1, + vec![], + )); + } + let num_centroids = cursor.read_u32::().map_err(make_error)? as usize; + let num_buffered = cursor.read_u32::().map_err(make_error)? as usize; + let (min, max) = if is_float { + ( + cursor.read_f32::().map_err(make_error)? as f64, + cursor.read_f32::().map_err(make_error)? as f64, + ) + } else { + ( + cursor.read_f64::().map_err(make_error)?, + cursor.read_f64::().map_err(make_error)?, + ) + }; + let mut centroids = Vec::with_capacity(num_centroids); + let mut centroids_weight = 0; + for _ in 0..num_centroids { + let (mean, weight) = if is_float { + ( + cursor.read_f32::().map_err(make_error)? as f64, + cursor.read_u32::().map_err(make_error)? as u64, + ) + } else { + ( + cursor.read_f64::().map_err(make_error)?, + cursor.read_u64::().map_err(make_error)?, + ) + }; + centroids_weight += weight; + centroids.push(Centroid { mean, weight }); + } + let mut buffer = Vec::with_capacity(num_buffered); + for _ in 0..num_buffered { + buffer.push(if is_float { + cursor.read_f32::().map_err(make_error)? as f64 + } else { + cursor.read_f64::().map_err(make_error)? 
+ }) + } + Ok(TDigest::make( + k, + reverse_merge, + min, + max, + centroids, + centroids_weight, + buffer, + )) } fn is_single_value(&self) -> bool { diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index bfeef1d..20b366e 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -37,21 +37,42 @@ impl TDigest { /// /// If k is less than 10 pub fn new(k: u16) -> Self { + Self::make( + k, + false, + f64::INFINITY, + f64::NEG_INFINITY, + vec![], + 0, + vec![], + ) + } + + // for deserialization + pub(super) fn make( + k: u16, + reverse_merge: bool, + min: f64, + max: f64, + mut centroids: Vec, + centroids_weight: u64, + mut buffer: Vec, + ) -> Self { assert!(k >= 10, "k must be at least 10"); let fudge = if k < 30 { 30 } else { 10 }; let centroids_capacity = (k as usize * 2) + fudge; - let centroids = Vec::with_capacity(centroids_capacity); - let buffer = Vec::with_capacity(centroids_capacity * BUFFER_MULTIPLIER); + centroids.reserve(centroids_capacity); + buffer.reserve(centroids_capacity * BUFFER_MULTIPLIER); TDigest { k, - reverse_merge: false, - min: f64::INFINITY, - max: f64::NEG_INFINITY, + reverse_merge, + min, + max, centroids, - centroids_weight: 0, + centroids_weight, centroids_capacity, buffer, } From 5cc6e213548faf576e4038c88a32963573968624 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 16:56:17 +0800 Subject: [PATCH 08/26] fine tune deserialize tags Signed-off-by: tison --- src/tdigest/serialization.rs | 55 ++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/src/tdigest/serialization.rs b/src/tdigest/serialization.rs index b0dac3d..f2f1f14 100644 --- a/src/tdigest/serialization.rs +++ b/src/tdigest/serialization.rs @@ -82,12 +82,12 @@ impl TDigest { /// /// [^1]: This is to support reading the `tdigest` format from the C++ implementation. 
pub fn deserialize(bytes: &[u8], is_float: bool) -> Result { - let make_error = |_| SerdeError::InsufficientData("tdigest".to_string()); + let make_error = |tag: &'static str| move |_| SerdeError::InsufficientData(tag.to_string()); let mut cursor = Cursor::new(bytes); - let preamble_longs = cursor.read_u8().map_err(make_error)?; - let serial_version = cursor.read_u8().map_err(make_error)?; - let family_id = cursor.read_u8().map_err(make_error)?; + let preamble_longs = cursor.read_u8().map_err(make_error("preamble_longs"))?; + let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; + let family_id = cursor.read_u8().map_err(make_error("family_id"))?; if family_id != TDIGEST_FAMILY_ID { // TODO: Support reading format of the reference implementation return Err(SerdeError::InvalidFamily(format!( @@ -101,8 +101,8 @@ impl TDigest { SERIAL_VERSION, serial_version ))); } - let k = cursor.read_u16::().map_err(make_error)?; - let flags = cursor.read_u8().map_err(make_error)?; + let k = cursor.read_u16::().map_err(make_error("k"))?; + let flags = cursor.read_u8().map_err(make_error("flags"))?; let is_empty = (flags & FLAGS_IS_EMPTY) != 0; let is_single_value = (flags & FLAGS_IS_SINGLE_VALUE) != 0; let expected_preamble_longs = if is_empty || is_single_value { @@ -116,16 +116,21 @@ impl TDigest { expected_preamble_longs, preamble_longs ))); } - cursor.read_u16::().map_err(make_error)?; // unused + cursor.read_u16::().map_err(make_error(""))?; // unused if is_empty { return Ok(TDigest::new(k)); } + let reverse_merge = (flags & FLAGS_REVERSE_MERGE) != 0; if is_single_value { let value = if is_float { - cursor.read_f32::().map_err(make_error)? as f64 + cursor + .read_f32::() + .map_err(make_error("single_value"))? as f64 } else { - cursor.read_f64::().map_err(make_error)? + cursor + .read_f64::() + .map_err(make_error("single_value"))? 
}; return Ok(TDigest::make( k, @@ -140,17 +145,21 @@ impl TDigest { vec![], )); } - let num_centroids = cursor.read_u32::().map_err(make_error)? as usize; - let num_buffered = cursor.read_u32::().map_err(make_error)? as usize; + let num_centroids = cursor + .read_u32::() + .map_err(make_error("num_centroids"))? as usize; + let num_buffered = cursor + .read_u32::() + .map_err(make_error("num_buffered"))? as usize; let (min, max) = if is_float { ( - cursor.read_f32::().map_err(make_error)? as f64, - cursor.read_f32::().map_err(make_error)? as f64, + cursor.read_f32::().map_err(make_error("min"))? as f64, + cursor.read_f32::().map_err(make_error("max"))? as f64, ) } else { ( - cursor.read_f64::().map_err(make_error)?, - cursor.read_f64::().map_err(make_error)?, + cursor.read_f64::().map_err(make_error("min"))?, + cursor.read_f64::().map_err(make_error("max"))?, ) }; let mut centroids = Vec::with_capacity(num_centroids); @@ -158,13 +167,13 @@ impl TDigest { for _ in 0..num_centroids { let (mean, weight) = if is_float { ( - cursor.read_f32::().map_err(make_error)? as f64, - cursor.read_u32::().map_err(make_error)? as u64, + cursor.read_f32::().map_err(make_error("mean"))? as f64, + cursor.read_u32::().map_err(make_error("weight"))? as u64, ) } else { ( - cursor.read_f64::().map_err(make_error)?, - cursor.read_u64::().map_err(make_error)?, + cursor.read_f64::().map_err(make_error("mean"))?, + cursor.read_u64::().map_err(make_error("weight"))?, ) }; centroids_weight += weight; @@ -173,9 +182,13 @@ impl TDigest { let mut buffer = Vec::with_capacity(num_buffered); for _ in 0..num_buffered { buffer.push(if is_float { - cursor.read_f32::().map_err(make_error)? as f64 + cursor + .read_f32::() + .map_err(make_error("buffered_value"))? as f64 } else { - cursor.read_f64::().map_err(make_error)? + cursor + .read_f64::() + .map_err(make_error("buffered_value"))? 
}) } Ok(TDigest::make( From d90491d3feefab9952d5b063720d208832b4d2e4 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 18:23:07 +0800 Subject: [PATCH 09/26] define code in one place Signed-off-by: tison --- src/tdigest/mod.rs | 19 +--- src/tdigest/serialization.rs | 198 ++------------------------------- src/tdigest/sketch.rs | 205 ++++++++++++++++++++++++++++++++++- 3 files changed, 210 insertions(+), 212 deletions(-) diff --git a/src/tdigest/mod.rs b/src/tdigest/mod.rs index 26c9342..f774bf3 100644 --- a/src/tdigest/mod.rs +++ b/src/tdigest/mod.rs @@ -49,24 +49,9 @@ //! [paper]: https://arxiv.org/abs/1902.04023 mod serialization; -mod sketch; - -/// T-Digest sketch for estimating quantiles and ranks. -/// -/// See the [module documentation](self) for more details. -#[derive(Debug, Clone, PartialEq)] -pub struct TDigest { - k: u16, - - reverse_merge: bool, - min: f64, - max: f64, - centroids: Vec, - centroids_weight: u64, - centroids_capacity: usize, - buffer: Vec, -} +mod sketch; +pub use self::sketch::TDigest; #[derive(Debug, Clone, Copy, PartialEq)] struct Centroid { diff --git a/src/tdigest/serialization.rs b/src/tdigest/serialization.rs index f2f1f14..6d6ef34 100644 --- a/src/tdigest/serialization.rs +++ b/src/tdigest/serialization.rs @@ -15,194 +15,10 @@ // specific language governing permissions and limitations // under the License. -use byteorder::{ByteOrder, LE, ReadBytesExt}; -use std::io::Cursor; - -use crate::error::SerdeError; -use crate::tdigest::{Centroid, TDigest}; - -const PREAMBLE_LONGS_EMPTY_OR_SINGLE: u8 = 1; -const PREAMBLE_LONGS_MULTIPLE: u8 = 2; -const SERIAL_VERSION: u8 = 1; -const TDIGEST_FAMILY_ID: u8 = 20; -const FLAGS_IS_EMPTY: u8 = 1 << 0; -const FLAGS_IS_SINGLE_VALUE: u8 = 1 << 1; -const FLAGS_REVERSE_MERGE: u8 = 1 << 2; - -impl TDigest { - /// Serializes this TDigest to bytes. 
- pub fn serialize(&mut self) -> Vec { - self.compress(); - - let mut bytes = vec![]; - bytes.push(match self.total_weight() { - 0 => PREAMBLE_LONGS_EMPTY_OR_SINGLE, - 1 => PREAMBLE_LONGS_EMPTY_OR_SINGLE, - _ => PREAMBLE_LONGS_MULTIPLE, - }); - bytes.push(SERIAL_VERSION); - bytes.push(TDIGEST_FAMILY_ID); - LE::write_u16(&mut bytes, self.k); - bytes.push({ - let mut flags = 0; - if self.is_empty() { - flags |= FLAGS_IS_EMPTY; - } - if self.is_single_value() { - flags |= FLAGS_IS_SINGLE_VALUE; - } - if self.reverse_merge { - flags |= FLAGS_REVERSE_MERGE; - } - flags - }); - LE::write_u16(&mut bytes, 0); // unused - if self.is_empty() { - return bytes; - } - if self.is_single_value() { - LE::write_f64(&mut bytes, self.min); - return bytes; - } - LE::write_u32(&mut bytes, self.centroids.len() as u32); - LE::write_u32(&mut bytes, 0); // unused - LE::write_f64(&mut bytes, self.min); - LE::write_f64(&mut bytes, self.max); - for centroid in &self.centroids { - LE::write_f64(&mut bytes, centroid.mean); - LE::write_u64(&mut bytes, centroid.weight); - } - bytes - } - - /// Deserializes a TDigest from bytes. - /// - /// Supports reading compact format with (float, int) centroids as opposed to (double, long) to - /// represent (mean, weight). [^1] - /// - /// [^1]: This is to support reading the `tdigest` format from the C++ implementation. 
- pub fn deserialize(bytes: &[u8], is_float: bool) -> Result { - let make_error = |tag: &'static str| move |_| SerdeError::InsufficientData(tag.to_string()); - let mut cursor = Cursor::new(bytes); - - let preamble_longs = cursor.read_u8().map_err(make_error("preamble_longs"))?; - let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; - let family_id = cursor.read_u8().map_err(make_error("family_id"))?; - if family_id != TDIGEST_FAMILY_ID { - // TODO: Support reading format of the reference implementation - return Err(SerdeError::InvalidFamily(format!( - "expected {} (TDigest), got {}", - TDIGEST_FAMILY_ID, family_id - ))); - } - if serial_version != SERIAL_VERSION { - return Err(SerdeError::UnsupportedVersion(format!( - "expected {}, got {}", - SERIAL_VERSION, serial_version - ))); - } - let k = cursor.read_u16::().map_err(make_error("k"))?; - let flags = cursor.read_u8().map_err(make_error("flags"))?; - let is_empty = (flags & FLAGS_IS_EMPTY) != 0; - let is_single_value = (flags & FLAGS_IS_SINGLE_VALUE) != 0; - let expected_preamble_longs = if is_empty || is_single_value { - PREAMBLE_LONGS_EMPTY_OR_SINGLE - } else { - PREAMBLE_LONGS_MULTIPLE - }; - if preamble_longs != expected_preamble_longs { - return Err(SerdeError::MalformedData(format!( - "expected preamble_longs to be {}, got {}", - expected_preamble_longs, preamble_longs - ))); - } - cursor.read_u16::().map_err(make_error(""))?; // unused - if is_empty { - return Ok(TDigest::new(k)); - } - - let reverse_merge = (flags & FLAGS_REVERSE_MERGE) != 0; - if is_single_value { - let value = if is_float { - cursor - .read_f32::() - .map_err(make_error("single_value"))? as f64 - } else { - cursor - .read_f64::() - .map_err(make_error("single_value"))? - }; - return Ok(TDigest::make( - k, - reverse_merge, - value, - value, - vec![Centroid { - mean: value, - weight: 1, - }], - 1, - vec![], - )); - } - let num_centroids = cursor - .read_u32::() - .map_err(make_error("num_centroids"))? 
as usize; - let num_buffered = cursor - .read_u32::() - .map_err(make_error("num_buffered"))? as usize; - let (min, max) = if is_float { - ( - cursor.read_f32::().map_err(make_error("min"))? as f64, - cursor.read_f32::().map_err(make_error("max"))? as f64, - ) - } else { - ( - cursor.read_f64::().map_err(make_error("min"))?, - cursor.read_f64::().map_err(make_error("max"))?, - ) - }; - let mut centroids = Vec::with_capacity(num_centroids); - let mut centroids_weight = 0; - for _ in 0..num_centroids { - let (mean, weight) = if is_float { - ( - cursor.read_f32::().map_err(make_error("mean"))? as f64, - cursor.read_u32::().map_err(make_error("weight"))? as u64, - ) - } else { - ( - cursor.read_f64::().map_err(make_error("mean"))?, - cursor.read_u64::().map_err(make_error("weight"))?, - ) - }; - centroids_weight += weight; - centroids.push(Centroid { mean, weight }); - } - let mut buffer = Vec::with_capacity(num_buffered); - for _ in 0..num_buffered { - buffer.push(if is_float { - cursor - .read_f32::() - .map_err(make_error("buffered_value"))? as f64 - } else { - cursor - .read_f64::() - .map_err(make_error("buffered_value"))? - }) - } - Ok(TDigest::make( - k, - reverse_merge, - min, - max, - centroids, - centroids_weight, - buffer, - )) - } - - fn is_single_value(&self) -> bool { - self.total_weight() == 1 - } -} +pub(super) const PREAMBLE_LONGS_EMPTY_OR_SINGLE: u8 = 1; +pub(super) const PREAMBLE_LONGS_MULTIPLE: u8 = 2; +pub(super) const SERIAL_VERSION: u8 = 1; +pub(super) const TDIGEST_FAMILY_ID: u8 = 20; +pub(super) const FLAGS_IS_EMPTY: u8 = 1 << 0; +pub(super) const FLAGS_IS_SINGLE_VALUE: u8 = 1 << 1; +pub(super) const FLAGS_REVERSE_MERGE: u8 = 1 << 2; diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 20b366e..6f84eea 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -15,12 +15,33 @@ // specific language governing permissions and limitations // under the License. 
+use byteorder::{ByteOrder, LE, ReadBytesExt}; use std::convert::identity; +use std::io::Cursor; -use crate::tdigest::{Centroid, TDigest}; +use crate::error::SerdeError; +use crate::tdigest::Centroid; +use crate::tdigest::serialization::*; const BUFFER_MULTIPLIER: usize = 4; +/// T-Digest sketch for estimating quantiles and ranks. +/// +/// See the [module documentation](self) for more details. +#[derive(Debug, Clone, PartialEq)] +pub struct TDigest { + k: u16, + + reverse_merge: bool, + min: f64, + max: f64, + + centroids: Vec, + centroids_weight: u64, + centroids_capacity: usize, + buffer: Vec, +} + impl Default for TDigest { fn default() -> Self { TDigest::new(Self::DEFAULT_K) @@ -49,7 +70,7 @@ impl TDigest { } // for deserialization - pub(super) fn make( + fn make( k: u16, reverse_merge: bool, min: f64, @@ -351,8 +372,184 @@ impl TDigest { )) } + /// Serializes this TDigest to bytes. + pub fn serialize(&mut self) -> Vec { + self.compress(); + + let mut bytes = vec![]; + bytes.push(match self.total_weight() { + 0 => PREAMBLE_LONGS_EMPTY_OR_SINGLE, + 1 => PREAMBLE_LONGS_EMPTY_OR_SINGLE, + _ => PREAMBLE_LONGS_MULTIPLE, + }); + bytes.push(SERIAL_VERSION); + bytes.push(TDIGEST_FAMILY_ID); + LE::write_u16(&mut bytes, self.k); + bytes.push({ + let mut flags = 0; + if self.is_empty() { + flags |= FLAGS_IS_EMPTY; + } + if self.is_single_value() { + flags |= FLAGS_IS_SINGLE_VALUE; + } + if self.reverse_merge { + flags |= FLAGS_REVERSE_MERGE; + } + flags + }); + LE::write_u16(&mut bytes, 0); // unused + if self.is_empty() { + return bytes; + } + if self.is_single_value() { + LE::write_f64(&mut bytes, self.min); + return bytes; + } + LE::write_u32(&mut bytes, self.centroids.len() as u32); + LE::write_u32(&mut bytes, 0); // unused + LE::write_f64(&mut bytes, self.min); + LE::write_f64(&mut bytes, self.max); + for centroid in &self.centroids { + LE::write_f64(&mut bytes, centroid.mean); + LE::write_u64(&mut bytes, centroid.weight); + } + bytes + } + + /// Deserializes a 
TDigest from bytes. + /// + /// Supports reading compact format with (float, int) centroids as opposed to (double, long) to + /// represent (mean, weight). [^1] + /// + /// [^1]: This is to support reading the `tdigest` format from the C++ implementation. + pub fn deserialize(bytes: &[u8], is_float: bool) -> Result { + let make_error = |tag: &'static str| move |_| SerdeError::InsufficientData(tag.to_string()); + let mut cursor = Cursor::new(bytes); + + let preamble_longs = cursor.read_u8().map_err(make_error("preamble_longs"))?; + let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; + let family_id = cursor.read_u8().map_err(make_error("family_id"))?; + if family_id != TDIGEST_FAMILY_ID { + // TODO: Support reading format of the reference implementation + return Err(SerdeError::InvalidFamily(format!( + "expected {} (TDigest), got {}", + TDIGEST_FAMILY_ID, family_id + ))); + } + if serial_version != SERIAL_VERSION { + return Err(SerdeError::UnsupportedVersion(format!( + "expected {}, got {}", + SERIAL_VERSION, serial_version + ))); + } + let k = cursor.read_u16::().map_err(make_error("k"))?; + let flags = cursor.read_u8().map_err(make_error("flags"))?; + let is_empty = (flags & FLAGS_IS_EMPTY) != 0; + let is_single_value = (flags & FLAGS_IS_SINGLE_VALUE) != 0; + let expected_preamble_longs = if is_empty || is_single_value { + PREAMBLE_LONGS_EMPTY_OR_SINGLE + } else { + PREAMBLE_LONGS_MULTIPLE + }; + if preamble_longs != expected_preamble_longs { + return Err(SerdeError::MalformedData(format!( + "expected preamble_longs to be {}, got {}", + expected_preamble_longs, preamble_longs + ))); + } + cursor.read_u16::().map_err(make_error(""))?; // unused + if is_empty { + return Ok(TDigest::new(k)); + } + + let reverse_merge = (flags & FLAGS_REVERSE_MERGE) != 0; + if is_single_value { + let value = if is_float { + cursor + .read_f32::() + .map_err(make_error("single_value"))? 
as f64 + } else { + cursor + .read_f64::() + .map_err(make_error("single_value"))? + }; + return Ok(TDigest::make( + k, + reverse_merge, + value, + value, + vec![Centroid { + mean: value, + weight: 1, + }], + 1, + vec![], + )); + } + let num_centroids = cursor + .read_u32::() + .map_err(make_error("num_centroids"))? as usize; + let num_buffered = cursor + .read_u32::() + .map_err(make_error("num_buffered"))? as usize; + let (min, max) = if is_float { + ( + cursor.read_f32::().map_err(make_error("min"))? as f64, + cursor.read_f32::().map_err(make_error("max"))? as f64, + ) + } else { + ( + cursor.read_f64::().map_err(make_error("min"))?, + cursor.read_f64::().map_err(make_error("max"))?, + ) + }; + let mut centroids = Vec::with_capacity(num_centroids); + let mut centroids_weight = 0; + for _ in 0..num_centroids { + let (mean, weight) = if is_float { + ( + cursor.read_f32::().map_err(make_error("mean"))? as f64, + cursor.read_u32::().map_err(make_error("weight"))? as u64, + ) + } else { + ( + cursor.read_f64::().map_err(make_error("mean"))?, + cursor.read_u64::().map_err(make_error("weight"))?, + ) + }; + centroids_weight += weight; + centroids.push(Centroid { mean, weight }); + } + let mut buffer = Vec::with_capacity(num_buffered); + for _ in 0..num_buffered { + buffer.push(if is_float { + cursor + .read_f32::() + .map_err(make_error("buffered_value"))? as f64 + } else { + cursor + .read_f64::() + .map_err(make_error("buffered_value"))? + }) + } + Ok(TDigest::make( + k, + reverse_merge, + min, + max, + centroids, + centroids_weight, + buffer, + )) + } + + fn is_single_value(&self) -> bool { + self.total_weight() == 1 + } + /// Process buffered values and merge centroids if needed. - pub(super) fn compress(&mut self) { + fn compress(&mut self) { if self.buffer.is_empty() { return; } @@ -371,7 +568,7 @@ impl TDigest { /// * `buffer` is generated from `self.buffer`, and thus: /// * No `NAN` values are present in `buffer`. 
/// * We should clear `self.buffer` after merging. - pub(super) fn do_merge(&mut self, mut buffer: Vec, weight: u64) { + fn do_merge(&mut self, mut buffer: Vec, weight: u64) { buffer.extend(std::mem::take(&mut self.centroids)); buffer.sort_by(centroid_cmp); if self.reverse_merge { From 9497d24065c32ee47ed8d48d640e8662462b6569 Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 18:28:16 +0800 Subject: [PATCH 10/26] centralize compare logics Signed-off-by: tison --- src/tdigest/mod.rs | 19 --------------- src/tdigest/sketch.rs | 55 ++++++++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/src/tdigest/mod.rs b/src/tdigest/mod.rs index f774bf3..002e479 100644 --- a/src/tdigest/mod.rs +++ b/src/tdigest/mod.rs @@ -52,22 +52,3 @@ mod serialization; mod sketch; pub use self::sketch::TDigest; - -#[derive(Debug, Clone, Copy, PartialEq)] -struct Centroid { - mean: f64, - weight: u64, -} - -impl Centroid { - fn add(&mut self, other: Centroid) { - if self.weight != 0 { - let total_weight = self.weight + other.weight; - self.mean += (other.weight as f64) * (other.mean - self.mean) / (total_weight as f64); - self.weight = total_weight; - } else { - self.mean = other.mean; - self.weight = other.weight; - } - } -} diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 6f84eea..4566531 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -16,11 +16,11 @@ // under the License. 
use byteorder::{ByteOrder, LE, ReadBytesExt}; +use std::cmp::Ordering; use std::convert::identity; use std::io::Cursor; use crate::error::SerdeError; -use crate::tdigest::Centroid; use crate::tdigest::serialization::*; const BUFFER_MULTIPLIER: usize = 4; @@ -228,24 +228,12 @@ impl TDigest { let mut lower = self .centroids - .binary_search_by(|c| { - if c.mean < value { - std::cmp::Ordering::Less - } else { - std::cmp::Ordering::Greater - } - }) + .binary_search_by(|c| centroid_lower_bound(c, value)) .unwrap_or_else(identity); debug_assert_ne!(lower, num_centroids, "get_rank: lower == end"); let mut upper = self .centroids - .binary_search_by(|c| { - if c.mean > value { - std::cmp::Ordering::Greater - } else { - std::cmp::Ordering::Less - } - }) + .binary_search_by(|c| centroid_upper_bound(c, value)) .unwrap_or_else(identity); debug_assert_ne!(upper, 0, "get_rank: upper == begin"); if value < self.centroids[lower].mean { @@ -618,13 +606,48 @@ impl TDigest { } } -fn centroid_cmp(a: &Centroid, b: &Centroid) -> std::cmp::Ordering { +fn centroid_cmp(a: &Centroid, b: &Centroid) -> Ordering { match a.mean.partial_cmp(&b.mean) { Some(order) => order, None => unreachable!("NaN values should never be present in centroids"), } } +fn centroid_lower_bound(c: &Centroid, value: f64) -> Ordering { + if c.mean < value { + Ordering::Less + } else { + Ordering::Greater + } +} + +fn centroid_upper_bound(c: &Centroid, value: f64) -> Ordering { + if c.mean > value { + Ordering::Greater + } else { + Ordering::Less + } +} + +#[derive(Debug, Clone, Copy, PartialEq)] +struct Centroid { + mean: f64, + weight: u64, +} + +impl Centroid { + fn add(&mut self, other: Centroid) { + if self.weight != 0 { + let total_weight = self.weight + other.weight; + self.mean += (other.weight as f64) * (other.mean - self.mean) / (total_weight as f64); + self.weight = total_weight; + } else { + self.mean = other.mean; + self.weight = other.weight; + } + } +} + /// Generates cluster sizes proportional to 
`q*(1-q)`. /// /// The use of a normalizing function results in a strictly bounded number of clusters no matter From 53b74eef5f01604f795c81dbd092a81399b1997c Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 23:14:59 +0800 Subject: [PATCH 11/26] finish serde Signed-off-by: tison --- src/tdigest/sketch.rs | 74 ++++++++++++++++------- tests/tdigest_serialization_test.rs | 69 ++++++++++++++++++++++ tests/tdigest_test.rs | 92 ++++++++++++++--------------- 3 files changed, 165 insertions(+), 70 deletions(-) create mode 100644 tests/tdigest_serialization_test.rs diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 4566531..05514e5 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use byteorder::{ByteOrder, LE, ReadBytesExt}; +use byteorder::{LE, ReadBytesExt}; use std::cmp::Ordering; use std::convert::identity; use std::io::Cursor; @@ -28,7 +28,7 @@ const BUFFER_MULTIPLIER: usize = 4; /// T-Digest sketch for estimating quantiles and ranks. /// /// See the [module documentation](self) for more details. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct TDigest { k: u16, @@ -147,7 +147,7 @@ impl TDigest { self.centroids_weight + (self.buffer.len() as u64) } - /// Merge the given t-Digest into this one + /// Merge the given TDigest into this one pub fn merge(&mut self, other: &TDigest) { if other.is_empty() { return; @@ -175,7 +175,7 @@ impl TDigest { /// # Panics /// /// If the value is `NaN`. - pub fn get_rank(&mut self, value: f64) -> Option { + pub fn rank(&mut self, value: f64) -> Option { assert!(!value.is_nan(), "value must not be NaN"); if self.is_empty() { @@ -277,7 +277,7 @@ impl TDigest { /// # Panics /// /// If rank is not in [0.0, 1.0]. 
- pub fn get_quantile(&mut self, rank: f64) -> Option { + pub fn quantile(&mut self, rank: f64) -> Option { assert!((0.0..=1.0).contains(&rank), "rank must be in [0.0, 1.0]"); if self.is_empty() { @@ -364,7 +364,35 @@ impl TDigest { pub fn serialize(&mut self) -> Vec { self.compress(); - let mut bytes = vec![]; + let mut total_size = 0; + if self.is_empty() || self.is_single_value() { + // 1 byte preamble + // + 1 byte serial version + // + 1 byte family + // + 2 bytes k + // + 1 byte flags + // + 2 bytes unused + total_size += size_of::(); + } else { + // all of the above + // + 4 bytes num centroids + // + 4 bytes num buffered + total_size += size_of::() * 2; + } + if self.is_empty() { + // nothing more + } else if self.is_single_value() { + // + 8 bytes single value + total_size += size_of::(); + } else { + // + 8 bytes min + // + 8 bytes max + total_size += size_of::() * 2; + // + (8+8) bytes per centroid + total_size += self.centroids.len() * (size_of::() + size_of::()); + } + + let mut bytes = Vec::with_capacity(total_size); bytes.push(match self.total_weight() { 0 => PREAMBLE_LONGS_EMPTY_OR_SINGLE, 1 => PREAMBLE_LONGS_EMPTY_OR_SINGLE, @@ -372,7 +400,7 @@ impl TDigest { }); bytes.push(SERIAL_VERSION); bytes.push(TDIGEST_FAMILY_ID); - LE::write_u16(&mut bytes, self.k); + bytes.extend_from_slice(&self.k.to_le_bytes()); bytes.push({ let mut flags = 0; if self.is_empty() { @@ -386,21 +414,21 @@ impl TDigest { } flags }); - LE::write_u16(&mut bytes, 0); // unused + bytes.extend_from_slice(&0u16.to_le_bytes()); // unused if self.is_empty() { return bytes; } if self.is_single_value() { - LE::write_f64(&mut bytes, self.min); + bytes.extend_from_slice(&self.min.to_le_bytes()); return bytes; } - LE::write_u32(&mut bytes, self.centroids.len() as u32); - LE::write_u32(&mut bytes, 0); // unused - LE::write_f64(&mut bytes, self.min); - LE::write_f64(&mut bytes, self.max); + bytes.extend_from_slice(&(self.centroids.len() as u32).to_le_bytes()); + 
bytes.extend_from_slice(&0u32.to_le_bytes()); // unused + bytes.extend_from_slice(&self.min.to_le_bytes()); + bytes.extend_from_slice(&self.max.to_le_bytes()); for centroid in &self.centroids { - LE::write_f64(&mut bytes, centroid.mean); - LE::write_u64(&mut bytes, centroid.weight); + bytes.extend_from_slice(¢roid.mean.to_le_bytes()); + bytes.extend_from_slice(¢roid.weight.to_le_bytes()); } bytes } @@ -411,8 +439,12 @@ impl TDigest { /// represent (mean, weight). [^1] /// /// [^1]: This is to support reading the `tdigest` format from the C++ implementation. - pub fn deserialize(bytes: &[u8], is_float: bool) -> Result { - let make_error = |tag: &'static str| move |_| SerdeError::InsufficientData(tag.to_string()); + pub fn deserialize(bytes: &[u8], is_f32: bool) -> Result { + fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> SerdeError { + let tag = tag.to_string(); + move |_| SerdeError::InsufficientData(tag) + } + let mut cursor = Cursor::new(bytes); let preamble_longs = cursor.read_u8().map_err(make_error("preamble_longs"))?; @@ -453,7 +485,7 @@ impl TDigest { let reverse_merge = (flags & FLAGS_REVERSE_MERGE) != 0; if is_single_value { - let value = if is_float { + let value = if is_f32 { cursor .read_f32::() .map_err(make_error("single_value"))? as f64 @@ -481,7 +513,7 @@ impl TDigest { let num_buffered = cursor .read_u32::() .map_err(make_error("num_buffered"))? as usize; - let (min, max) = if is_float { + let (min, max) = if is_f32 { ( cursor.read_f32::().map_err(make_error("min"))? as f64, cursor.read_f32::().map_err(make_error("max"))? as f64, @@ -495,7 +527,7 @@ impl TDigest { let mut centroids = Vec::with_capacity(num_centroids); let mut centroids_weight = 0; for _ in 0..num_centroids { - let (mean, weight) = if is_float { + let (mean, weight) = if is_f32 { ( cursor.read_f32::().map_err(make_error("mean"))? as f64, cursor.read_u32::().map_err(make_error("weight"))? 
as u64, @@ -511,7 +543,7 @@ impl TDigest { } let mut buffer = Vec::with_capacity(num_buffered); for _ in 0..num_buffered { - buffer.push(if is_float { + buffer.push(if is_f32 { cursor .read_f32::() .map_err(make_error("buffered_value"))? as f64 diff --git a/tests/tdigest_serialization_test.rs b/tests/tdigest_serialization_test.rs new file mode 100644 index 0000000..5b24d1e --- /dev/null +++ b/tests/tdigest_serialization_test.rs @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datasketches::tdigest::TDigest; + +#[test] +fn test_empty() { + let mut td = TDigest::new(100); + assert!(td.is_empty()); + + let bytes = td.serialize(); + assert_eq!(bytes.len(), 8); + + let deserialized_td = TDigest::deserialize(&bytes, false).unwrap(); + assert_eq!(td.k(), deserialized_td.k()); + assert_eq!(td.total_weight(), deserialized_td.total_weight()); + assert!(td.is_empty()); + assert!(deserialized_td.is_empty()); +} + +#[test] +fn test_single_value() { + let mut td = TDigest::default(); + td.update(123.0); + + let bytes = td.serialize(); + assert_eq!(bytes.len(), 16); + + let deserialized_td = TDigest::deserialize(&bytes, false).unwrap(); + assert_eq!(deserialized_td.k(), 200); + assert_eq!(deserialized_td.total_weight(), 1); + assert!(!deserialized_td.is_empty()); + assert_eq!(deserialized_td.min_value(), Some(123.0)); + assert_eq!(deserialized_td.max_value(), Some(123.0)); +} + +#[test] +fn test_many_values() { + let mut td = TDigest::new(100); + for i in 0..1000 { + td.update(i as f64); + } + + let bytes = td.serialize(); + assert_eq!(bytes.len(), 1584); + + let mut deserialized_td = TDigest::deserialize(&bytes, false).unwrap(); + assert_eq!(td.k(), deserialized_td.k()); + assert_eq!(td.total_weight(), deserialized_td.total_weight()); + assert_eq!(td.is_empty(), deserialized_td.is_empty()); + assert_eq!(td.min_value(), deserialized_td.min_value()); + assert_eq!(td.max_value(), deserialized_td.max_value()); + assert_eq!(td.rank(500.0), deserialized_td.rank(500.0)); + assert_eq!(td.quantile(0.5), deserialized_td.quantile(0.5)); +} diff --git a/tests/tdigest_test.rs b/tests/tdigest_test.rs index 0847094..4b43296 100644 --- a/tests/tdigest_test.rs +++ b/tests/tdigest_test.rs @@ -27,8 +27,8 @@ fn test_empty() { assert_eq!(tdigest.total_weight(), 0); assert_eq!(tdigest.min_value(), None); assert_eq!(tdigest.max_value(), None); - assert_eq!(tdigest.get_rank(0.0), None); - assert_eq!(tdigest.get_quantile(0.5), None); + assert_eq!(tdigest.rank(0.0), 
None); + assert_eq!(tdigest.quantile(0.5), None); // TODO: Support PMF and CDF // const double split_points[1] {0}; @@ -44,12 +44,12 @@ fn test_one_value() { assert_eq!(tdigest.total_weight(), 1); assert_eq!(tdigest.min_value(), Some(1.0)); assert_eq!(tdigest.max_value(), Some(1.0)); - assert_eq!(tdigest.get_rank(0.99), Some(0.0)); - assert_eq!(tdigest.get_rank(1.0), Some(0.5)); - assert_eq!(tdigest.get_rank(1.01), Some(1.0)); - assert_eq!(tdigest.get_quantile(0.0), Some(1.0)); - assert_eq!(tdigest.get_quantile(0.5), Some(1.0)); - assert_eq!(tdigest.get_quantile(1.0), Some(1.0)); + assert_eq!(tdigest.rank(0.99), Some(0.0)); + assert_eq!(tdigest.rank(1.0), Some(0.5)); + assert_eq!(tdigest.rank(1.01), Some(1.0)); + assert_eq!(tdigest.quantile(0.0), Some(1.0)); + assert_eq!(tdigest.quantile(0.5), Some(1.0)); + assert_eq!(tdigest.quantile(1.0), Some(1.0)); } #[test] @@ -66,31 +66,28 @@ fn test_many_values() { assert_eq!(tdigest.min_value(), Some(0.0)); assert_eq!(tdigest.max_value(), Some((n - 1) as f64)); - assert_that!(tdigest.get_rank(0.0).unwrap(), near(0.0, 0.0001)); + assert_that!(tdigest.rank(0.0).unwrap(), near(0.0, 0.0001)); + assert_that!(tdigest.rank((n / 4) as f64).unwrap(), near(0.25, 0.0001)); + assert_that!(tdigest.rank((n / 2) as f64).unwrap(), near(0.5, 0.0001)); assert_that!( - tdigest.get_rank((n / 4) as f64).unwrap(), - near(0.25, 0.0001) - ); - assert_that!(tdigest.get_rank((n / 2) as f64).unwrap(), near(0.5, 0.0001)); - assert_that!( - tdigest.get_rank((n * 3 / 4) as f64).unwrap(), + tdigest.rank((n * 3 / 4) as f64).unwrap(), near(0.75, 0.0001) ); - assert_that!(tdigest.get_rank(n as f64).unwrap(), eq(1.0)); - assert_that!(tdigest.get_quantile(0.0).unwrap(), eq(0.0)); + assert_that!(tdigest.rank(n as f64).unwrap(), eq(1.0)); + assert_that!(tdigest.quantile(0.0).unwrap(), eq(0.0)); assert_that!( - tdigest.get_quantile(0.5).unwrap(), + tdigest.quantile(0.5).unwrap(), near((n / 2) as f64, 0.03 * (n / 2) as f64) ); assert_that!( - 
tdigest.get_quantile(0.9).unwrap(), + tdigest.quantile(0.9).unwrap(), near((n as f64) * 0.9, 0.01 * (n as f64) * 0.9) ); assert_that!( - tdigest.get_quantile(0.95).unwrap(), + tdigest.quantile(0.95).unwrap(), near((n as f64) * 0.95, 0.01 * (n as f64) * 0.95) ); - assert_that!(tdigest.get_quantile(1.0).unwrap(), eq((n - 1) as f64)); + assert_that!(tdigest.quantile(1.0).unwrap(), eq((n - 1) as f64)); // TODO: Later until PMF and CDF are supported // const double split_points[1] {n / 2}; @@ -109,13 +106,13 @@ fn test_rank_two_values() { let mut tdigest = TDigest::new(100); tdigest.update(1.0); tdigest.update(2.0); - assert_eq!(tdigest.get_rank(0.99), Some(0.0)); - assert_eq!(tdigest.get_rank(1.0), Some(0.25)); - assert_eq!(tdigest.get_rank(1.25), Some(0.375)); - assert_eq!(tdigest.get_rank(1.5), Some(0.5)); - assert_eq!(tdigest.get_rank(1.75), Some(0.625)); - assert_eq!(tdigest.get_rank(2.0), Some(0.75)); - assert_eq!(tdigest.get_rank(2.01), Some(1.0)); + assert_eq!(tdigest.rank(0.99), Some(0.0)); + assert_eq!(tdigest.rank(1.0), Some(0.25)); + assert_eq!(tdigest.rank(1.25), Some(0.375)); + assert_eq!(tdigest.rank(1.5), Some(0.5)); + assert_eq!(tdigest.rank(1.75), Some(0.625)); + assert_eq!(tdigest.rank(2.0), Some(0.75)); + assert_eq!(tdigest.rank(2.01), Some(1.0)); } #[test] @@ -125,9 +122,9 @@ fn test_rank_repeated_values() { tdigest.update(1.0); tdigest.update(1.0); tdigest.update(1.0); - assert_eq!(tdigest.get_rank(0.99), Some(0.0)); - assert_eq!(tdigest.get_rank(1.0), Some(0.5)); - assert_eq!(tdigest.get_rank(1.01), Some(1.0)); + assert_eq!(tdigest.rank(0.99), Some(0.0)); + assert_eq!(tdigest.rank(1.0), Some(0.5)); + assert_eq!(tdigest.rank(1.01), Some(1.0)); } #[test] @@ -137,11 +134,11 @@ fn test_repeated_blocks() { tdigest.update(2.0); tdigest.update(2.0); tdigest.update(3.0); - assert_eq!(tdigest.get_rank(0.99), Some(0.0)); - assert_eq!(tdigest.get_rank(1.0), Some(0.125)); - assert_eq!(tdigest.get_rank(2.0), Some(0.5)); - assert_eq!(tdigest.get_rank(3.0), 
Some(0.875)); - assert_eq!(tdigest.get_rank(3.01), Some(1.0)); + assert_eq!(tdigest.rank(0.99), Some(0.0)); + assert_eq!(tdigest.rank(1.0), Some(0.125)); + assert_eq!(tdigest.rank(2.0), Some(0.5)); + assert_eq!(tdigest.rank(3.0), Some(0.875)); + assert_eq!(tdigest.rank(3.01), Some(1.0)); } #[test] @@ -156,11 +153,11 @@ fn test_merge_small() { assert_eq!(td1.min_value(), Some(1.0)); assert_eq!(td1.max_value(), Some(3.0)); assert_eq!(td1.total_weight(), 4); - assert_eq!(td1.get_rank(0.99), Some(0.0)); - assert_eq!(td1.get_rank(1.0), Some(0.125)); - assert_eq!(td1.get_rank(2.0), Some(0.5)); - assert_eq!(td1.get_rank(3.0), Some(0.875)); - assert_eq!(td1.get_rank(3.01), Some(1.0)); + assert_eq!(td1.rank(0.99), Some(0.0)); + assert_eq!(td1.rank(1.0), Some(0.125)); + assert_eq!(td1.rank(2.0), Some(0.5)); + assert_eq!(td1.rank(3.0), Some(0.875)); + assert_eq!(td1.rank(3.01), Some(1.0)); } #[test] @@ -180,12 +177,9 @@ fn test_merge_large() { assert_eq!(td1.min_value(), Some(0.0)); assert_eq!(td1.max_value(), Some((n - 1) as f64)); - assert_that!(td1.get_rank(0.0).unwrap(), near(0.0, 0.0001)); - assert_that!(td1.get_rank((n / 4) as f64).unwrap(), near(0.25, 0.0001)); - assert_that!(td1.get_rank((n / 2) as f64).unwrap(), near(0.5, 0.0001)); - assert_that!( - td1.get_rank((n * 3 / 4) as f64).unwrap(), - near(0.75, 0.0001) - ); - assert_that!(td1.get_rank(n as f64).unwrap(), eq(1.0)); + assert_that!(td1.rank(0.0).unwrap(), near(0.0, 0.0001)); + assert_that!(td1.rank((n / 4) as f64).unwrap(), near(0.25, 0.0001)); + assert_that!(td1.rank((n / 2) as f64).unwrap(), near(0.5, 0.0001)); + assert_that!(td1.rank((n * 3 / 4) as f64).unwrap(), near(0.75, 0.0001)); + assert_that!(td1.rank(n as f64).unwrap(), eq(1.0)); } From 7175d986a6a93bf27072a1d579a08a4f0ff2cbea Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 16 Dec 2025 23:43:50 +0800 Subject: [PATCH 12/26] enable freeze TDigestMut Signed-off-by: tison --- src/tdigest/mod.rs | 1 + src/tdigest/sketch.rs | 479 
++++++++++++++++++---------- tests/tdigest_serialization_test.rs | 19 +- tests/tdigest_test.rs | 22 +- 4 files changed, 343 insertions(+), 178 deletions(-) diff --git a/src/tdigest/mod.rs b/src/tdigest/mod.rs index 002e479..ad9ca42 100644 --- a/src/tdigest/mod.rs +++ b/src/tdigest/mod.rs @@ -52,3 +52,4 @@ mod serialization; mod sketch; pub use self::sketch::TDigest; +pub use self::sketch::TDigestMut; diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 05514e5..7855729 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -23,13 +23,16 @@ use std::io::Cursor; use crate::error::SerdeError; use crate::tdigest::serialization::*; +/// The default value of K if one is not specified. +const DEFAULT_K: u16 = 200; +/// Multiplier for buffer size relative to centroids capacity. const BUFFER_MULTIPLIER: usize = 4; /// T-Digest sketch for estimating quantiles and ranks. /// /// See the [module documentation](self) for more details. #[derive(Debug, Clone)] -pub struct TDigest { +pub struct TDigestMut { k: u16, reverse_merge: bool, @@ -42,16 +45,13 @@ pub struct TDigest { buffer: Vec, } -impl Default for TDigest { +impl Default for TDigestMut { fn default() -> Self { - TDigest::new(Self::DEFAULT_K) + TDigestMut::new(DEFAULT_K) } } -impl TDigest { - /// The default value of K if one is not specified. - pub const DEFAULT_K: u16 = 200; - +impl TDigestMut { /// Creates a tdigest instance with the given value of k. /// /// # Panics @@ -87,7 +87,7 @@ impl TDigest { centroids.reserve(centroids_capacity); buffer.reserve(centroids_capacity * BUFFER_MULTIPLIER); - TDigest { + TDigestMut { k, reverse_merge, min, @@ -148,7 +148,7 @@ impl TDigest { } /// Merge the given TDigest into this one - pub fn merge(&mut self, other: &TDigest) { + pub fn merge(&mut self, other: &TDigestMut) { if other.is_empty() { return; } @@ -168,6 +168,29 @@ impl TDigest { self.do_merge(tmp, self.buffer.len() as u64 + other.total_weight()) } + /// Freezes this TDigest into an immutable one. 
+ pub fn freeze(mut self) -> TDigest { + self.compress(); + TDigest { + k: self.k, + reverse_merge: self.reverse_merge, + min: self.min, + max: self.max, + centroids: self.centroids, + centroids_weight: self.centroids_weight, + centroids_capacity: self.centroids_capacity, + } + } + + fn view(&self) -> TDigestView<'_> { + TDigestView { + min: self.min, + max: self.max, + centroids: &self.centroids, + centroids_weight: self.centroids_weight, + } + } + /// Compute approximate normalized rank (from 0 to 1 inclusive) of the given value. /// /// Returns `None` if TDigest is empty. @@ -193,81 +216,7 @@ impl TDigest { } self.compress(); // side effect - let centroids_weight = self.centroids_weight as f64; - let num_centroids = self.centroids.len(); - - // left tail - let first_mean = self.centroids[0].mean; - if value < first_mean { - if first_mean - self.min > 0. { - return Some(if value == self.min { - 0.5 / centroids_weight - } else { - 1. + (((value - self.min) / (first_mean - self.min)) - * ((self.centroids[0].weight as f64 / 2.) - 1.)) - }); - } - return Some(0.); // should never happen - } - - // right tail - let last_mean = self.centroids[num_centroids - 1].mean; - if value > last_mean { - if self.max - last_mean > 0. { - return Some(if value == self.max { - 1. - (0.5 / centroids_weight) - } else { - 1.0 - ((1.0 - + (((self.max - value) / (self.max - last_mean)) - * ((self.centroids[num_centroids - 1].weight as f64 / 2.) 
- 1.))) - / centroids_weight) - }); - } - return Some(1.); // should never happen - } - - let mut lower = self - .centroids - .binary_search_by(|c| centroid_lower_bound(c, value)) - .unwrap_or_else(identity); - debug_assert_ne!(lower, num_centroids, "get_rank: lower == end"); - let mut upper = self - .centroids - .binary_search_by(|c| centroid_upper_bound(c, value)) - .unwrap_or_else(identity); - debug_assert_ne!(upper, 0, "get_rank: upper == begin"); - if value < self.centroids[lower].mean { - lower -= 1; - } - if (upper == num_centroids) || (self.centroids[upper - 1].mean >= value) { - upper -= 1; - } - - let mut weight_below = 0.; - let mut i = 0; - while i < lower { - weight_below += self.centroids[i].weight as f64; - i += 1; - } - weight_below += self.centroids[lower].weight as f64 / 2.; - - let mut weight_delta = 0.; - while i < upper { - weight_delta += self.centroids[i].weight as f64; - i += 1; - } - weight_delta -= self.centroids[lower].weight as f64 / 2.; - weight_delta += self.centroids[upper].weight as f64 / 2.; - Some( - if self.centroids[upper].mean - self.centroids[lower].mean > 0. { - (weight_below - + (weight_delta * (value - self.centroids[lower].mean) - / (self.centroids[upper].mean - self.centroids[lower].mean))) - / centroids_weight - } else { - (weight_below + weight_delta / 2.) / centroids_weight - }, - ) + self.view().rank(value) } /// Compute approximate quantile value corresponding to the given normalized rank. @@ -285,79 +234,7 @@ impl TDigest { } self.compress(); // side effect - if self.centroids.len() == 1 { - return Some(self.centroids[0].mean); - } - - // at least 2 centroids - let centroids_weight = self.centroids_weight as f64; - let num_centroids = self.centroids.len(); - let weight = rank * centroids_weight; - if weight < 1. { - return Some(self.min); - } - if weight > centroids_weight - 1. { - return Some(self.max); - } - let first_weight = self.centroids[0].weight as f64; - if first_weight > 1. && weight < first_weight / 2. 
{ - return Some( - self.min - + (((weight - 1.) / ((first_weight / 2.) - 1.)) - * (self.centroids[0].mean - self.min)), - ); - } - let last_weight = self.centroids[num_centroids - 1].weight as f64; - if last_weight > 1. && (centroids_weight - weight <= last_weight / 2.) { - return Some( - self.max - + (((centroids_weight - weight - 1.) / ((last_weight / 2.) - 1.)) - * (self.max - self.centroids[num_centroids - 1].mean)), - ); - } - - // interpolate between extremes - let mut weight_so_far = first_weight / 2.; - for i in 0..(num_centroids - 1) { - let dw = (self.centroids[i].weight + self.centroids[i + 1].weight) as f64 / 2.; - if weight_so_far + dw > weight { - // the target weight is between centroids i and i+1 - let mut left_weight = 0.; - if self.centroids[i].weight == 1 { - if weight - weight_so_far < 0.5 { - return Some(self.centroids[i].mean); - } - left_weight = 0.5; - } - let mut right_weight = 0.; - if self.centroids[i + 1].weight == 1 { - if weight_so_far + dw - weight < 0.5 { - return Some(self.centroids[i + 1].mean); - } - right_weight = 0.5; - } - let w1 = weight - weight_so_far - left_weight; - let w2 = weight_so_far + dw - weight - right_weight; - return Some(weighted_average( - self.centroids[i].mean, - w1, - self.centroids[i + 1].mean, - w2, - )); - } - weight_so_far += dw; - } - - let w1 = weight - - (self.centroids_weight as f64) - - ((self.centroids[num_centroids - 1].weight as f64) / 2.); - let w2 = (self.centroids[num_centroids - 1].weight as f64 / 2.) - w1; - Some(weighted_average( - self.centroids[num_centroids - 1].mean, - w1, - self.max, - w2, - )) + self.view().quantile(rank) } /// Serializes this TDigest to bytes. @@ -480,7 +357,7 @@ impl TDigest { } cursor.read_u16::().map_err(make_error(""))?; // unused if is_empty { - return Ok(TDigest::new(k)); + return Ok(TDigestMut::new(k)); } let reverse_merge = (flags & FLAGS_REVERSE_MERGE) != 0; @@ -494,7 +371,7 @@ impl TDigest { .read_f64::() .map_err(make_error("single_value"))? 
}; - return Ok(TDigest::make( + return Ok(TDigestMut::make( k, reverse_merge, value, @@ -553,7 +430,7 @@ impl TDigest { .map_err(make_error("buffered_value"))? }) } - Ok(TDigest::make( + Ok(TDigestMut::make( k, reverse_merge, min, @@ -638,6 +515,288 @@ impl TDigest { } } +/// Immutable (frozen) T-Digest sketch for estimating quantiles and ranks. +/// +/// See the [module documentation](self) for more details. +pub struct TDigest { + k: u16, + + reverse_merge: bool, + min: f64, + max: f64, + + centroids: Vec, + centroids_weight: u64, + centroids_capacity: usize, +} + +impl TDigest { + /// Returns parameter k (compression) that was used to configure this TDigest. + pub fn k(&self) -> u16 { + self.k + } + + /// Returns true if TDigest has not seen any data. + pub fn is_empty(&self) -> bool { + self.centroids.is_empty() + } + + /// Returns minimum value seen by TDigest; `None` if TDigest is empty. + pub fn min_value(&self) -> Option { + if self.is_empty() { + None + } else { + Some(self.min) + } + } + + /// Returns maximum value seen by TDigest; `None` if TDigest is empty. + pub fn max_value(&self) -> Option { + if self.is_empty() { + None + } else { + Some(self.max) + } + } + + /// Returns total weight. + pub fn total_weight(&self) -> u64 { + self.centroids_weight + } + + fn view(&self) -> TDigestView<'_> { + TDigestView { + min: self.min, + max: self.max, + centroids: &self.centroids, + centroids_weight: self.centroids_weight, + } + } + + /// Compute approximate normalized rank (from 0 to 1 inclusive) of the given value. + /// + /// Returns `None` if TDigest is empty. + /// + /// # Panics + /// + /// If the value is `NaN`. + pub fn rank(&self, value: f64) -> Option { + assert!(!value.is_nan(), "value must not be NaN"); + self.view().rank(value) + } + + /// Compute approximate quantile value corresponding to the given normalized rank. + /// + /// Returns `None` if TDigest is empty. + /// + /// # Panics + /// + /// If rank is not in [0.0, 1.0]. 
+ pub fn quantile(&self, rank: f64) -> Option { + assert!((0.0..=1.0).contains(&rank), "rank must be in [0.0, 1.0]"); + self.view().quantile(rank) + } + + /// Converts this immutable TDigest into a mutable one. + pub fn into_mut(mut self) -> TDigestMut { + self.centroids.reserve(self.centroids_capacity); + TDigestMut::make( + self.k, + self.reverse_merge, + self.min, + self.max, + self.centroids, + self.centroids_weight, + Vec::with_capacity(self.centroids_capacity * BUFFER_MULTIPLIER), + ) + } +} + +struct TDigestView<'a> { + min: f64, + max: f64, + centroids: &'a [Centroid], + centroids_weight: u64, +} + +impl TDigestView<'_> { + fn rank(&self, value: f64) -> Option { + debug_assert!(!value.is_nan(), "value must not be NaN"); + + if self.centroids.is_empty() { + return None; + } + if value < self.min { + return Some(0.0); + } + if value > self.max { + return Some(1.0); + } + // one centroid and value == min == max + if self.centroids.len() == 1 { + return Some(0.5); + } + + let centroids_weight = self.centroids_weight as f64; + let num_centroids = self.centroids.len(); + + // left tail + let first_mean = self.centroids[0].mean; + if value < first_mean { + if first_mean - self.min > 0. { + return Some(if value == self.min { + 0.5 / centroids_weight + } else { + 1. + (((value - self.min) / (first_mean - self.min)) + * ((self.centroids[0].weight as f64 / 2.) - 1.)) + }); + } + return Some(0.); // should never happen + } + + // right tail + let last_mean = self.centroids[num_centroids - 1].mean; + if value > last_mean { + if self.max - last_mean > 0. { + return Some(if value == self.max { + 1. - (0.5 / centroids_weight) + } else { + 1.0 - ((1.0 + + (((self.max - value) / (self.max - last_mean)) + * ((self.centroids[num_centroids - 1].weight as f64 / 2.) 
- 1.))) + / centroids_weight) + }); + } + return Some(1.); // should never happen + } + + let mut lower = self + .centroids + .binary_search_by(|c| centroid_lower_bound(c, value)) + .unwrap_or_else(identity); + debug_assert_ne!(lower, num_centroids, "get_rank: lower == end"); + let mut upper = self + .centroids + .binary_search_by(|c| centroid_upper_bound(c, value)) + .unwrap_or_else(identity); + debug_assert_ne!(upper, 0, "get_rank: upper == begin"); + if value < self.centroids[lower].mean { + lower -= 1; + } + if (upper == num_centroids) || (self.centroids[upper - 1].mean >= value) { + upper -= 1; + } + + let mut weight_below = 0.; + let mut i = 0; + while i < lower { + weight_below += self.centroids[i].weight as f64; + i += 1; + } + weight_below += self.centroids[lower].weight as f64 / 2.; + + let mut weight_delta = 0.; + while i < upper { + weight_delta += self.centroids[i].weight as f64; + i += 1; + } + weight_delta -= self.centroids[lower].weight as f64 / 2.; + weight_delta += self.centroids[upper].weight as f64 / 2.; + Some( + if self.centroids[upper].mean - self.centroids[lower].mean > 0. { + (weight_below + + (weight_delta * (value - self.centroids[lower].mean) + / (self.centroids[upper].mean - self.centroids[lower].mean))) + / centroids_weight + } else { + (weight_below + weight_delta / 2.) / centroids_weight + }, + ) + } + + fn quantile(&self, rank: f64) -> Option { + debug_assert!((0.0..=1.0).contains(&rank), "rank must be in [0.0, 1.0]"); + + if self.centroids.is_empty() { + return None; + } + + if self.centroids.len() == 1 { + return Some(self.centroids[0].mean); + } + + // at least 2 centroids + let centroids_weight = self.centroids_weight as f64; + let num_centroids = self.centroids.len(); + let weight = rank * centroids_weight; + if weight < 1. { + return Some(self.min); + } + if weight > centroids_weight - 1. { + return Some(self.max); + } + let first_weight = self.centroids[0].weight as f64; + if first_weight > 1. && weight < first_weight / 2. 
{ + return Some( + self.min + + (((weight - 1.) / ((first_weight / 2.) - 1.)) + * (self.centroids[0].mean - self.min)), + ); + } + let last_weight = self.centroids[num_centroids - 1].weight as f64; + if last_weight > 1. && (centroids_weight - weight <= last_weight / 2.) { + return Some( + self.max + + (((centroids_weight - weight - 1.) / ((last_weight / 2.) - 1.)) + * (self.max - self.centroids[num_centroids - 1].mean)), + ); + } + + // interpolate between extremes + let mut weight_so_far = first_weight / 2.; + for i in 0..(num_centroids - 1) { + let dw = (self.centroids[i].weight + self.centroids[i + 1].weight) as f64 / 2.; + if weight_so_far + dw > weight { + // the target weight is between centroids i and i+1 + let mut left_weight = 0.; + if self.centroids[i].weight == 1 { + if weight - weight_so_far < 0.5 { + return Some(self.centroids[i].mean); + } + left_weight = 0.5; + } + let mut right_weight = 0.; + if self.centroids[i + 1].weight == 1 { + if weight_so_far + dw - weight < 0.5 { + return Some(self.centroids[i + 1].mean); + } + right_weight = 0.5; + } + let w1 = weight - weight_so_far - left_weight; + let w2 = weight_so_far + dw - weight - right_weight; + return Some(weighted_average( + self.centroids[i].mean, + w1, + self.centroids[i + 1].mean, + w2, + )); + } + weight_so_far += dw; + } + + let w1 = weight + - (self.centroids_weight as f64) + - ((self.centroids[num_centroids - 1].weight as f64) / 2.); + let w2 = (self.centroids[num_centroids - 1].weight as f64 / 2.) 
- w1; + Some(weighted_average( + self.centroids[num_centroids - 1].mean, + w1, + self.max, + w2, + )) + } +} + fn centroid_cmp(a: &Centroid, b: &Centroid) -> Ordering { match a.mean.partial_cmp(&b.mean) { Some(order) => order, diff --git a/tests/tdigest_serialization_test.rs b/tests/tdigest_serialization_test.rs index 5b24d1e..c8d476c 100644 --- a/tests/tdigest_serialization_test.rs +++ b/tests/tdigest_serialization_test.rs @@ -15,17 +15,19 @@ // specific language governing permissions and limitations // under the License. -use datasketches::tdigest::TDigest; +use datasketches::tdigest::TDigestMut; #[test] fn test_empty() { - let mut td = TDigest::new(100); + let mut td = TDigestMut::new(100); assert!(td.is_empty()); let bytes = td.serialize(); assert_eq!(bytes.len(), 8); + let td = td.freeze(); - let deserialized_td = TDigest::deserialize(&bytes, false).unwrap(); + let deserialized_td = TDigestMut::deserialize(&bytes, false).unwrap(); + let deserialized_td = deserialized_td.freeze(); assert_eq!(td.k(), deserialized_td.k()); assert_eq!(td.total_weight(), deserialized_td.total_weight()); assert!(td.is_empty()); @@ -34,13 +36,14 @@ fn test_empty() { #[test] fn test_single_value() { - let mut td = TDigest::default(); + let mut td = TDigestMut::default(); td.update(123.0); let bytes = td.serialize(); assert_eq!(bytes.len(), 16); - let deserialized_td = TDigest::deserialize(&bytes, false).unwrap(); + let deserialized_td = TDigestMut::deserialize(&bytes, false).unwrap(); + let deserialized_td = deserialized_td.freeze(); assert_eq!(deserialized_td.k(), 200); assert_eq!(deserialized_td.total_weight(), 1); assert!(!deserialized_td.is_empty()); @@ -50,15 +53,17 @@ fn test_single_value() { #[test] fn test_many_values() { - let mut td = TDigest::new(100); + let mut td = TDigestMut::new(100); for i in 0..1000 { td.update(i as f64); } let bytes = td.serialize(); assert_eq!(bytes.len(), 1584); + let td = td.freeze(); - let mut deserialized_td = TDigest::deserialize(&bytes, 
false).unwrap(); + let deserialized_td = TDigestMut::deserialize(&bytes, false).unwrap(); + let deserialized_td = deserialized_td.freeze(); assert_eq!(td.k(), deserialized_td.k()); assert_eq!(td.total_weight(), deserialized_td.total_weight()); assert_eq!(td.is_empty(), deserialized_td.is_empty()); diff --git a/tests/tdigest_test.rs b/tests/tdigest_test.rs index 4b43296..7107b2a 100644 --- a/tests/tdigest_test.rs +++ b/tests/tdigest_test.rs @@ -15,13 +15,13 @@ // specific language governing permissions and limitations // under the License. -use datasketches::tdigest::TDigest; +use datasketches::tdigest::TDigestMut; use googletest::assert_that; use googletest::prelude::{eq, near}; #[test] fn test_empty() { - let mut tdigest = TDigest::new(10); + let mut tdigest = TDigestMut::new(10); assert!(tdigest.is_empty()); assert_eq!(tdigest.k(), 10); assert_eq!(tdigest.total_weight(), 0); @@ -38,7 +38,7 @@ fn test_empty() { #[test] fn test_one_value() { - let mut tdigest = TDigest::new(100); + let mut tdigest = TDigestMut::new(100); tdigest.update(1.0); assert_eq!(tdigest.k(), 100); assert_eq!(tdigest.total_weight(), 1); @@ -56,7 +56,7 @@ fn test_one_value() { fn test_many_values() { let n = 10000; - let mut tdigest = TDigest::default(); + let mut tdigest = TDigestMut::default(); for i in 0..n { tdigest.update(i as f64); } @@ -103,7 +103,7 @@ fn test_many_values() { #[test] fn test_rank_two_values() { - let mut tdigest = TDigest::new(100); + let mut tdigest = TDigestMut::new(100); tdigest.update(1.0); tdigest.update(2.0); assert_eq!(tdigest.rank(0.99), Some(0.0)); @@ -117,7 +117,7 @@ fn test_rank_two_values() { #[test] fn test_rank_repeated_values() { - let mut tdigest = TDigest::new(100); + let mut tdigest = TDigestMut::new(100); tdigest.update(1.0); tdigest.update(1.0); tdigest.update(1.0); @@ -129,7 +129,7 @@ fn test_rank_repeated_values() { #[test] fn test_repeated_blocks() { - let mut tdigest = TDigest::new(100); + let mut tdigest = TDigestMut::new(100); 
tdigest.update(1.0); tdigest.update(2.0); tdigest.update(2.0); @@ -143,10 +143,10 @@ fn test_repeated_blocks() { #[test] fn test_merge_small() { - let mut td1 = TDigest::new(10); + let mut td1 = TDigestMut::new(10); td1.update(1.0); td1.update(2.0); - let mut td2 = TDigest::new(10); + let mut td2 = TDigestMut::new(10); td2.update(2.0); td2.update(3.0); td1.merge(&td2); @@ -164,8 +164,8 @@ fn test_merge_small() { fn test_merge_large() { let n = 10000; - let mut td1 = TDigest::new(10); - let mut td2 = TDigest::new(10); + let mut td1 = TDigestMut::new(10); + let mut td2 = TDigestMut::new(10); let sup = n / 2; for i in 0..sup { td1.update(i as f64); From b37f08b6bef366321779c9251351addd65fbf365 Mon Sep 17 00:00:00 2001 From: tison Date: Wed, 17 Dec 2025 07:46:23 +0800 Subject: [PATCH 13/26] add serde compat test files Signed-off-by: tison --- .../tdigest_double_buf_n0_cpp.sk | Bin 0 -> 8 bytes .../tdigest_double_buf_n1000000_cpp.sk | Bin 0 -> 5392 bytes .../tdigest_double_buf_n100000_cpp.sk | Bin 0 -> 2480 bytes .../tdigest_double_buf_n10000_cpp.sk | Bin 0 -> 8032 bytes .../tdigest_double_buf_n1000_cpp.sk | Bin 0 -> 2736 bytes .../tdigest_double_buf_n100_cpp.sk | Bin 0 -> 832 bytes .../tdigest_double_buf_n10_cpp.sk | Bin 0 -> 112 bytes .../tdigest_double_buf_n1_cpp.sk | Bin 0 -> 16 bytes .../tdigest_double_n0_cpp.sk | Bin 0 -> 8 bytes .../tdigest_double_n1000000_cpp.sk | Bin 0 -> 2224 bytes .../tdigest_double_n100000_cpp.sk | Bin 0 -> 2192 bytes .../tdigest_double_n10000_cpp.sk | Bin 0 -> 1984 bytes .../tdigest_double_n1000_cpp.sk | Bin 0 -> 1584 bytes .../tdigest_double_n100_cpp.sk | Bin 0 -> 1248 bytes .../tdigest_double_n10_cpp.sk | Bin 0 -> 192 bytes .../tdigest_double_n1_cpp.sk | Bin 0 -> 16 bytes .../tdigest_float_buf_n0_cpp.sk | Bin 0 -> 8 bytes .../tdigest_float_buf_n1000000_cpp.sk | Bin 0 -> 2704 bytes .../tdigest_float_buf_n100000_cpp.sk | Bin 0 -> 1248 bytes .../tdigest_float_buf_n10000_cpp.sk | Bin 0 -> 4024 bytes .../tdigest_float_buf_n1000_cpp.sk | Bin 0 
-> 1376 bytes .../tdigest_float_buf_n100_cpp.sk | Bin 0 -> 424 bytes .../tdigest_float_buf_n10_cpp.sk | Bin 0 -> 64 bytes .../tdigest_float_buf_n1_cpp.sk | Bin 0 -> 12 bytes .../tdigest_float_n0_cpp.sk | Bin 0 -> 8 bytes .../tdigest_float_n1000000_cpp.sk | Bin 0 -> 1120 bytes .../tdigest_float_n100000_cpp.sk | Bin 0 -> 1104 bytes .../tdigest_float_n10000_cpp.sk | Bin 0 -> 1000 bytes .../tdigest_float_n1000_cpp.sk | Bin 0 -> 800 bytes .../tdigest_float_n100_cpp.sk | Bin 0 -> 632 bytes .../tdigest_float_n10_cpp.sk | Bin 0 -> 104 bytes .../tdigest_float_n1_cpp.sk | Bin 0 -> 12 bytes .../tdigest_double_n0_java.sk | Bin 0 -> 8 bytes .../tdigest_double_n1000000_java.sk | Bin 0 -> 2224 bytes .../tdigest_double_n100000_java.sk | Bin 0 -> 2192 bytes .../tdigest_double_n10000_java.sk | Bin 0 -> 1984 bytes .../tdigest_double_n1000_java.sk | Bin 0 -> 1584 bytes .../tdigest_double_n100_java.sk | Bin 0 -> 1248 bytes .../tdigest_double_n10_java.sk | Bin 0 -> 192 bytes .../tdigest_double_n1_java.sk | Bin 0 -> 16 bytes tests/tdigest_serialization_test.rs | 91 ++++++++++++++++++ 41 files changed, 91 insertions(+) create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n0_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n100000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n100_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1_cpp.sk create mode 100644 
tests/serialization_test_data/cpp_generated_files/tdigest_double_n0_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n100000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n10000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n100_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n10_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n1_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n0_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n1000000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n100000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n10000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n1000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n100_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n10_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n1_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n0_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n1000000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n100000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n10000_cpp.sk create mode 100644 
tests/serialization_test_data/cpp_generated_files/tdigest_float_n1000_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n100_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n10_cpp.sk create mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n1_cpp.sk create mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n0_java.sk create mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n1000000_java.sk create mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n100000_java.sk create mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n10000_java.sk create mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n1000_java.sk create mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n100_java.sk create mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n10_java.sk create mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n1_java.sk diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n0_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n0_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..e84c2ea503d5ba25b64de9f767fb3490b28860f5 GIT binary patch literal 8 PcmZQ%6iH!VWMBXQ0)_y5 literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..e488b6a303779dcae01f1d7fd5d086ad1816c928 GIT binary patch literal 5392 zcmZA5e@s1v{EW2Fj{?OLs- z%1+5sQL+arc50O>k)5v@7f}PDOVYvN*uBziITePw9 zq}nV`u=&0EL$s^rjJht?tI6k7^$7d*+O}U%{=RmrpdVF~*bwSYyBa4qz^b2B{%CuB z+o#G_zpL8G_S~_vUwt#%ZfE`9 
zYDui!u7*MNZM5BvwU5=R8KP^Ss#{{i1o!`|T;h0Rp_ks2Wv^F@z4da@{yrmh?_zu2 z)ILgIUTt^ahycB`!0s!51?rS`yAPQ`I_wv_n+qrEJ~8)i4$+so?D_unVcM(K?yfM0 z9xmp)@+Rx=#0CxcdWyby$zI=^5UvBR+TC&YRsGwqc00nR>5sDQuIrnoz4Gneb2`$k zKilz|HpF~ee3TwoV6Shz9i`t{Y4^6`>AGCJzgurj*Yf?|k`(PW-+V0EZJuDv&?>}! zy)&`yosjR|XFhlDS1!!ZjsW}kmABmci+|kv)_BuxePE=i$Bwj*kC|=iG2(hiNpzn( zm$=WbzH90fasBmXnfk1_o)_nux=Hk(1*RSp_k)x|Q?C{M&}r&fqC-D6^<(?bGd@^v z>U1%$-C*htF^|}2YOy_xmQquT?P1I+Gj*d_|K283SBRcpZtC|%Kejqq^p?%0miO0w zt6vxM1edA9MQ^q`RP-^c#pQ4GSUpzEA6YFGXwRpRYG>=aA=oKEF;nA5Mo#oNlRuBEVIhc!iSb%G= z5S>_r>#!J0uoTO%99>v}mADi+=KPF7x&?QY`_C}5D(#DY{Vnjgw1#q zkK+kEiKp;1wqPr^;W<2y7x6M)!FKGxPVB;NyoNn^9dF=G?8V!77w=&o-p77?fCKmt z2k{X;!Ke5PpQBOnV&4rzZ}dT5^h19fje$50gKz={V+e-gBy`{u495tJ#3+nL6Ju~D z#^M}|!?_rb^H5 z44lX@K|F+qu@R486E@>fJdP*uB%Z?4*n+LthUf4+ zUc}3I1>3O$JFyG9@f!Bvb-aN$u@`USUA%{Vcpv-m0S@3p9K=WX1fSwFe2xa6cMU^t z^g&Z~_Kn2!`S$bl?;W#|VtXD2zrEV{j(M;v9^_xfqZ0P+=m@#|4;# w3vm%9V+t_r>#!J0uoTO%99@?G0Xv~9-v9sr literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n100000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n100000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..9616a819437daff31c00764fd811b24c8d9066d6 GIT binary patch literal 2480 zcmZA3Z)jC@9LMpWwykCUxtk;IgXOMbI3~;mF%~QRl+yM<>f$7Iu5p@oxOiV+mXBz@zMN7nZV9uV6Xbaj?R07b@6Z29@k|59-)a9~#)TZ=jJq^cI@g z+5LEy-SjS=VHaGqvA2GJL+tH2yu)rif)RH00N!SQatv45FCWLJ?92)D@&G^o8OqqN ze{MeiMCbe2+vp?b|Gk6Nbbi$R2Ts!a3nu==_w;;W=@`DHfy|z`i-V6X zynZ5>K!FC{AO0I{v_9&HP(!abR{sxDWL0tx?~~nY_i%)sucz-mJi0#RGxw39-{;KX zN#pq7)Fh75dbe{5{p8%}6e{U@=BuYsLHirFPvdLa|HSEO{6u!gr*Vna>zX10ny=iJ zB8F-HysjetBkNodKhk>ld=XdZxHoF9PyJQIn|yqxh~4C|6&lgPb||$<;}>#ry+%MT zG-{k7r=jsNIcm}9BPUi2*+>evp+PHt`V$=4kFgAUBRqbAF{QYoZ)AlOQ@;9+*`&7Yl6q~k3 zsePssmQ$7+EW>izvST@G*|qG&{{Qmkr9)C*8c0KFBvn0g=Ov^mX@eBfwA7JirLNSI u=A}bYUm8e5X(Uy>^7_)0v_T4KTIxu%QdjCp^U@)yFAb!jG?J?L|FH)-Sy@E@ literal 0 HcmV?d00001 diff --git 
a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..0958b673f9f6a9076953d0b8fae7c2ad15db29b2 GIT binary patch literal 8032 zcmZA2d;Hb&9l-Hd_xt^RQArX>B_+E2-W7>%y68swm6V7sq(b?M$V`hio0&+4H6z4G zew|b7*6d`aaM5d(Pu=zK?U>r)rhvv#ActfJ0R) z9}XV}%MNGFt~lx{M^27DM&7kc+_=QeOWeA|?MvLL#GOmrrNmuJ+^xjjZk5BJsPGiFONn!&x$6g*UgTmsSnPH=Bd}siX?>#;+pvX_y5=t zcg3({s63gtHddu>e@`q;eee2sHT9(XVngc2_s5RZ`yYtSsfRukHPau|{EhKiI#=B(zJ9?I@o_pIvpL>Pz3|B>ORso}-ifmw+hR@n{%>!KeW@ROIX0#) z+a8+|XK!wgwTYSeN~}(twWx>#iP={XkEdR>Bj;#Vh)2@-j6&Rd9tvu82!@uoyF*pI5rfcYvRJw#pskct0+dh#OyBSevjFn zVl+$U@m4V!C(hn0MuXJv7jye0J6O!^pTY;l+2@Tcc0`P?9A`WsylQ0 zHLI3)q596;elFC=JFB@f|9?;`@2tesE}55@IwkWGQ@3PZV(OL5OHBQed8H5E{su>y zSsH3fBW-D{ElsqgskSuJmgd^hLR(sDODk<@tu1Y|rLDHK)0Xzya-6nw(3Xzc(n(v6 z*Ot!Oa)P#W(UudnrK`4_q%GaFO8Mr+F$Z5gXA7i!BmZ5gjE6SQTbwp^qwleA^BwoK8MsoFA4TQ1g? 
zOSEOWwp^+$Gqj~#TV`s@EN!_=TV`v^<=QewTjpxZ71}aSTdvfWtF&dlwp^_(*J#TE zZCR)-i?n62wp^<%*J;ZVZMj}smTJo~ZCS1@H)zX^+H#Y&tk9N~+H$kDWZH6zwye^Y zTeamjZCR}?w`y-UTU*v@%jdLZownSgE%$26dTsf+Enm@=2eoB`wmhUQ4{OUtZFxjnHfhVF+VWLxc}!a#*On)= z*OnUE zQd3)MX-jQwsiQ4*wWXf6)Yp~<+R{*48fi;oZE2z{O|_+&wlvq47TVHMTUu#LYi((x zEp4@>owl^smgBUggSK?kmQLDoytZ`KmJ_t4i?*DoEnT(cByH)YEhlTsDcaIqTTa!M z9@=u6w)E7NUfR-ITTa)OKH74Iw)EAOe%f-Tw)EGQv$W-GZ5f~~1GQz4whY#mGHnTM z8KNyiwPl#LoTDx0YRhnKIZsBb zwdEphnWQa~wPlL7Ox2cY+H$eBT%s-0wdGQ6nV~J^+A>pHW@*c1+A>>PF4vYh+A>#L zuF#fw+H$3~T%|4ZwdHDUxkg(SXv;!vS)?tCwdGoExlUV_Xv_85vQ%4^Y0GkLxj|cQ p)RvpHWren^)RvpICDWE$v}KjH+^Q|NY0GMDxm{cC(3Um3;(x|G!3zKY literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..8a1bdff0c8e8ed96a6d84f94a9ae2288e0921de0 GIT binary patch literal 2736 zcmZA3KWGzC9Ki8c|3vnQarl)xCB@f(A9?HSJ(doBGZucTNeC1bpWWk2zo_G1b?29@${BnA zTVLJtv#{o#zwVlN_V2@W_x!ik<+wdQyWw8H@`ihTyBqHH&!z7ASWMmZlcw(a>Zb1c z8>jC2^lEM|)ZF#Gq`Fdb?~hdVR?WS?eyi@P{-b(WbMNnw>apsH>P)rSbhnpN?WxYI z_EiU}3#voaqB`1??aN=p62@4@3RW?}6zkZ)CbqDR9qeHr2ROtLj&Xt+nr}PDokI`v z=wpBd3{fz`B9<`5GFGsP38q-b1~##UZR}tV`#8WMj&O_<%+P#i{^(&IeGIUGAqqxV z#1h6>#tK$3!4&J*z$Uh^jUDV^9|t(Z5sqUW8OBiDrD_F$@Q>%B6hzU2h>&0c0s@2(LI@$ehxvH#?c^-vEv=Wt5(xf*PEOxf<70#Gc>kaL z4Exil>;3EK2~KjrDNb{Svz+5R7r4kJE^~#eT;n=7xXCSUbBDX!<30~~$RiHf^%iri znG?;NYUWHc=bE|D%%x_oG;^()8_nEm=1w#Bnt9O7qh_A^?pSQhvz^7pyx3W6%&VQn e#=O~CY|Oiz#m0QtS!~RwoyEp{*;#DNsQU%>HB3MN literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..1bab9718be88774b7566e69797fb6fa0ec8cb9cf GIT binary patch literal 112 
xcmZQ#6iH!#0xlSX;e$Pdsp0^kVSENCp94w@Kxq*uEdixvptJ&%R)W$h4ghwf22KC~ literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..802a0bde3ac82cdf31c4fdb77cdf7a5f26ce821a GIT binary patch literal 16 RcmZQ%6iH!VVt|4V_5cWw0jvN3 literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n0_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n0_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..e84c2ea503d5ba25b64de9f767fb3490b28860f5 GIT binary patch literal 8 PcmZQ%6iH!VWMBXQ0)_y5 literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..6b87e1eeb12f364780e191ff2d2839f602da4862 GIT binary patch literal 2224 zcmZA3X-HI26bJA-sV(HB`vEN<$}Yq8aE*RWt`( zQA2alv+5}W{aT|u^wMV9gI>0as?m$~Q4c!SM_bXY2k9Pqc^e%^mmZ;V9FR39=`p(d zG_|1l90jq#6&>UbQ44Cju23;p)Lf-TY_PmRU%^^S7rljgin~jD!qj?qcJ@&CNM*7; zq+}Qf`q)R~Z~%&)QTZ5Ef9JqUvcf<(yss(2rt-q4@2DN>=~jTwj#qi$;795K)5Oo@ zi&ME{-vC`qSFULLK{+mE?f4*drYbu&4pEU4?E6K27?|jR-=x9yVu^+83spU3g>oJ^ zeE0|sWU0KpZxlZ$QH~6=a_(B?N8cma)2Dn!+SvX=xnsiw{sMV;M-2BqSNYK@J6n9p z&+QHlfqZ{i9AAS0wVt2M_j^@+z&(Yn{mM^1P38BmlpXd&t}Iq={gTL*a^*wUlMFt* zcNz=GcVr}UaG|Qd{UMnb7bx%Wrf>s1-zI+wSk?6fexpzPG}<2}4fT;plE;ov*QcdRj)48(nQzQ1a*TQZ3dtVWe}O{D z-LRjt)<|v#e_Jbg5cYr5M#-ljkKHVJNPQi#u1az~)cdwdegb(yjbs=HakEx3jDwh7 zC;1%IudbK89o*U=xfA@dQSt%s8%?qod|{j9Rp9LHk~6`5x@Ur&O_K3^uhtFQU3BV( z_KWwr;W!bY`tNWAtT#z_IM}5dkE;c`p`9XEH?&i%)D7(t#k%3R6lEr_*A49vl_qb| Y4eb$LldDaxHM!2@29y8(5Bb0TAANiA%m4rY literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n100000_cpp.sk 
b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n100000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..fbb27272ea654dc17b107216bbe2e9ce2ac40c54 GIT binary patch literal 2192 zcmZA3Ur19?90%~Lsaa|MNn#I{Q;I^t3PKDkzx^TiASEhUX0q0-84X4eBSMYtr>mn%vKy`pF59-?5!7Yk?m|pJqLLGB_y!-UzW$ee+52ncU(gnyZi<|usxlS z9+<8!XcSQ0dkYQ@Si@}`=K$v4!w^~b_23=VBfA$(6tKYQJ|t0Kknse$6fk^#hI9%L zT7wv(z}ahG;uzHjS_Y6zf%3n8gB-4}ev7!|=?S$5-=mWP=1B;nkopHaAK@nZe|BdfLC^#Qf>&rv`@+qQD1wf1rZujT!@4biKj+pRkg( zxDlKr>jNWbrtw_vQTS+n5+_FCq~FtVW=v{t8y~}2s@H4B(L(n2k0YJt)1MhZ8m(`* zI)Xd2{;Ag^ctO^OBIu!dkrhRd^5xwaMK9&i=ZNArS$Ly(PWAfDC>~I|uV1dWev9HD zw@*Z|gFKibU`#KErY#hBMYiM#1j(TSfevyR1RBXkiGZ7I*(|V)oLV7J!mbinMYdH7 zG9PpE+M@iJf`V!*Y298IB14xii2jYmu)jm%dNQ->!xWrHJ@U?;%>zO#X-ek#fDYW8E|jx>9zu{k+>IJ`X9BhC>K<%l zf6-`Rk0-H)U6jHecDN76*tm##cK9-q?BEp~WnUe@d-kqtxXCWQj!|}Y5JT)Ow~)sh zs^bpEIM2O@PwavFNV8)PP{QlukMN80rXhT1cRq&L&>x>*DXrJ-!^radlTWdoay9Y{ z%g8$Y96xw|dIW`>e|UjX_UD)IvGZRcMpn(Q5n>O%!9KE{c#8(IDt`w*SuaRqoS)}w z8V}jYQDOKE?s6V6xJ1^s4HE1LBi^S$1{XL_WpJLX#xm$;`#kiLO|6GcvhMN_Vc+o3 zOg5f}1~Rf9HjveC4|wgqu!#7-DJi7)~#0b0U}oPhuLpMzu=%1 zePrTR^p`$tMZcLNR`j1vSkaH>gcW_Nd#vo=UTJ76`WvSlC#BVCEB+tau{z`Aj#bLZ V9jmiW?pU33a>r`wzx@Am{{xGM!vX*R literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..7832fc376c8e67faa721dbf0ab4e95953b079262 GIT binary patch literal 1584 zcmZA2%}N4M6bJCptfp@ZD+?IPoj4oTu zBlHNfaM_{xi(!6qX6`-r&bfEoE?-J%A1A-3?$M0>T~e7kx0*3m zH`a-uRYH4(eZqd>fN)TFQ8*;LBpeoA7G4ovB{v_RzAn5Wyopw#)WhFbw$S=Ed3c<5 z(E2X*^0=~xuEwbk;tA^KuI`h{tsKw*x0a+qZsm}Mxa&tW%pE+YQEpA6G4Lskb5~L{ z!QDI~A9v+~rnt2D8MJm!3p^fwz2yMXaf5%aN&SRZPMMtHvGC9E%}g!QMISf2?K>(?-`z8w?mzhUC~@LK4Ih3hLR zd}ZPKG=yyn*Y}68Bm5;?vv7Z@3pa$D!YyH?jP0qyUSYp*P&h2SCL9%R{|El}_aCJV BY%c%+ literal 0 HcmV?d00001 diff --git 
a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n100_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n100_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..9d1d60a90feb1ee89c5f72fcaabfec50c18cd8c7 GIT binary patch literal 1248 zcmZ9}IZ8x96o%2;je(&-2L=WrA|fKrBl605YNvMW>;*2QmR3t~3B3e^26=TVDGonh zbo;%*Ps~INqx&Y!RO2-LCrz(AN2J=bcn%NnJYK+ycnL4#6}*bq@H*bWn|KRv;~l(< z_wYVGz=!wso)!WkDF@ANq1g5!f8XIyZ6(&LN^jxTzgal!FTk25Yf#{c~vmF`4X literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n10_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n10_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..2c1026eed9a55fb4a38dd5f9ac9bebf162eddfb2 GIT binary patch literal 192 zcmZQ#6iH!VVPN25Kn5S|Av_fa2>rpH5m}mn!2y-efyNg=!xXA*jieb}e$;xsnmt zp#(SWVT2we!lqC#Qkb%b!X8efX(=|OKGaJiMEHkl2*D);`<}V8@Zr0^yYsu;Id>j* zQdj?W!OsCiUG_JepsDb^lWuO(TVEfq65RrZFO~cb1!OEOxLYx z%&5>$TaDHBAQrvcm}3v1a;PynlkPj#=(Pv&#rKVs{q)^G8z=4SwSpYB}N z+Dv_Su$4VScWi83c!~E}HPYI&lgsuUXnB{o{XL(w7GL3Z_MBbacD;|Pjrnaxk!kK{$JTkyE^mHps1F+Ro_(Z_p{w<7T<%OGuD~k94qF%m&cT6zrU%W z`i}2;{_Te z4sqmqs7}xuw?g#}b^Z<24*K=oP;I1@d!e$=tA9UKYw1K+TdlC|ZmR)0KDn*z*Kek@ zRUb{JwiUN2Mk;O9Lno+xyhf(Am3_PnwU1q;r>*SmNNOL)5z6g|@v<@HV~eD$DVvtf z%I0Oo^v?COsRI`h9()AQh!7)zL5d7H#EiC@4F@hHJopHp5g|qbgA^HZh+g*Lz=ebd z9|1HX#7JO}B0~-_lYKaFA>qMC0F4MS5*Vb&kVDL39}Zkdc<>QGBSMS>1}QS+5VP5b z0~Zn=d<4*l5F>#>iVQhKANz3NLc)WO02&cuBrr&kA%~d5J{-7^@ZckWMuZp%3{qss zA?C6V2QDN$_z0j8Aw~j&6d7`ee)i$Og@gwm0W>1SNMMj6Lk=;IeK>F-;lW1$jR-Lk z7^KLML(FF%4qQlh@DV^GLW~3kDKg{`3)qJP7ZM(P1ki{OBY{DR3^~L?_Tj*Vga;o1 zG$O=EV2~n14)GxSaNt71gO30j5n?1TNRc6jSj0XYxRCJRBY;MP7zqqgWXK^NVjm7% zNOQGBSMS>1}QS+5Dxor;6lQKj{q7GVk9s~ zks*gz%03*pknrFmfJTHE2@FzX$RU=o4+kzJJopHp5g|qbgA^HZh~@0VfeQ%_J_2Y& Ph>^e`MTQ*q|IYsbH?aMl literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n100000_cpp.sk 
b/tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n100000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..efe220d12d1d73109524bbb98fa8a84b3bcd484b GIT binary patch literal 1248 zcmXZcSx6OO6bJB=nwd*(Q4(cl1`(2mWm=YVXFFyq)Ps-@(n=c%LHUw>v?ndMhzN>` zx==)h7^$>5%@l>vLO!&-)Ju!hgOCtJQ2+DY@4}xS=gc^F=06uxMvdPS@)4; z@yYgReZ11>+v$34vBzlJ?F?<)9;>}(XKMTGEbVJMTRUWr)22GP+CnE^RAo+q)^=uT zcRH1#b=s-cbEmUV`@&fy218DrHr-t)3(x|zn`}TyNBSm%g_@%NR z9Pra};DN9HEfl7zh->8d;Vj-EzMRO@a9mYP=LD{&HdXL|6kB2OGZd)?*YN%cP;KJ( zQv8J7yju3(_VZNyower}m&o-?;(S0%oZ}NX&)Rl@u}&L`zQ zM{e>dIo^Dm3*#0T1Ie8V-061Wk|mjo9;;Z$On2@)5B<s zn%PKoW`?vVVujFRh)_L5$btyPAVM}o$bksC5FrmD4`qfJ!vFvP literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n10000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n10000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..e7ceecf8db3b6dd363e1c924d5b789d7b36e2ec8 GIT binary patch literal 4024 zcmXZfLC7NK8Nl($x0{W->+UzZYqD-4tp^DnY%xR{A(oxhnL(%pX+22rAdL_Y9;8TO zi7Z2Ef+Qw*utb6hCb6v)Tck+wAnhD1NF&4+DMEypM2aCEWRXRJhyLH$5q|!8RB#^W zy?FMRD<@r)q_QMQ$1|QJ^3k6}#zs z*4Ig8yLG#5yM4Rny$jZ5EbSidak)IML?6`?ecYOZcJ=5bYtBZWuzEl0u-b@Tw)$!G ziq%o{s@2b_rlbDOY z(7q8hUe>G|oqtvPfp|akTkUJ{`u-j5=VI>sPWz$g;Y01A_4@C%%VN&_MY}25|C@Fu z8ZotBiPrw5JttcDw^k$4@xR&^qT7}Bu{D3CeJVQspZ1=p<2&t5(f(fBh%O&$e-xEZ zkNSb6e-sG*Ke(>TqJeY$nrP-+x6VoRT({28AI^2_JWbAZ>wK-w zb?f}??VicHbuMLDx6ZA{?z6IPo%bKx4avIrnWV7Yh+W05YBz2-VK-?vWjAd%V>fFz zC+kuk1r$+285J0)qJ}yeXrhHSI_RQ@J_Z02OM#Nbd&wzfD)X_i_Ews@=7d`Ybzz`#hVPb+QW|(7vC01BtgDrO0 z$Y_P=+dmM1Y36hKb;eZn|TyVn!FMRMLfFME$ zBLW2~;z%Hg6w=5biyZPOpokL6sK7uKHPq2S6D_pSK^HyrF~ATbjA3GeDQ1{sfhAU0 zV}mVr*yDgBPLQ5te>mWT3>Vz+zzZMz2q1_M!iYeDiZ~KTB84Q0jSRBLA&&xzD4~oB3{+7=9St(`u zIpk455havSfq^P&sH1@5rF~~aU_sL3Tb4JMGkosP(%r3RA8Wr8tQ1Ei5A-Epo<>* z7+{DI#xOC#6f?}Rz!EF0vB4HQ>~X*mCrHn*KOAsEh6`?Z;Drx<1Q0|BVML%nMH~qv 
zkwO|7WRXK21r$+285J0)qJ}yeXrhHSI_RQ@J_ZH>qei3As8KOQlOm0U5DQTd@x3#zdyFxzaUD(E_dP1esl~$1#^Qot zWocn~zz<+yV>#Ga3Vr~eIdcL(9{zXkGBY>aMLk3FvRGtRM7j{sy&|u2(;nZF+NP*R zyHShQQR%eLAB>JgC!#4j9j&4Zokjk6@1h^0U!vCO_2a*y|2mvDfBi+Ug=;*%3Qh8 z`?}oJG`%f{o7#tRq^W!^H#f1b&yQ7dLaAzrJ>xu z|8Vxtv(dc1d3`;e{d~Gxv!7qREe|%2+xG0|pLXPdW>>qiUtijt{d(&?*{?sP?DwN8 zvfrPSv)`|JU-tW#4rITtda!NxQ1<)XJ=|80WdB~$WZQZ&`}bHKZCgjHsdn2`ZvQ`y zwM|itT2!XvcvPbnmE&dp)kmGHWUWA!G^*hH`!1aL}ZA_5RpSf zhKLLi86q-7WYK6`G#ZV5qL26S-tbVd7XSG>*?dGljriS2l#HAn17abfK#4L56{^(9 zXwalZPMZ#0dh~hcg8`p>@lB)=BaAXefg+Y?HCWE_*cC=YT_+9C6GEElxS(oSX|TxuVTAH{8Qjt91APwso=mTWsYoLP(IUmTBh2y7%eMRms-&QUEM&9$UCfNQ3I^qs(%^UHmKUD z5%->|>+R^pZ|V?@&o0((<)An6wP)n3a;>)z^*3n4TX7lT%i5kTm}7%lxD-A8NISR@ z?fjs9!Nx?qKeZs;uS(O;dC{gKy$1C<%k}6kv~jmSb_m~7;?t{}a9Z0Xy=nr>KXFqp zc#q|rc%)Bb_68>P7w@oLLvy;N7u)rNK{DomoDL7Mp*}qup1;8N|1AXD8ysK138p47 z&y+)RJqtW(f*|x@C5hSx7mOpvetkY?tkuC0|qch`waMn)9UUTP>16kj}3T@+4<4{YD09= z0Bi&6n=;@wIz4N^1+*t-Ko|}EG2jY1mSVz5w9aP2J~WVJf`)o?OxT9{R+&KOJ+j&a z+V;Kz6R><{U2B3JbvR6*?lCC{AW0c33-i7UOo6x zm#Aj@ePT>hC&U{(F4qcepVrmRXjwQuKU*8rhQf*Jd3D-0?L}=goUF#(aFTLbyQuqV zMtppYu}qDZX&oa))eXjSWxJ88j2J7F_l-2=sFALmG%}QP#!6+9nWfA%15gy3IZDGU zQJyl(Airu>sCvNMrhIQ!L3hsFu1v9NVMCGC3;PVK4f5mG8#r^`YK4*8)@wL8WZi+m zm~{++W&gEOU_-jC=BF0g5!ElZ129r!e}m$%tv06Ztj!Rpb=^LMz}`KvI}lLI7keId z|FgfMpB8X>5m?1$Cj-IU+u@WTu+>MM90b(c?aU*%-`AXW^xLCOCW0Cmc8buCO*qM! z(ykv)zYI*2GYVn+Tsk$w536Xs92cb6-_ja;PcN= zE&8*av`*f~TQ5)$&kshZ3r26yB{`3uxJ429KK$ohYLx%Ok3?1N_CKJ@c+bU0bOqz} zk0}T5`F)TASnuZ{8ja)r*I*t0*7#)=I4wn|>G`8SUOybs!(-a&Yr-xFJ=ur=>_NW>C7LV#s-}H#! 
zs7IAj{>h`QQa0<+M)d!9v{vdS`&58B%coo^EAR>D(k=7}vGGkl;arL>J|T8q9`Cy$ zpK!i;l~34Sm-~cxgy|Eu-L-u}41#<@JZxuNUgHy%&3DCRyM2<&WJ1gK=vvG6>RQY8 L=~|0#2JZh4R0rPy literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_n10000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_float_n10000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..c89d39292141f81c865719c5c17d1f6d7d1ed322 GIT binary patch literal 1000 zcmXZbO=uHA7zW_&woPqXYW~RIMh%ifgd(XH6pb|5nfXq7P%jb$gYn?8l&VpoE_$i8 zLNB67l%R+pQHqD+K{N*iQ9%(8IoYBfS}4*(DW&va-Y?%A_TilwcG;bG?XJiLt39sa z!owT03MsNIZ>*~`QRr9@J+Uq}#U;@ycJ>Fvw3rhM;+dk&_pgexVohv_&0;t2Yl-W{ z9?pKZW6JI2oD<3QaW1=k?3BBMopbxyr>>uU;|{Rj+(CB59b&uHPByH9(95V0dq_pt z<0=ZRii)w60Ezr3aye>pgy5* z!lh|_1Du^$T`O`|#n*1Lh-gJZ^r#2O0AY1`e7a)~m{FBafdl zf1p<}e9#v!o84F+y=8u5e(jDqfn2|5_Q3T=W`pOSnSSK6uS^Iw7K{z^O>-Vje>Nkq z_0v?~T1&6qQUmfCJ{&I+Wy6O=3cvG}fWaBx0F(k%Q bgwJH9#I%?Zv*NIr6Gz3vVqR?j%edcvW9p5A literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_n1000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_float_n1000_cpp.sk new file mode 100644 index 0000000000000000000000000000000000000000..e5d099c7ffdcc00e99485010d633bb7e590d6e5b GIT binary patch literal 800 zcmXZbJ4-_`6bJCMKC1PtRZ-M-=;9#MMHj)`Y6t>j=vl(A_+_F-aq?B9EaD!&G=){)j z#UQGL#}A4nu_9`*CZ^(flEr$r;)D1s`lJp0pW<)QjyP}FDbs;N3GLs3_@!*J0S;Hn*qs=6JAS9T(*Z|r2$-`lCEda~0I z|6*t08>>ESF~<6fWBnuTiZH&EBg_wdgnYU{oPUzAo-fj9oFB@B{j_EY`wMe47~B0k zVgG)C=eZ>wuM@)e=ql$)S9o5j#(Bb;#Os_VY!K?9Q|gO(b)8VJuuZ6+-XYX8-6hml z?-A-9_6haZ2ZVERhivPJa9*ur)}0W}RX5nsAl!rWlnr9qMOIG}5wA1}G=efj; z8{Fa^3m);5J(lcqz#%Kv{Nz`=IsWH|znsidKgC(jae<3m<_cH2&P{f>!(Dc{&jTLv um?u2rd7gId@q(AUV##aX@Rohv@tzMH@R3h^=8!LZD6Igd E0i&!3<^TWy literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_n1_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_float_n1_cpp.sk new file mode 100644 
index 0000000000000000000000000000000000000000..94e38d9dc40c413c7397d29ff299d0ec3a27f07c GIT binary patch literal 12 RcmZQ%6iH!VV*r8%djJL40YCr% literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n0_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n0_java.sk new file mode 100644 index 0000000000000000000000000000000000000000..e84c2ea503d5ba25b64de9f767fb3490b28860f5 GIT binary patch literal 8 PcmZQ%6iH!VWMBXQ0)_y5 literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n1000000_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n1000000_java.sk new file mode 100644 index 0000000000000000000000000000000000000000..6b87e1eeb12f364780e191ff2d2839f602da4862 GIT binary patch literal 2224 zcmZA3X-HI26bJA-sV(HB`vEN<$}Yq8aE*RWt`( zQA2alv+5}W{aT|u^wMV9gI>0as?m$~Q4c!SM_bXY2k9Pqc^e%^mmZ;V9FR39=`p(d zG_|1l90jq#6&>UbQ44Cju23;p)Lf-TY_PmRU%^^S7rljgin~jD!qj?qcJ@&CNM*7; zq+}Qf`q)R~Z~%&)QTZ5Ef9JqUvcf<(yss(2rt-q4@2DN>=~jTwj#qi$;795K)5Oo@ zi&ME{-vC`qSFULLK{+mE?f4*drYbu&4pEU4?E6K27?|jR-=x9yVu^+83spU3g>oJ^ zeE0|sWU0KpZxlZ$QH~6=a_(B?N8cma)2Dn!+SvX=xnsiw{sMV;M-2BqSNYK@J6n9p z&+QHlfqZ{i9AAS0wVt2M_j^@+z&(Yn{mM^1P38BmlpXd&t}Iq={gTL*a^*wUlMFt* zcNz=GcVr}UaG|Qd{UMnb7bx%Wrf>s1-zI+wSk?6fexpzPG}<2}4fT;plE;ov*QcdRj)48(nQzQ1a*TQZ3dtVWe}O{D z-LRjt)<|v#e_Jbg5cYr5M#-ljkKHVJNPQi#u1az~)cdwdegb(yjbs=HakEx3jDwh7 zC;1%IudbK89o*U=xfA@dQSt%s8%?qod|{j9Rp9LHk~6`5x@Ur&O_K3^uhtFQU3BV( z_KWwr;W!bY`tNWAtT#z_IM}5dkE;c`p`9XEH?&i%)D7(t#k%3R6lEr_*A49vl_qb| Y4eb$LldDaxHM!2@29y8(5Bb0TAANiA%m4rY literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n100000_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n100000_java.sk new file mode 100644 index 0000000000000000000000000000000000000000..fbb27272ea654dc17b107216bbe2e9ce2ac40c54 GIT binary patch literal 2192 
zcmZA3Ur19?90%~Lsaa|MNn#I{Q;I^t3PKDkzx^TiASEhUX0q0-84X4eBSMYtr>mn%vKy`pF59-?5!7Yk?m|pJqLLGB_y!-UzW$ee+52ncU(gnyZi<|usxlS z9+<8!XcSQ0dkYQ@Si@}`=K$v4!w^~b_23=VBfA$(6tKYQJ|t0Kknse$6fk^#hI9%L zT7wv(z}ahG;uzHjS_Y6zf%3n8gB-4}ev7!|=?S$5-=mWP=1B;nkopHaAK@nZe|BdfLC^#Qf>&rv`@+qQD1wf1rZujT!@4biKj+pRkg( zxDlKr>jNWbrtw_vQTS+n5+_FCq~FtVW=v{t8y~}2s@H4B(L(n2k0YJt)1MhZ8m(`* zI)Xd2{;Ag^ctO^OBIu!dkrhRd^5xwaMK9&i=ZNArS$Ly(PWAfDC>~I|uV1dWev9HD zw@*Z|gFKibU`#KErY#hBMYiM#1j(TSfevyR1RBXkiGZ7I*(|V)oLV7J!mbinMYdH7 zG9PpE+M@iJf`V!*Y298IB14xii2jYmu)jm%dNQ->!xWrHJ@U?;%>zO#X-ek#fDYW8E|jx>9zu{k+>IJ`X9BhC>K<%l zf6-`Rk0-H)U6jHecDN76*tm##cK9-q?BEp~WnUe@d-kqtxXCWQj!|}Y5JT)Ow~)sh zs^bpEIM2O@PwavFNV8)PP{QlukMN80rXhT1cRq&L&>x>*DXrJ-!^radlTWdoay9Y{ z%g8$Y96xw|dIW`>e|UjX_UD)IvGZRcMpn(Q5n>O%!9KE{c#8(IDt`w*SuaRqoS)}w z8V}jYQDOKE?s6V6xJ1^s4HE1LBi^S$1{XL_WpJLX#xm$;`#kiLO|6GcvhMN_Vc+o3 zOg5f}1~Rf9HjveC4|wgqu!#7-DJi7)~#0b0U}oPhuLpMzu=%1 zePrTR^p`$tMZcLNR`j1vSkaH>gcW_Nd#vo=UTJ76`WvSlC#BVCEB+tau{z`Aj#bLZ V9jmiW?pU33a>r`wzx@Am{{xGM!vX*R literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n1000_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n1000_java.sk new file mode 100644 index 0000000000000000000000000000000000000000..7832fc376c8e67faa721dbf0ab4e95953b079262 GIT binary patch literal 1584 zcmZA2%}N4M6bJCptfp@ZD+?IPoj4oTu zBlHNfaM_{xi(!6qX6`-r&bfEoE?-J%A1A-3?$M0>T~e7kx0*3m zH`a-uRYH4(eZqd>fN)TFQ8*;LBpeoA7G4ovB{v_RzAn5Wyopw#)WhFbw$S=Ed3c<5 z(E2X*^0=~xuEwbk;tA^KuI`h{tsKw*x0a+qZsm}Mxa&tW%pE+YQEpA6G4Lskb5~L{ z!QDI~A9v+~rnt2D8MJm!3p^fwz2yMXaf5%aN&SRZPMMtHvGC9E%}g!QMISf2?K>(?-`z8w?mzhUC~@LK4Ih3hLR zd}ZPKG=yyn*Y}68Bm5;?vv7Z@3pa$D!YyH?jP0qyUSYp*P&h2SCL9%R{|El}_aCJV BY%c%+ literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n100_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n100_java.sk new file mode 100644 index 
0000000000000000000000000000000000000000..9d1d60a90feb1ee89c5f72fcaabfec50c18cd8c7 GIT binary patch literal 1248 zcmZ9}IZ8x96o%2;je(&-2L=WrA|fKrBl605YNvMW>;*2QmR3t~3B3e^26=TVDGonh zbo;%*Ps~INqx&Y!RO2-LCrz(AN2J=bcn%NnJYK+ycnL4#6}*bq@H*bWn|KRv;~l(< z_wYVGz=!wso)!WkDF@ANq1g5!f8XIyZ6(&LN^jxTzgal!FTk25Yf#{c~vmF`4X literal 0 HcmV?d00001 diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n10_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n10_java.sk new file mode 100644 index 0000000000000000000000000000000000000000..2c1026eed9a55fb4a38dd5f9ac9bebf162eddfb2 GIT binary patch literal 192 zcmZQ#6iH!VVPN25Kn5S|Av_fa2>rpH5m}mn!2y-efyNg= PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join(TEST_DATA_DIR) + .join(sub_dir) + .join(name) +} + +fn test_sketch_file(path: PathBuf, n: u64, with_buffer: bool, is_f32: bool) { + let bytes = fs::read(&path).unwrap(); + let td = TDigestMut::deserialize(&bytes, is_f32).unwrap(); + let td = td.freeze(); + if n == 0 { + assert!(td.is_empty(), "filepath: {}", path.display()); + assert_eq!(td.total_weight(), 0, "filepath: {}", path.display()); + } else { + assert!(!td.is_empty(), "filepath: {}", path.display()); + assert_eq!(td.total_weight(), n, "filepath: {}", path.display()); + assert_eq!(td.min_value(), Some(1.0), "filepath: {}", path.display()); + assert_eq!( + td.max_value(), + Some(n as f64), + "filepath: {}", + path.display() + ); + assert_eq!(td.rank(0.0), Some(0.0), "filepath: {}", path.display()); + assert_eq!( + td.rank((n + 1) as f64), + Some(1.0), + "filepath: {}", + path.display() + ); + if n == 1 { + assert_eq!(td.rank(n as f64), Some(0.5), "filepath: {}", path.display()); + } else { + assert_that!( + td.rank(n as f64 / 2.).unwrap(), + near(0.5, 0.05), + "filepath: {}", + path.display() + ); + } + } + + if !with_buffer && !is_f32 { + let mut td = td.into_mut(); + let roundtrip_bytes = td.serialize(); + assert_eq!(bytes, roundtrip_bytes, "filepath: {}", 
path.display()); + } +} + +#[test] +fn test_deserialize_from_cpp_snapshots() { + let ns = [0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000]; + for n in ns { + let filename = format!("tdigest_double_n{}_cpp.sk", n); + let path = get_test_data_path("cpp_generated_files", &filename); + test_sketch_file(path, n, false, false); + } + for n in ns { + let filename = format!("tdigest_double_buf_n{}_cpp.sk", n); + let path = get_test_data_path("cpp_generated_files", &filename); + test_sketch_file(path, n, true, false); + } + for n in ns { + let filename = format!("tdigest_float_n{}_cpp.sk", n); + let path = get_test_data_path("cpp_generated_files", &filename); + test_sketch_file(path, n, false, true); + } + for n in ns { + let filename = format!("tdigest_float_buf_n{}_cpp.sk", n); + let path = get_test_data_path("cpp_generated_files", &filename); + test_sketch_file(path, n, true, true); + } +} + +#[test] +fn test_deserialize_from_java_snapshots() { + let ns = [0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000]; + for n in ns { + let filename = format!("tdigest_double_n{}_java.sk", n); + let path = get_test_data_path("java_generated_files", &filename); + test_sketch_file(path, n, false, false); + } +} #[test] fn test_empty() { From 88837bf06ce96436a332e2cc0778cf1226bb4771 Mon Sep 17 00:00:00 2001 From: tison Date: Wed, 17 Dec 2025 08:29:42 +0800 Subject: [PATCH 14/26] support deserialize_compat Signed-off-by: tison --- src/tdigest/serialization.rs | 4 + src/tdigest/sketch.rs | 93 ++++++++++++++++- tests/tdigest_serialization_test.rs | 95 ++++++++++++------ .../tdigest_ref_k100_n10000_double.sk | Bin 0 -> 976 bytes .../tdigest_ref_k100_n10000_float.sk | Bin 0 -> 502 bytes 5 files changed, 158 insertions(+), 34 deletions(-) create mode 100644 tests/test_data/tdigest_ref_k100_n10000_double.sk create mode 100644 tests/test_data/tdigest_ref_k100_n10000_float.sk diff --git a/src/tdigest/serialization.rs b/src/tdigest/serialization.rs index 6d6ef34..e5b9788 100644 --- 
a/src/tdigest/serialization.rs +++ b/src/tdigest/serialization.rs @@ -22,3 +22,7 @@ pub(super) const TDIGEST_FAMILY_ID: u8 = 20; pub(super) const FLAGS_IS_EMPTY: u8 = 1 << 0; pub(super) const FLAGS_IS_SINGLE_VALUE: u8 = 1 << 1; pub(super) const FLAGS_REVERSE_MERGE: u8 = 1 << 2; +/// the format of the reference implementation is using double (f64) precision +pub(super) const COMPAT_DOUBLE: u32 = 1; +/// the format of the reference implementation is using float (f32) precision +pub(super) const COMPAT_FLOAT: u32 = 2; diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 7855729..8ca2ab7 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use byteorder::{LE, ReadBytesExt}; +use byteorder::{BE, LE, ReadBytesExt}; use std::cmp::Ordering; use std::convert::identity; use std::io::Cursor; @@ -315,11 +315,13 @@ impl TDigestMut { /// Supports reading compact format with (float, int) centroids as opposed to (double, long) to /// represent (mean, weight). [^1] /// + /// Supports reading format of the reference implementation (auto-detected) [^2]. + /// /// [^1]: This is to support reading the `tdigest` format from the C++ implementation. 
+ /// [^2]: https://github.com/tdunning/t-digest pub fn deserialize(bytes: &[u8], is_f32: bool) -> Result { fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> SerdeError { - let tag = tag.to_string(); - move |_| SerdeError::InsufficientData(tag) + move |_| SerdeError::InsufficientData(tag.to_string()) } let mut cursor = Cursor::new(bytes); @@ -328,7 +330,9 @@ impl TDigestMut { let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; let family_id = cursor.read_u8().map_err(make_error("family_id"))?; if family_id != TDIGEST_FAMILY_ID { - // TODO: Support reading format of the reference implementation + if preamble_longs == 0 && serial_version == 0 && family_id == 0 { + return Self::deserialize_compat(bytes); + } return Err(SerdeError::InvalidFamily(format!( "expected {} (TDigest), got {}", TDIGEST_FAMILY_ID, family_id @@ -441,6 +445,87 @@ impl TDigestMut { )) } + // compatibility with the format of the reference implementation + // default byte order of ByteBuffer is used there, which is big endian + fn deserialize_compat(bytes: &[u8]) -> Result { + fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> SerdeError { + move |_| SerdeError::InsufficientData(format!("{tag} in compat format")) + } + + let mut cursor = Cursor::new(bytes); + + let ty = cursor.read_u32::().map_err(make_error("type"))?; + match ty { + COMPAT_DOUBLE => { + fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> SerdeError { + move |_| SerdeError::InsufficientData(format!("{tag} in compat double format")) + } + // compatibility with asBytes() + let min = cursor.read_f64::().map_err(make_error("min"))?; + let max = cursor.read_f64::().map_err(make_error("max"))?; + let k = cursor.read_f64::().map_err(make_error("k"))? as u16; + let num_centroids = cursor + .read_u32::() + .map_err(make_error("num_centroids"))? 
+ as usize; + let mut total_weight = 0; + let mut centroids = Vec::with_capacity(num_centroids); + for _ in 0..num_centroids { + let weight = cursor.read_f64::().map_err(make_error("weight"))? as u64; + let mean = cursor.read_f64::().map_err(make_error("mean"))?; + total_weight += weight; + centroids.push(Centroid { mean, weight }); + } + Ok(TDigestMut::make( + k, + false, + min, + max, + centroids, + total_weight, + vec![], + )) + } + COMPAT_FLOAT => { + fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> SerdeError { + move |_| SerdeError::InsufficientData(format!("{tag} in compat float format")) + } + // COMPAT_FLOAT: compatibility with asSmallBytes() + // reference implementation uses doubles for min and max + let min = cursor.read_f64::().map_err(make_error("min"))?; + let max = cursor.read_f64::().map_err(make_error("max"))?; + let k = cursor.read_f32::().map_err(make_error("k"))? as u16; + // reference implementation stores capacities of the array of centroids and the + // buffer as shorts they can be derived from k in the constructor + cursor.read_u32::().map_err(make_error(""))?; + let num_centroids = cursor + .read_u16::() + .map_err(make_error("num_centroids"))? + as usize; + let mut total_weight = 0; + let mut centroids = Vec::with_capacity(num_centroids); + for _ in 0..num_centroids { + let weight = cursor.read_f32::().map_err(make_error("weight"))? as u64; + let mean = cursor.read_f32::().map_err(make_error("mean"))? 
as f64; + total_weight += weight; + centroids.push(Centroid { mean, weight }); + } + Ok(TDigestMut::make( + k, + false, + min, + max, + centroids, + total_weight, + vec![], + )) + } + ty => Err(SerdeError::InvalidParameter(format!( + "unknown TDigest compat type {ty}", + ))), + } + } + fn is_single_value(&self) -> bool { self.total_weight() == 1 } diff --git a/tests/tdigest_serialization_test.rs b/tests/tdigest_serialization_test.rs index 3957edb..3f8b437 100644 --- a/tests/tdigest_serialization_test.rs +++ b/tests/tdigest_serialization_test.rs @@ -17,15 +17,22 @@ use datasketches::tdigest::TDigestMut; use googletest::assert_that; -use googletest::prelude::near; +use googletest::prelude::{eq, near}; use std::fs; use std::path::PathBuf; -const TEST_DATA_DIR: &str = "tests/serialization_test_data"; +const TEST_DATA_DIR: &str = "tests/test_data"; +const SERDE_TEST_DATA_DIR: &str = "tests/serialization_test_data"; -fn get_test_data_path(sub_dir: &str, name: &str) -> PathBuf { +fn get_test_data_path(name: &str) -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join(TEST_DATA_DIR) + .join(name) +} + +fn get_serde_test_data_path(sub_dir: &str, name: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join(SERDE_TEST_DATA_DIR) .join(sub_dir) .join(name) } @@ -34,34 +41,25 @@ fn test_sketch_file(path: PathBuf, n: u64, with_buffer: bool, is_f32: bool) { let bytes = fs::read(&path).unwrap(); let td = TDigestMut::deserialize(&bytes, is_f32).unwrap(); let td = td.freeze(); + + let path = path.display(); if n == 0 { - assert!(td.is_empty(), "filepath: {}", path.display()); - assert_eq!(td.total_weight(), 0, "filepath: {}", path.display()); + assert!(td.is_empty(), "filepath: {path}"); + assert_eq!(td.total_weight(), 0, "filepath: {path}"); } else { - assert!(!td.is_empty(), "filepath: {}", path.display()); - assert_eq!(td.total_weight(), n, "filepath: {}", path.display()); - assert_eq!(td.min_value(), Some(1.0), "filepath: {}", path.display()); - assert_eq!( 
- td.max_value(), - Some(n as f64), - "filepath: {}", - path.display() - ); - assert_eq!(td.rank(0.0), Some(0.0), "filepath: {}", path.display()); - assert_eq!( - td.rank((n + 1) as f64), - Some(1.0), - "filepath: {}", - path.display() - ); + assert!(!td.is_empty(), "filepath: {path}"); + assert_eq!(td.total_weight(), n, "filepath: {path}"); + assert_eq!(td.min_value(), Some(1.0), "filepath: {path}"); + assert_eq!(td.max_value(), Some(n as f64), "filepath: {path}"); + assert_eq!(td.rank(0.0), Some(0.0), "filepath: {path}"); + assert_eq!(td.rank((n + 1) as f64), Some(1.0), "filepath: {path}"); if n == 1 { - assert_eq!(td.rank(n as f64), Some(0.5), "filepath: {}", path.display()); + assert_eq!(td.rank(n as f64), Some(0.5), "filepath: {path}"); } else { assert_that!( td.rank(n as f64 / 2.).unwrap(), near(0.5, 0.05), - "filepath: {}", - path.display() + "filepath: {path}", ); } } @@ -69,7 +67,7 @@ fn test_sketch_file(path: PathBuf, n: u64, with_buffer: bool, is_f32: bool) { if !with_buffer && !is_f32 { let mut td = td.into_mut(); let roundtrip_bytes = td.serialize(); - assert_eq!(bytes, roundtrip_bytes, "filepath: {}", path.display()); + assert_eq!(bytes, roundtrip_bytes, "filepath: {path}"); } } @@ -78,32 +76,69 @@ fn test_deserialize_from_cpp_snapshots() { let ns = [0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000]; for n in ns { let filename = format!("tdigest_double_n{}_cpp.sk", n); - let path = get_test_data_path("cpp_generated_files", &filename); + let path = get_serde_test_data_path("cpp_generated_files", &filename); test_sketch_file(path, n, false, false); } for n in ns { let filename = format!("tdigest_double_buf_n{}_cpp.sk", n); - let path = get_test_data_path("cpp_generated_files", &filename); + let path = get_serde_test_data_path("cpp_generated_files", &filename); test_sketch_file(path, n, true, false); } for n in ns { let filename = format!("tdigest_float_n{}_cpp.sk", n); - let path = get_test_data_path("cpp_generated_files", &filename); + let path = 
get_serde_test_data_path("cpp_generated_files", &filename); test_sketch_file(path, n, false, true); } for n in ns { let filename = format!("tdigest_float_buf_n{}_cpp.sk", n); - let path = get_test_data_path("cpp_generated_files", &filename); + let path = get_serde_test_data_path("cpp_generated_files", &filename); test_sketch_file(path, n, true, true); } } +#[test] +fn test_deserialize_from_reference_implementation() { + for filename in [ + "tdigest_ref_k100_n10000_double.sk", + "tdigest_ref_k100_n10000_float.sk", + ] { + let path = get_test_data_path(filename); + let bytes = fs::read(&path).unwrap(); + let td = TDigestMut::deserialize(&bytes, false).unwrap(); + let td = td.freeze(); + + let n = 10000; + let path = path.display(); + assert_eq!(td.k(), 100, "filepath: {path}"); + assert_eq!(td.total_weight(), n, "filepath: {path}"); + assert_eq!(td.min_value(), Some(0.0), "filepath: {path}"); + assert_eq!(td.max_value(), Some((n - 1) as f64), "filepath: {path}"); + assert_that!(td.rank(0.0).unwrap(), near(0.0, 0.0001), "filepath: {path}"); + assert_that!( + td.rank(n as f64 / 4.).unwrap(), + near(0.25, 0.0001), + "filepath: {path}" + ); + assert_that!( + td.rank(n as f64 / 2.).unwrap(), + near(0.5, 0.0001), + "filepath: {path}" + ); + assert_that!( + td.rank((n * 3) as f64 / 4.).unwrap(), + near(0.75, 0.0001), + "filepath: {path}" + ); + assert_that!(td.rank(n as f64).unwrap(), eq(1.0), "filepath: {path}"); + } +} + #[test] fn test_deserialize_from_java_snapshots() { let ns = [0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000]; for n in ns { let filename = format!("tdigest_double_n{}_java.sk", n); - let path = get_test_data_path("java_generated_files", &filename); + let path = get_serde_test_data_path("java_generated_files", &filename); test_sketch_file(path, n, false, false); } } diff --git a/tests/test_data/tdigest_ref_k100_n10000_double.sk b/tests/test_data/tdigest_ref_k100_n10000_double.sk new file mode 100644 index 
0000000000000000000000000000000000000000..f6f4510ed714e80d76af612ce028222e703068cb GIT binary patch literal 976 zcmZ9L%_~Gv7{-sCg@pyOkVeS{<)ct&-h8E|WKvTSCKKhOB$Gl@C+^HN7A#myS-2^z z6s1VXHTRETF$+6e`3v0lJnxxi<}U8<`91G@HZ$icrD~KsJiw#UuM1-u{Z1Z?itmXh2z^R?&4W92;HnGeWWE)LpM8M`28u5J;Dq8W;O z{E;+neZ~YD;VI9T$*`Z_?3^9Ji0Ztt&u1jjhoKlV5h ezk-K+$77bh4?nFdJ;Pcy&*)0evX;&9?fwH&3x#z6 literal 0 HcmV?d00001 diff --git a/tests/test_data/tdigest_ref_k100_n10000_float.sk b/tests/test_data/tdigest_ref_k100_n10000_float.sk new file mode 100644 index 0000000000000000000000000000000000000000..16d798111318e3bcd42b4fe5fc5f308ba3928a14 GIT binary patch literal 502 zcmXxhF-XHe7zN;mWDo~Yv1pqhbty$EWDqG;FuA*A5FH$ZA`S%?-McE;1gX-oh=U;5 zO&p4JPzbn)lZcDr(7{1UhmIm5zDv)*pYMM>xWEO#5X|GTWu`boQ-F+ipwh$Fhn z%rj&yhTxg96+>odk1hf#TY;RXYs@?JfIiV?9r87TdXJ&<5rXkys9cPo)Q4IJg1ud+ zP3Dsh)ExWE52%ud;LfG&U#d{E?7Pes{b@j60HO5@*`y;m$bC9`3%Nlj%8+TADnPu^ zsUpM~oxXzDrs+0>Lo-$Q9ql*#2DQt{eT4QY{H)H8@N{qY;GI*)O71>%CX;&)of%^; Q`5&QEFuKMCW6`*bKP#MQ5dZ)H literal 0 HcmV?d00001 From fe04e845e894013831bea8058308842649d1c32f Mon Sep 17 00:00:00 2001 From: tison Date: Wed, 17 Dec 2025 09:05:44 +0800 Subject: [PATCH 15/26] impl cdf and pmf Signed-off-by: tison --- src/tdigest/sketch.rs | 154 +++++++++++++++++++++++++++++++++++++++++- tests/tdigest_test.rs | 39 +++++++---- 2 files changed, 176 insertions(+), 17 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 8ca2ab7..c1a1882 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -182,7 +182,8 @@ impl TDigestMut { } } - fn view(&self) -> TDigestView<'_> { + fn view(&mut self) -> TDigestView<'_> { + self.compress(); // side effect TDigestView { min: self.min, max: self.max, @@ -191,6 +192,65 @@ impl TDigestMut { } } + /// Returns an approximation to the Cumulative Distribution Function (CDF), which is the + /// cumulative analog of the PMF, of the input stream given a set of split points. 
+ /// + /// # Arguments + /// + /// * `split_points`: An array of _m_ unique, monotonically increasing values that divide the + /// input domain into _m+1_ consecutive disjoint intervals. + /// + /// # Returns + /// + /// An array of m+1 doubles, which are a consecutive approximation to the CDF of the input + /// stream given the split points. The value at array position j of the returned CDF array + /// is the sum of the returned values in positions 0 through j of the returned PMF array. + /// This can be viewed as array of ranks of the given split points plus one more value that + /// is always 1. + /// + /// Returns `None` if TDigest is empty. + /// + /// # Panics + /// + /// If `split_points` is not unique, not monotonically increasing, or contains `NaN` values. + pub fn cdf(&mut self, split_points: &[f64]) -> Option> { + check_split_points(split_points); + + if self.is_empty() { + return None; + } + + self.view().cdf(split_points) + } + + /// Returns an approximation to the Probability Mass Function (PMF) of the input stream + /// given a set of split points. + /// + /// # Arguments + /// + /// * `split_points`: An array of _m_ unique, monotonically increasing values that divide the + /// input domain into _m+1_ consecutive disjoint intervals (bins). + /// + /// # Returns + /// + /// An array of m+1 doubles each of which is an approximation to the fraction of the input + /// stream values (the mass) that fall into one of those intervals. + /// + /// Returns `None` if TDigest is empty. + /// + /// # Panics + /// + /// If `split_points` is not unique, not monotonically increasing, or contains `NaN` values. + pub fn pmf(&mut self, split_points: &[f64]) -> Option> { + check_split_points(split_points); + + if self.is_empty() { + return None; + } + + self.view().pmf(split_points) + } + /// Compute approximate normalized rank (from 0 to 1 inclusive) of the given value. /// /// Returns `None` if TDigest is empty. 
@@ -215,7 +275,6 @@ impl TDigestMut { return Some(0.5); } - self.compress(); // side effect self.view().rank(value) } @@ -233,7 +292,6 @@ impl TDigestMut { return None; } - self.compress(); // side effect self.view().quantile(rank) } @@ -658,6 +716,53 @@ impl TDigest { } } + /// Returns an approximation to the Cumulative Distribution Function (CDF), which is the + /// cumulative analog of the PMF, of the input stream given a set of split points. + /// + /// # Arguments + /// + /// * `split_points`: An array of _m_ unique, monotonically increasing values that divide the + /// input domain into _m+1_ consecutive disjoint intervals. + /// + /// # Returns + /// + /// An array of m+1 doubles, which are a consecutive approximation to the CDF of the input + /// stream given the split points. The value at array position j of the returned CDF array + /// is the sum of the returned values in positions 0 through j of the returned PMF array. + /// This can be viewed as array of ranks of the given split points plus one more value that + /// is always 1. + /// + /// Returns `None` if TDigest is empty. + /// + /// # Panics + /// + /// If `split_points` is not unique, not monotonically increasing, or contains `NaN` values. + pub fn cdf(&self, split_points: &[f64]) -> Option> { + self.view().cdf(split_points) + } + + /// Returns an approximation to the Probability Mass Function (PMF) of the input stream + /// given a set of split points. + /// + /// # Arguments + /// + /// * `split_points`: An array of _m_ unique, monotonically increasing values that divide the + /// input domain into _m+1_ consecutive disjoint intervals (bins). + /// + /// # Returns + /// + /// An array of m+1 doubles each of which is an approximation to the fraction of the input + /// stream values (the mass) that fall into one of those intervals. + /// + /// Returns `None` if TDigest is empty. 
+ /// + /// # Panics + /// + /// If `split_points` is not unique, not monotonically increasing, or contains `NaN` values. + pub fn pmf(&self, split_points: &[f64]) -> Option> { + self.view().pmf(split_points) + } + /// Compute approximate normalized rank (from 0 to 1 inclusive) of the given value. /// /// Returns `None` if TDigest is empty. @@ -705,6 +810,32 @@ struct TDigestView<'a> { } impl TDigestView<'_> { + fn pmf(&self, split_points: &[f64]) -> Option> { + let mut buckets = self.cdf(split_points)?; + for i in (1..buckets.len()).rev() { + buckets[i] -= buckets[i - 1]; + } + Some(buckets) + } + + fn cdf(&self, split_points: &[f64]) -> Option> { + check_split_points(split_points); + + if self.centroids.is_empty() { + return None; + } + + let mut ranks = Vec::with_capacity(split_points.len() + 1); + for &p in split_points { + match self.rank(p) { + Some(rank) => ranks.push(rank), + None => unreachable!("non-empty TDigest never returns None from rank"), + } + } + ranks.push(1.0); + Some(ranks) + } + fn rank(&self, value: f64) -> Option { debug_assert!(!value.is_nan(), "value must not be NaN"); @@ -882,6 +1013,23 @@ impl TDigestView<'_> { } } +/// Checks the sequential validity of the given array of double values. +/// They must be unique, monotonically increasing and not NaN. 
+#[track_caller] +fn check_split_points(split_points: &[f64]) { + let len = split_points.len(); + if len == 1 && split_points[0].is_nan() { + panic!("split_points must not contain NaN values: {split_points:?}"); + } + for i in 0..len - 1 { + if split_points[i] < split_points[i + 1] { + // we must use this positive condition because NaN comparisons are always false + continue; + } + panic!("split_points must be unique and monotonically increasing: {split_points:?}"); + } +} + fn centroid_cmp(a: &Centroid, b: &Centroid) -> Ordering { match a.mean.partial_cmp(&b.mean) { Some(order) => order, diff --git a/tests/tdigest_test.rs b/tests/tdigest_test.rs index 7107b2a..1eeeb55 100644 --- a/tests/tdigest_test.rs +++ b/tests/tdigest_test.rs @@ -30,10 +30,22 @@ fn test_empty() { assert_eq!(tdigest.rank(0.0), None); assert_eq!(tdigest.quantile(0.5), None); - // TODO: Support PMF and CDF - // const double split_points[1] {0}; - // REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error); - // REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error); + let split_points = [0.0]; + assert_eq!(tdigest.pmf(&split_points), None); + assert_eq!(tdigest.cdf(&split_points), None); + + let tdigest = TDigestMut::new(10).freeze(); + assert!(tdigest.is_empty()); + assert_eq!(tdigest.k(), 10); + assert_eq!(tdigest.total_weight(), 0); + assert_eq!(tdigest.min_value(), None); + assert_eq!(tdigest.max_value(), None); + assert_eq!(tdigest.rank(0.0), None); + assert_eq!(tdigest.quantile(0.5), None); + + let split_points = [0.0]; + assert_eq!(tdigest.pmf(&split_points), None); + assert_eq!(tdigest.cdf(&split_points), None); } #[test] @@ -89,16 +101,15 @@ fn test_many_values() { ); assert_that!(tdigest.quantile(1.0).unwrap(), eq((n - 1) as f64)); - // TODO: Later until PMF and CDF are supported - // const double split_points[1] {n / 2}; - // const auto pmf = td.get_PMF(split_points, 1); - // REQUIRE(pmf.size() == 2); - // REQUIRE(pmf[0] == Approx(0.5).margin(0.0001)); - // 
REQUIRE(pmf[1] == Approx(0.5).margin(0.0001)); - // const auto cdf = td.get_CDF(split_points, 1); - // REQUIRE(cdf.size() == 2); - // REQUIRE(cdf[0] == Approx(0.5).margin(0.0001)); - // REQUIRE(cdf[1] == 1); + let split_points = [n as f64 / 2.0]; + let pmf = tdigest.pmf(&split_points).unwrap(); + assert_eq!(pmf.len(), 2); + assert_that!(pmf[0], near(0.5, 0.0001)); + assert_that!(pmf[1], near(0.5, 0.0001)); + let cdf = tdigest.cdf(&split_points).unwrap(); + assert_eq!(cdf.len(), 2); + assert_that!(cdf[0], near(0.5, 0.0001)); + assert_that!(cdf[1], eq(1.0)); } #[test] From c2e322a6403fe32a759ecaed316e0f8939060e41 Mon Sep 17 00:00:00 2001 From: tison Date: Wed, 17 Dec 2025 09:08:16 +0800 Subject: [PATCH 16/26] fine tune docs Signed-off-by: tison --- src/tdigest/sketch.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index c1a1882..d20e3e0 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -30,7 +30,7 @@ const BUFFER_MULTIPLIER: usize = 4; /// T-Digest sketch for estimating quantiles and ranks. /// -/// See the [module documentation](self) for more details. +/// See the [module documentation](super) for more details. #[derive(Debug, Clone)] pub struct TDigestMut { k: u16, @@ -376,7 +376,7 @@ impl TDigestMut { /// Supports reading format of the reference implementation (auto-detected) [^2]. /// /// [^1]: This is to support reading the `tdigest` format from the C++ implementation. - /// [^2]: https://github.com/tdunning/t-digest + /// [^2]: pub fn deserialize(bytes: &[u8], is_f32: bool) -> Result { fn make_error(tag: &'static str) -> impl FnOnce(std::io::Error) -> SerdeError { move |_| SerdeError::InsufficientData(tag.to_string()) @@ -660,7 +660,7 @@ impl TDigestMut { /// Immutable (frozen) T-Digest sketch for estimating quantiles and ranks. /// -/// See the [module documentation](self) for more details. +/// See the [module documentation](super) for more details. 
pub struct TDigest { k: u16, From 8d7ed90fbe24e64db5ced7fe0526c9be9c1c67c3 Mon Sep 17 00:00:00 2001 From: tison Date: Wed, 17 Dec 2025 09:31:44 +0800 Subject: [PATCH 17/26] naming and let to do the reserve Signed-off-by: tison --- src/tdigest/sketch.rs | 5 ++--- tests/tdigest_serialization_test.rs | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index d20e3e0..1cd7b7a 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -788,8 +788,7 @@ impl TDigest { } /// Converts this immutable TDigest into a mutable one. - pub fn into_mut(mut self) -> TDigestMut { - self.centroids.reserve(self.centroids_capacity); + pub fn unfreeze(self) -> TDigestMut { TDigestMut::make( self.k, self.reverse_merge, @@ -797,7 +796,7 @@ impl TDigest { self.max, self.centroids, self.centroids_weight, - Vec::with_capacity(self.centroids_capacity * BUFFER_MULTIPLIER), + vec![], ) } } diff --git a/tests/tdigest_serialization_test.rs b/tests/tdigest_serialization_test.rs index 3f8b437..74fbdb8 100644 --- a/tests/tdigest_serialization_test.rs +++ b/tests/tdigest_serialization_test.rs @@ -65,7 +65,7 @@ fn test_sketch_file(path: PathBuf, n: u64, with_buffer: bool, is_f32: bool) { } if !with_buffer && !is_f32 { - let mut td = td.into_mut(); + let mut td = td.unfreeze(); let roundtrip_bytes = td.serialize(); assert_eq!(bytes, roundtrip_bytes, "filepath: {path}"); } From 11fee5fe48afd4deff8f0c7d170a9c5275d100f8 Mon Sep 17 00:00:00 2001 From: tison Date: Wed, 17 Dec 2025 09:32:23 +0800 Subject: [PATCH 18/26] further tidy Signed-off-by: tison --- src/tdigest/sketch.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 1cd7b7a..e1e8b4c 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -178,7 +178,6 @@ impl TDigestMut { max: self.max, centroids: self.centroids, centroids_weight: self.centroids_weight, - centroids_capacity: self.centroids_capacity, } } @@ -670,7 
+669,6 @@ pub struct TDigest { centroids: Vec, centroids_weight: u64, - centroids_capacity: usize, } impl TDigest { From bebd87cdbdfea74ed11d59f37904411c1798d9ce Mon Sep 17 00:00:00 2001 From: tison Date: Wed, 17 Dec 2025 10:02:11 +0800 Subject: [PATCH 19/26] best effort avoid NaN Signed-off-by: tison --- src/tdigest/sketch.rs | 54 +++++++++++++++++++++++++++++++++++++++---- tests/tdigest_test.rs | 26 +++++++++++++++++++++ 2 files changed, 75 insertions(+), 5 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index e1e8b4c..653e865 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -381,6 +381,20 @@ impl TDigestMut { move |_| SerdeError::InsufficientData(tag.to_string()) } + fn check_non_nan(value: f64, tag: &'static str) -> Result<(), SerdeError> { + if value.is_nan() { + return Err(SerdeError::MalformedData(format!("{tag} cannot be NaN"))); + } + Ok(()) + } + + fn check_nonzero(value: u64, tag: &'static str) -> Result<(), SerdeError> { + if value == 0 { + return Err(SerdeError::MalformedData(format!("{tag} cannot be zero"))); + } + Ok(()) + } + let mut cursor = Cursor::new(bytes); let preamble_longs = cursor.read_u8().map_err(make_error("preamble_longs"))?; @@ -432,6 +446,7 @@ impl TDigestMut { .read_f64::() .map_err(make_error("single_value"))? 
}; + check_non_nan(value, "single_value")?; return Ok(TDigestMut::make( k, reverse_merge, @@ -462,6 +477,8 @@ impl TDigestMut { cursor.read_f64::().map_err(make_error("max"))?, ) }; + check_non_nan(min, "min")?; + check_non_nan(max, "max")?; let mut centroids = Vec::with_capacity(num_centroids); let mut centroids_weight = 0; for _ in 0..num_centroids { @@ -476,12 +493,14 @@ impl TDigestMut { cursor.read_u64::().map_err(make_error("weight"))?, ) }; + check_non_nan(mean, "centroid mean")?; + check_nonzero(weight, "centroid weight")?; centroids_weight += weight; centroids.push(Centroid { mean, weight }); } let mut buffer = Vec::with_capacity(num_buffered); for _ in 0..num_buffered { - buffer.push(if is_f32 { + let value = if is_f32 { cursor .read_f32::() .map_err(make_error("buffered_value"))? as f64 @@ -489,7 +508,9 @@ impl TDigestMut { cursor .read_f64::() .map_err(make_error("buffered_value"))? - }) + }; + check_non_nan(value, "buffered_value mean")?; + buffer.push(value); } Ok(TDigestMut::make( k, @@ -826,7 +847,7 @@ impl TDigestView<'_> { for &p in split_points { match self.rank(p) { Some(rank) => ranks.push(rank), - None => unreachable!("non-empty TDigest never returns None from rank"), + None => unreachable!("checked non-empty above"), } } ranks.push(1.0); @@ -1030,7 +1051,7 @@ fn check_split_points(split_points: &[f64]) { fn centroid_cmp(a: &Centroid, b: &Centroid) -> Ordering { match a.mean.partial_cmp(&b.mean) { Some(order) => order, - None => unreachable!("NaN values should never be present in centroids"), + None => unreachable!("NaN values should not be present in centroids"), } } @@ -1060,7 +1081,30 @@ impl Centroid { fn add(&mut self, other: Centroid) { if self.weight != 0 { let total_weight = self.weight + other.weight; - self.mean += (other.weight as f64) * (other.mean - self.mean) / (total_weight as f64); + let (self_mean, other_mean) = (self.mean, other.mean); + match (self_mean, other_mean) { + (f64::INFINITY, f64::INFINITY) => self.mean = 
f64::INFINITY, + (f64::NEG_INFINITY, f64::NEG_INFINITY) => self.mean = f64::NEG_INFINITY, + _ => { + debug_assert!( + !self_mean.is_nan() && !other_mean.is_nan(), + "NaN values should never be present in centroids; self: {}, other: {}", + self_mean, + other_mean + ); + self.mean = self_mean + + ((other.weight as f64) * (other_mean - self_mean) + / (total_weight as f64)); + } + } + + debug_assert!( + !self.mean.is_nan(), + "NaN values should never be present in centroids; self: {}, other: {}", + self_mean, + other_mean + ); + self.weight = total_weight; } else { self.mean = other.mean; diff --git a/tests/tdigest_test.rs b/tests/tdigest_test.rs index 1eeeb55..e948915 100644 --- a/tests/tdigest_test.rs +++ b/tests/tdigest_test.rs @@ -194,3 +194,29 @@ fn test_merge_large() { assert_that!(td1.rank((n * 3 / 4) as f64).unwrap(), near(0.75, 0.0001)); assert_that!(td1.rank(n as f64).unwrap(), eq(1.0)); } + +#[test] +fn test_infinite() { + let mut td = TDigestMut::new(10); + for _ in 0..10000 { + td.update(f64::INFINITY); + } + assert_eq!(td.quantile(0.5), Some(f64::INFINITY)); + + let mut td = TDigestMut::new(10); + for _ in 0..10000 { + td.update(f64::NEG_INFINITY); + } + assert_eq!(td.quantile(0.5), Some(f64::NEG_INFINITY)); + + // FIXME: merging -inf and inf results in NaN centroid mean + // let mut td = TDigestMut::new(10); + // for i in 0..10000 { + // if i % 2 == 0 { + // td.update(f64::INFINITY); + // } else { + // td.update(f64::NEG_INFINITY); + // } + // } + // assert!(td.quantile(0.5).is_some()); +} From 243dc289b4e65d7b1df74c206570ec7fbc8242f7 Mon Sep 17 00:00:00 2001 From: tison Date: Wed, 17 Dec 2025 10:13:29 +0800 Subject: [PATCH 20/26] fixup! 
best effort avoid NaN Signed-off-by: tison --- src/tdigest/sketch.rs | 54 +++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 653e865..3cf2675 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -381,20 +381,6 @@ impl TDigestMut { move |_| SerdeError::InsufficientData(tag.to_string()) } - fn check_non_nan(value: f64, tag: &'static str) -> Result<(), SerdeError> { - if value.is_nan() { - return Err(SerdeError::MalformedData(format!("{tag} cannot be NaN"))); - } - Ok(()) - } - - fn check_nonzero(value: u64, tag: &'static str) -> Result<(), SerdeError> { - if value == 0 { - return Err(SerdeError::MalformedData(format!("{tag} cannot be zero"))); - } - Ok(()) - } - let mut cursor = Cursor::new(bytes); let preamble_longs = cursor.read_u8().map_err(make_error("preamble_longs"))?; @@ -416,6 +402,11 @@ impl TDigestMut { ))); } let k = cursor.read_u16::().map_err(make_error("k"))?; + if k < 10 { + return Err(SerdeError::InvalidParameter(format!( + "k must be at least 10, got {k}" + ))); + } let flags = cursor.read_u8().map_err(make_error("flags"))?; let is_empty = (flags & FLAGS_IS_EMPTY) != 0; let is_single_value = (flags & FLAGS_IS_SINGLE_VALUE) != 0; @@ -540,8 +531,15 @@ impl TDigestMut { } // compatibility with asBytes() let min = cursor.read_f64::().map_err(make_error("min"))?; + check_non_nan(min, "min in compat format")?; let max = cursor.read_f64::().map_err(make_error("max"))?; + check_non_nan(max, "max in compat format")?; let k = cursor.read_f64::().map_err(make_error("k"))? as u16; + if k < 10 { + return Err(SerdeError::InvalidParameter(format!( + "k must be at least 10, got {k}" + ))); + } let num_centroids = cursor .read_u32::() .map_err(make_error("num_centroids"))? 
@@ -550,7 +548,9 @@ impl TDigestMut { let mut centroids = Vec::with_capacity(num_centroids); for _ in 0..num_centroids { let weight = cursor.read_f64::().map_err(make_error("weight"))? as u64; + check_nonzero(weight, "centroid weight in compat format")?; let mean = cursor.read_f64::().map_err(make_error("mean"))?; + check_non_nan(mean, "centroid mean in compat format")?; total_weight += weight; centroids.push(Centroid { mean, weight }); } @@ -571,8 +571,15 @@ impl TDigestMut { // COMPAT_FLOAT: compatibility with asSmallBytes() // reference implementation uses doubles for min and max let min = cursor.read_f64::().map_err(make_error("min"))?; + check_non_nan(min, "min in compat format")?; let max = cursor.read_f64::().map_err(make_error("max"))?; + check_non_nan(max, "max in compat format")?; let k = cursor.read_f32::().map_err(make_error("k"))? as u16; + if k < 10 { + return Err(SerdeError::InvalidParameter(format!( + "k must be at least 10, got {k}" + ))); + } // reference implementation stores capacities of the array of centroids and the // buffer as shorts they can be derived from k in the constructor cursor.read_u32::().map_err(make_error(""))?; @@ -584,7 +591,9 @@ impl TDigestMut { let mut centroids = Vec::with_capacity(num_centroids); for _ in 0..num_centroids { let weight = cursor.read_f32::().map_err(make_error("weight"))? as u64; + check_nonzero(weight, "centroid weight in compat format")?; let mean = cursor.read_f32::().map_err(make_error("mean"))? 
as f64; + check_non_nan(mean, "centroid mean in compat format")?; total_weight += weight; centroids.push(Centroid { mean, weight }); } @@ -1051,7 +1060,8 @@ fn check_split_points(split_points: &[f64]) { fn centroid_cmp(a: &Centroid, b: &Centroid) -> Ordering { match a.mean.partial_cmp(&b.mean) { Some(order) => order, - None => unreachable!("NaN values should not be present in centroids"), + // FIXME: avoid panic on NaN but since now on the result is undefined + None => f64::total_cmp(&a.mean, &b.mean), } } @@ -1113,6 +1123,20 @@ impl Centroid { } } +fn check_non_nan(value: f64, tag: &'static str) -> Result<(), SerdeError> { + if value.is_nan() { + return Err(SerdeError::MalformedData(format!("{tag} cannot be NaN"))); + } + Ok(()) +} + +fn check_nonzero(value: u64, tag: &'static str) -> Result<(), SerdeError> { + if value == 0 { + return Err(SerdeError::MalformedData(format!("{tag} cannot be zero"))); + } + Ok(()) +} + /// Generates cluster sizes proportional to `q*(1-q)`. /// /// The use of a normalizing function results in a strictly bounded number of clusters no matter From 2a4ad3db11958d319ed9dcea19ddb9977f9c686e Mon Sep 17 00:00:00 2001 From: tison Date: Wed, 17 Dec 2025 10:35:04 +0800 Subject: [PATCH 21/26] concrete tag Signed-off-by: tison --- src/tdigest/sketch.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 3cf2675..93d1d94 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -531,13 +531,13 @@ impl TDigestMut { } // compatibility with asBytes() let min = cursor.read_f64::().map_err(make_error("min"))?; - check_non_nan(min, "min in compat format")?; + check_non_nan(min, "min in compat double format")?; let max = cursor.read_f64::().map_err(make_error("max"))?; - check_non_nan(max, "max in compat format")?; + check_non_nan(max, "max in compat double format")?; let k = cursor.read_f64::().map_err(make_error("k"))? 
as u16; if k < 10 { return Err(SerdeError::InvalidParameter(format!( - "k must be at least 10, got {k}" + "k must be at least 10, got {k} in compat double format" ))); } let num_centroids = cursor @@ -548,9 +548,9 @@ impl TDigestMut { let mut centroids = Vec::with_capacity(num_centroids); for _ in 0..num_centroids { let weight = cursor.read_f64::().map_err(make_error("weight"))? as u64; - check_nonzero(weight, "centroid weight in compat format")?; + check_nonzero(weight, "centroid weight in compat double format")?; let mean = cursor.read_f64::().map_err(make_error("mean"))?; - check_non_nan(mean, "centroid mean in compat format")?; + check_non_nan(mean, "centroid mean in compat double format")?; total_weight += weight; centroids.push(Centroid { mean, weight }); } @@ -571,13 +571,13 @@ impl TDigestMut { // COMPAT_FLOAT: compatibility with asSmallBytes() // reference implementation uses doubles for min and max let min = cursor.read_f64::().map_err(make_error("min"))?; - check_non_nan(min, "min in compat format")?; + check_non_nan(min, "min in compat float format")?; let max = cursor.read_f64::().map_err(make_error("max"))?; - check_non_nan(max, "max in compat format")?; + check_non_nan(max, "max in compat float format")?; let k = cursor.read_f32::().map_err(make_error("k"))? as u16; if k < 10 { return Err(SerdeError::InvalidParameter(format!( - "k must be at least 10, got {k}" + "k must be at least 10, got {k} in compat float format" ))); } // reference implementation stores capacities of the array of centroids and the @@ -591,9 +591,9 @@ impl TDigestMut { let mut centroids = Vec::with_capacity(num_centroids); for _ in 0..num_centroids { let weight = cursor.read_f32::().map_err(make_error("weight"))? as u64; - check_nonzero(weight, "centroid weight in compat format")?; + check_nonzero(weight, "centroid weight in compat float format")?; let mean = cursor.read_f32::().map_err(make_error("mean"))? 
as f64; - check_non_nan(mean, "centroid mean in compat format")?; + check_non_nan(mean, "centroid mean in compat float format")?; total_weight += weight; centroids.push(Centroid { mean, weight }); } From ddbe0e2e6ed2c4f5c70c50ea990aed535d816913 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 18 Dec 2025 07:19:32 +0800 Subject: [PATCH 22/26] filter invalid inputs Signed-off-by: tison --- src/tdigest/sketch.rs | 52 +++++++++++++++++++++---------------------- tests/tdigest_test.rs | 39 +++++++++++++++++++------------- 2 files changed, 48 insertions(+), 43 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 93d1d94..932f9d2 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -99,9 +99,11 @@ impl TDigestMut { } } - /// Update this TDigest with the given value (`NaN` values are ignored). + /// Update this TDigest with the given value. + /// + /// [f64::NAN], [f64::INFINITY], and [f64::NEG_INFINITY] values are ignored. pub fn update(&mut self, value: f64) { - if value.is_nan() { + if value.is_nan() || value.is_infinite() { return; } @@ -438,6 +440,7 @@ impl TDigestMut { .map_err(make_error("single_value"))? }; check_non_nan(value, "single_value")?; + check_non_infinite(value, "single_value")?; return Ok(TDigestMut::make( k, reverse_merge, @@ -485,6 +488,7 @@ impl TDigestMut { ) }; check_non_nan(mean, "centroid mean")?; + check_non_infinite(mean, "centroid")?; check_nonzero(weight, "centroid weight")?; centroids_weight += weight; centroids.push(Centroid { mean, weight }); @@ -501,6 +505,7 @@ impl TDigestMut { .map_err(make_error("buffered_value"))? 
}; check_non_nan(value, "buffered_value mean")?; + check_non_infinite(value, "buffered_value mean")?; buffer.push(value); } Ok(TDigestMut::make( @@ -531,8 +536,8 @@ impl TDigestMut { } // compatibility with asBytes() let min = cursor.read_f64::().map_err(make_error("min"))?; - check_non_nan(min, "min in compat double format")?; let max = cursor.read_f64::().map_err(make_error("max"))?; + check_non_nan(min, "min in compat double format")?; check_non_nan(max, "max in compat double format")?; let k = cursor.read_f64::().map_err(make_error("k"))? as u16; if k < 10 { @@ -548,9 +553,10 @@ impl TDigestMut { let mut centroids = Vec::with_capacity(num_centroids); for _ in 0..num_centroids { let weight = cursor.read_f64::().map_err(make_error("weight"))? as u64; - check_nonzero(weight, "centroid weight in compat double format")?; let mean = cursor.read_f64::().map_err(make_error("mean"))?; + check_nonzero(weight, "centroid weight in compat double format")?; check_non_nan(mean, "centroid mean in compat double format")?; + check_non_infinite(mean, "centroid mean in compat double format")?; total_weight += weight; centroids.push(Centroid { mean, weight }); } @@ -571,8 +577,8 @@ impl TDigestMut { // COMPAT_FLOAT: compatibility with asSmallBytes() // reference implementation uses doubles for min and max let min = cursor.read_f64::().map_err(make_error("min"))?; - check_non_nan(min, "min in compat float format")?; let max = cursor.read_f64::().map_err(make_error("max"))?; + check_non_nan(min, "min in compat float format")?; check_non_nan(max, "max in compat float format")?; let k = cursor.read_f32::().map_err(make_error("k"))? as u16; if k < 10 { @@ -591,9 +597,10 @@ impl TDigestMut { let mut centroids = Vec::with_capacity(num_centroids); for _ in 0..num_centroids { let weight = cursor.read_f32::().map_err(make_error("weight"))? as u64; - check_nonzero(weight, "centroid weight in compat float format")?; let mean = cursor.read_f32::().map_err(make_error("mean"))? 
as f64; + check_nonzero(weight, "centroid weight in compat float format")?; check_non_nan(mean, "centroid mean in compat float format")?; + check_non_infinite(mean, "centroid mean in compat float format")?; total_weight += weight; centroids.push(Centroid { mean, weight }); } @@ -1060,8 +1067,7 @@ fn check_split_points(split_points: &[f64]) { fn centroid_cmp(a: &Centroid, b: &Centroid) -> Ordering { match a.mean.partial_cmp(&b.mean) { Some(order) => order, - // FIXME: avoid panic on NaN but since now on the result is undefined - None => f64::total_cmp(&a.mean, &b.mean), + None => unreachable!("NaN values should never be present in centroids"), } } @@ -1090,32 +1096,17 @@ struct Centroid { impl Centroid { fn add(&mut self, other: Centroid) { if self.weight != 0 { - let total_weight = self.weight + other.weight; - let (self_mean, other_mean) = (self.mean, other.mean); - match (self_mean, other_mean) { - (f64::INFINITY, f64::INFINITY) => self.mean = f64::INFINITY, - (f64::NEG_INFINITY, f64::NEG_INFINITY) => self.mean = f64::NEG_INFINITY, - _ => { - debug_assert!( - !self_mean.is_nan() && !other_mean.is_nan(), - "NaN values should never be present in centroids; self: {}, other: {}", - self_mean, - other_mean - ); - self.mean = self_mean - + ((other.weight as f64) * (other_mean - self_mean) - / (total_weight as f64)); - } - } + self.weight += other.weight; + let (total_weight, other_weight) = (self.weight as f64, other.weight as f64); + let (self_mean, other_mean) = (self.mean, other.mean); + self.mean = self_mean + (other_weight * (other_mean - self_mean) / (total_weight)); debug_assert!( !self.mean.is_nan(), "NaN values should never be present in centroids; self: {}, other: {}", self_mean, other_mean ); - - self.weight = total_weight; } else { self.mean = other.mean; self.weight = other.weight; @@ -1130,6 +1121,13 @@ fn check_non_nan(value: f64, tag: &'static str) -> Result<(), SerdeError> { Ok(()) } +fn check_non_infinite(value: f64, tag: &'static str) -> Result<(), 
SerdeError> { + if value.is_infinite() { + return Err(SerdeError::MalformedData(format!("{tag} cannot be is_infinite"))); + } + Ok(()) +} + fn check_nonzero(value: u64, tag: &'static str) -> Result<(), SerdeError> { if value == 0 { return Err(SerdeError::MalformedData(format!("{tag} cannot be zero"))); diff --git a/tests/tdigest_test.rs b/tests/tdigest_test.rs index e948915..1ae1ae3 100644 --- a/tests/tdigest_test.rs +++ b/tests/tdigest_test.rs @@ -196,27 +196,34 @@ fn test_merge_large() { } #[test] -fn test_infinite() { +fn test_invalid_inputs() { + let n = 100; + + let mut td = TDigestMut::new(10); + for _ in 0..n { + td.update(f64::NAN); + } + assert!(td.is_empty()); + let mut td = TDigestMut::new(10); - for _ in 0..10000 { + for _ in 0..n { td.update(f64::INFINITY); } - assert_eq!(td.quantile(0.5), Some(f64::INFINITY)); + assert!(td.is_empty()); let mut td = TDigestMut::new(10); - for _ in 0..10000 { + for _ in 0..n { td.update(f64::NEG_INFINITY); } - assert_eq!(td.quantile(0.5), Some(f64::NEG_INFINITY)); - - // FIXME: merging -inf and inf results in NaN centroid mean - // let mut td = TDigestMut::new(10); - // for i in 0..10000 { - // if i % 2 == 0 { - // td.update(f64::INFINITY); - // } else { - // td.update(f64::NEG_INFINITY); - // } - // } - // assert!(td.quantile(0.5).is_some()); + assert!(td.is_empty()); + + let mut td = TDigestMut::new(10); + for i in 0..n { + if i % 2 == 0 { + td.update(f64::INFINITY); + } else { + td.update(f64::NEG_INFINITY); + } + } + assert!(td.is_empty()); } From 2f61d4f825581a39d02c28ccf687dc7a9086a2c1 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 18 Dec 2025 07:52:58 +0800 Subject: [PATCH 23/26] weight nonzero and should not overflow Signed-off-by: tison --- src/tdigest/sketch.rs | 131 ++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 62 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 932f9d2..17dc147 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ 
-15,22 +15,24 @@ // specific language governing permissions and limitations // under the License. +use crate::error::SerdeError; +use crate::tdigest::serialization::*; use byteorder::{BE, LE, ReadBytesExt}; use std::cmp::Ordering; use std::convert::identity; use std::io::Cursor; - -use crate::error::SerdeError; -use crate::tdigest::serialization::*; +use std::num::NonZeroU64; /// The default value of K if one is not specified. const DEFAULT_K: u16 = 200; /// Multiplier for buffer size relative to centroids capacity. const BUFFER_MULTIPLIER: usize = 4; +/// Default weight for single values. +const DEFAULT_WEIGHT: NonZeroU64 = NonZeroU64::new(1).unwrap(); /// T-Digest sketch for estimating quantiles and ranks. /// -/// See the [module documentation](super) for more details. +/// See the [tdigest module level documentation](crate::tdigest) for more. #[derive(Debug, Clone)] pub struct TDigestMut { k: u16, @@ -146,7 +148,7 @@ impl TDigestMut { /// Returns total weight. pub fn total_weight(&self) -> u64 { - self.centroids_weight + (self.buffer.len() as u64) + self.centroids_weight + self.buffer.len() as u64 } /// Merge the given TDigest into this one @@ -159,10 +161,16 @@ impl TDigestMut { self.centroids.len() + self.buffer.len() + other.centroids.len() + other.buffer.len(), ); for &v in &self.buffer { - tmp.push(Centroid { mean: v, weight: 1 }); + tmp.push(Centroid { + mean: v, + weight: DEFAULT_WEIGHT, + }); } for &v in &other.buffer { - tmp.push(Centroid { mean: v, weight: 1 }); + tmp.push(Centroid { + mean: v, + weight: DEFAULT_WEIGHT, + }); } for &c in &other.centroids { tmp.push(c); @@ -364,7 +372,7 @@ impl TDigestMut { bytes.extend_from_slice(&self.max.to_le_bytes()); for centroid in &self.centroids { bytes.extend_from_slice(&centroid.mean.to_le_bytes()); - bytes.extend_from_slice(&centroid.weight.to_le_bytes()); + bytes.extend_from_slice(&centroid.weight.get().to_le_bytes()); } bytes } @@ -448,7 +456,7 @@ impl TDigestMut { value, vec![Centroid { mean: value, - weight: 1, +
weight: DEFAULT_WEIGHT, }], 1, vec![], @@ -474,7 +482,7 @@ impl TDigestMut { check_non_nan(min, "min")?; check_non_nan(max, "max")?; let mut centroids = Vec::with_capacity(num_centroids); - let mut centroids_weight = 0; + let mut centroids_weight = 0u64; for _ in 0..num_centroids { let (mean, weight) = if is_f32 { ( @@ -489,8 +497,8 @@ impl TDigestMut { }; check_non_nan(mean, "centroid mean")?; check_non_infinite(mean, "centroid")?; - check_nonzero(weight, "centroid weight")?; - centroids_weight += weight; + let weight = check_nonzero(weight, "centroid weight")?; + centroids_weight += weight.get(); centroids.push(Centroid { mean, weight }); } let mut buffer = Vec::with_capacity(num_buffered); @@ -549,15 +557,15 @@ impl TDigestMut { .read_u32::() .map_err(make_error("num_centroids"))? as usize; - let mut total_weight = 0; + let mut total_weight = 0u64; let mut centroids = Vec::with_capacity(num_centroids); for _ in 0..num_centroids { let weight = cursor.read_f64::().map_err(make_error("weight"))? as u64; let mean = cursor.read_f64::().map_err(make_error("mean"))?; - check_nonzero(weight, "centroid weight in compat double format")?; + let weight = check_nonzero(weight, "centroid weight in compat double format")?; check_non_nan(mean, "centroid mean in compat double format")?; check_non_infinite(mean, "centroid mean in compat double format")?; - total_weight += weight; + total_weight += weight.get(); centroids.push(Centroid { mean, weight }); } Ok(TDigestMut::make( @@ -593,15 +601,15 @@ impl TDigestMut { .read_u16::() .map_err(make_error("num_centroids"))? as usize; - let mut total_weight = 0; + let mut total_weight = 0u64; let mut centroids = Vec::with_capacity(num_centroids); for _ in 0..num_centroids { let weight = cursor.read_f32::().map_err(make_error("weight"))? as u64; let mean = cursor.read_f32::().map_err(make_error("mean"))? 
as f64; - check_nonzero(weight, "centroid weight in compat float format")?; + let weight = check_nonzero(weight, "centroid weight in compat float format")?; check_non_nan(mean, "centroid mean in compat float format")?; check_non_infinite(mean, "centroid mean in compat float format")?; - total_weight += weight; + total_weight += weight.get(); centroids.push(Centroid { mean, weight }); } Ok(TDigestMut::make( @@ -631,7 +639,10 @@ impl TDigestMut { } let mut tmp = Vec::with_capacity(self.buffer.len() + self.centroids.len()); for &v in &self.buffer { - tmp.push(Centroid { mean: v, weight: 1 }); + tmp.push(Centroid { + mean: v, + weight: DEFAULT_WEIGHT, + }); } self.do_merge(tmp, self.buffer.len() as u64) } @@ -660,7 +671,7 @@ impl TDigestMut { let mut weight_so_far = 0.; while current < len { let c = buffer[current]; - let proposed_weight = (self.centroids[num_centroids - 1].weight + c.weight) as f64; + let proposed_weight = self.centroids[num_centroids - 1].weight() + c.weight(); let mut add_this = false; if (current != 1) && (current != (len - 1)) { let centroids_weight = self.centroids_weight as f64; @@ -677,7 +688,7 @@ impl TDigestMut { self.centroids[num_centroids - 1].add(c); } else { // copy to a new centroid - weight_so_far += self.centroids[num_centroids - 1].weight as f64; + weight_so_far += self.centroids[num_centroids - 1].weight(); self.centroids.push(c); num_centroids += 1; } @@ -898,7 +909,7 @@ impl TDigestView<'_> { 0.5 / centroids_weight } else { 1. + (((value - self.min) / (first_mean - self.min)) - * ((self.centroids[0].weight as f64 / 2.) - 1.)) + * ((self.centroids[0].weight() / 2.) - 1.)) }); } return Some(0.); // should never happen @@ -913,7 +924,7 @@ impl TDigestView<'_> { } else { 1.0 - ((1.0 + (((self.max - value) / (self.max - last_mean)) - * ((self.centroids[num_centroids - 1].weight as f64 / 2.) - 1.))) + * ((self.centroids[num_centroids - 1].weight() / 2.) 
- 1.))) / centroids_weight) }); } @@ -924,12 +935,12 @@ impl TDigestView<'_> { .centroids .binary_search_by(|c| centroid_lower_bound(c, value)) .unwrap_or_else(identity); - debug_assert_ne!(lower, num_centroids, "get_rank: lower == end"); + assert_ne!(lower, num_centroids, "get_rank: lower == end"); let mut upper = self .centroids .binary_search_by(|c| centroid_upper_bound(c, value)) .unwrap_or_else(identity); - debug_assert_ne!(upper, 0, "get_rank: upper == begin"); + assert_ne!(upper, 0, "get_rank: upper == begin"); if value < self.centroids[lower].mean { lower -= 1; } @@ -940,18 +951,18 @@ impl TDigestView<'_> { let mut weight_below = 0.; let mut i = 0; while i < lower { - weight_below += self.centroids[i].weight as f64; + weight_below += self.centroids[i].weight(); i += 1; } - weight_below += self.centroids[lower].weight as f64 / 2.; + weight_below += self.centroids[lower].weight() / 2.; let mut weight_delta = 0.; while i < upper { - weight_delta += self.centroids[i].weight as f64; + weight_delta += self.centroids[i].weight(); i += 1; } - weight_delta -= self.centroids[lower].weight as f64 / 2.; - weight_delta += self.centroids[upper].weight as f64 / 2.; + weight_delta -= self.centroids[lower].weight() / 2.; + weight_delta += self.centroids[upper].weight() / 2.; Some( if self.centroids[upper].mean - self.centroids[lower].mean > 0. { (weight_below @@ -985,7 +996,7 @@ impl TDigestView<'_> { if weight > centroids_weight - 1. { return Some(self.max); } - let first_weight = self.centroids[0].weight as f64; + let first_weight = self.centroids[0].weight(); if first_weight > 1. && weight < first_weight / 2. { return Some( self.min @@ -993,7 +1004,7 @@ impl TDigestView<'_> { * (self.centroids[0].mean - self.min)), ); } - let last_weight = self.centroids[num_centroids - 1].weight as f64; + let last_weight = self.centroids[num_centroids - 1].weight(); if last_weight > 1. && (centroids_weight - weight <= last_weight / 2.) 
{ return Some( self.max @@ -1005,18 +1016,18 @@ impl TDigestView<'_> { // interpolate between extremes let mut weight_so_far = first_weight / 2.; for i in 0..(num_centroids - 1) { - let dw = (self.centroids[i].weight + self.centroids[i + 1].weight) as f64 / 2.; + let dw = (self.centroids[i].weight() + self.centroids[i + 1].weight()) / 2.; if weight_so_far + dw > weight { // the target weight is between centroids i and i+1 let mut left_weight = 0.; - if self.centroids[i].weight == 1 { + if self.centroids[i].weight.get() == 1 { if weight - weight_so_far < 0.5 { return Some(self.centroids[i].mean); } left_weight = 0.5; } let mut right_weight = 0.; - if self.centroids[i + 1].weight == 1 { + if self.centroids[i + 1].weight.get() == 1 { if weight_so_far + dw - weight < 0.5 { return Some(self.centroids[i + 1].mean); } @@ -1034,10 +1045,8 @@ impl TDigestView<'_> { weight_so_far += dw; } - let w1 = weight - - (self.centroids_weight as f64) - - ((self.centroids[num_centroids - 1].weight as f64) / 2.); - let w2 = (self.centroids[num_centroids - 1].weight as f64 / 2.) - w1; + let w1 = weight - (centroids_weight) - ((self.centroids[num_centroids - 1].weight()) / 2.); + let w2 = (self.centroids[num_centroids - 1].weight() / 2.) 
- w1; Some(weighted_average( self.centroids[num_centroids - 1].mean, w1, @@ -1090,27 +1099,26 @@ fn centroid_upper_bound(c: &Centroid, value: f64) -> Ordering { #[derive(Debug, Clone, Copy, PartialEq)] struct Centroid { mean: f64, - weight: u64, + weight: NonZeroU64, } impl Centroid { fn add(&mut self, other: Centroid) { - if self.weight != 0 { - self.weight += other.weight; - - let (total_weight, other_weight) = (self.weight as f64, other.weight as f64); - let (self_mean, other_mean) = (self.mean, other.mean); - self.mean = self_mean + (other_weight * (other_mean - self_mean) / (total_weight)); - debug_assert!( - !self.mean.is_nan(), - "NaN values should never be present in centroids; self: {}, other: {}", - self_mean, - other_mean - ); - } else { - self.mean = other.mean; - self.weight = other.weight; - } + self.weight = self.weight.saturating_add(other.weight.get()); + + let (total_weight, other_weight) = (self.weight(), other.weight()); + let (self_mean, other_mean) = (self.mean, other.mean); + self.mean = self_mean + (other_weight * (other_mean - self_mean) / (total_weight)); + debug_assert!( + !self.mean.is_nan(), + "NaN values should never be present in centroids; self: {}, other: {}", + self_mean, + other_mean + ); + } + + fn weight(&self) -> f64 { + self.weight.get() as f64 } } @@ -1123,16 +1131,15 @@ fn check_non_nan(value: f64, tag: &'static str) -> Result<(), SerdeError> { fn check_non_infinite(value: f64, tag: &'static str) -> Result<(), SerdeError> { if value.is_infinite() { - return Err(SerdeError::MalformedData(format!("{tag} cannot be is_infinite"))); + return Err(SerdeError::MalformedData(format!( + "{tag} cannot be is_infinite" + ))); } Ok(()) } -fn check_nonzero(value: u64, tag: &'static str) -> Result<(), SerdeError> { - if value == 0 { - return Err(SerdeError::MalformedData(format!("{tag} cannot be zero"))); - } - Ok(()) +fn check_nonzero(value: u64, tag: &'static str) -> Result<NonZeroU64, SerdeError> { + NonZeroU64::new(value).ok_or_else(||
SerdeError::MalformedData(format!("{tag} cannot be zero"))) } /// Generates cluster sizes proportional to `q*(1-q)`. From b35cdb270916e5dd70fa7d5ebd5e50b5b96d4c90 Mon Sep 17 00:00:00 2001 From: tison Date: Thu, 18 Dec 2025 08:10:58 +0800 Subject: [PATCH 24/26] other_mean - self_mean may produce inf Signed-off-by: tison --- src/tdigest/sketch.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tdigest/sketch.rs b/src/tdigest/sketch.rs index 17dc147..7f125d9 100644 --- a/src/tdigest/sketch.rs +++ b/src/tdigest/sketch.rs @@ -1104,11 +1104,14 @@ struct Centroid { impl Centroid { fn add(&mut self, other: Centroid) { + let (self_weight, other_weight) = (self.weight(), other.weight()); + let total_weight = self_weight + other_weight; self.weight = self.weight.saturating_add(other.weight.get()); - let (total_weight, other_weight) = (self.weight(), other.weight()); let (self_mean, other_mean) = (self.mean, other.mean); - self.mean = self_mean + (other_weight * (other_mean - self_mean) / (total_weight)); + let ratio_self = self_weight / total_weight; + let ratio_other = other_weight / total_weight; + self.mean = self_mean.mul_add(ratio_self, other_mean * ratio_other); debug_assert!( !self.mean.is_nan(), "NaN values should never be present in centroids; self: {}, other: {}", From 743ede9c5295678422cf4f2dabb114dce99de6f5 Mon Sep 17 00:00:00 2001 From: tison Date: Fri, 19 Dec 2025 13:07:57 +0800 Subject: [PATCH 25/26] no need for checking in sk files now Signed-off-by: tison --- .../tdigest_double_buf_n0_cpp.sk | Bin 8 -> 0 bytes .../tdigest_double_buf_n1000000_cpp.sk | Bin 5392 -> 0 bytes .../tdigest_double_buf_n100000_cpp.sk | Bin 2480 -> 0 bytes .../tdigest_double_buf_n10000_cpp.sk | Bin 8032 -> 0 bytes .../tdigest_double_buf_n1000_cpp.sk | Bin 2736 -> 0 bytes .../tdigest_double_buf_n100_cpp.sk | Bin 832 -> 0 bytes .../tdigest_double_buf_n10_cpp.sk | Bin 112 -> 0 bytes .../tdigest_double_buf_n1_cpp.sk | Bin 16 -> 0 bytes 
.../cpp_generated_files/tdigest_double_n0_cpp.sk | Bin 8 -> 0 bytes .../tdigest_double_n1000000_cpp.sk | Bin 2224 -> 0 bytes .../tdigest_double_n100000_cpp.sk | Bin 2192 -> 0 bytes .../tdigest_double_n10000_cpp.sk | Bin 1984 -> 0 bytes .../tdigest_double_n1000_cpp.sk | Bin 1584 -> 0 bytes .../tdigest_double_n100_cpp.sk | Bin 1248 -> 0 bytes .../tdigest_double_n10_cpp.sk | Bin 192 -> 0 bytes .../cpp_generated_files/tdigest_double_n1_cpp.sk | Bin 16 -> 0 bytes .../tdigest_float_buf_n0_cpp.sk | Bin 8 -> 0 bytes .../tdigest_float_buf_n1000000_cpp.sk | Bin 2704 -> 0 bytes .../tdigest_float_buf_n100000_cpp.sk | Bin 1248 -> 0 bytes .../tdigest_float_buf_n10000_cpp.sk | Bin 4024 -> 0 bytes .../tdigest_float_buf_n1000_cpp.sk | Bin 1376 -> 0 bytes .../tdigest_float_buf_n100_cpp.sk | Bin 424 -> 0 bytes .../tdigest_float_buf_n10_cpp.sk | Bin 64 -> 0 bytes .../tdigest_float_buf_n1_cpp.sk | Bin 12 -> 0 bytes .../cpp_generated_files/tdigest_float_n0_cpp.sk | Bin 8 -> 0 bytes .../tdigest_float_n1000000_cpp.sk | Bin 1120 -> 0 bytes .../tdigest_float_n100000_cpp.sk | Bin 1104 -> 0 bytes .../tdigest_float_n10000_cpp.sk | Bin 1000 -> 0 bytes .../tdigest_float_n1000_cpp.sk | Bin 800 -> 0 bytes .../tdigest_float_n100_cpp.sk | Bin 632 -> 0 bytes .../cpp_generated_files/tdigest_float_n10_cpp.sk | Bin 104 -> 0 bytes .../cpp_generated_files/tdigest_float_n1_cpp.sk | Bin 12 -> 0 bytes .../tdigest_double_n0_java.sk | Bin 8 -> 0 bytes .../tdigest_double_n1000000_java.sk | Bin 2224 -> 0 bytes .../tdigest_double_n100000_java.sk | Bin 2192 -> 0 bytes .../tdigest_double_n10000_java.sk | Bin 1984 -> 0 bytes .../tdigest_double_n1000_java.sk | Bin 1584 -> 0 bytes .../tdigest_double_n100_java.sk | Bin 1248 -> 0 bytes .../tdigest_double_n10_java.sk | Bin 192 -> 0 bytes .../tdigest_double_n1_java.sk | Bin 16 -> 0 bytes 40 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n0_cpp.sk delete mode 100644 
tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n100000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n100_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n0_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n100000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n10000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n100_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n10_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_double_n1_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n0_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n1000000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n100000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n10000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n1000_cpp.sk delete mode 100644 
tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n100_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n10_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n1_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n0_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n1000000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n100000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n10000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n1000_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n100_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n10_cpp.sk delete mode 100644 tests/serialization_test_data/cpp_generated_files/tdigest_float_n1_cpp.sk delete mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n0_java.sk delete mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n1000000_java.sk delete mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n100000_java.sk delete mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n10000_java.sk delete mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n1000_java.sk delete mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n100_java.sk delete mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n10_java.sk delete mode 100644 tests/serialization_test_data/java_generated_files/tdigest_double_n1_java.sk diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n0_cpp.sk 
b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n0_cpp.sk deleted file mode 100644 index e84c2ea503d5ba25b64de9f767fb3490b28860f5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmZQ%6iH!VWMBXQ0)_y5 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000000_cpp.sk deleted file mode 100644 index e488b6a303779dcae01f1d7fd5d086ad1816c928..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5392 zcmZA5e@s1v{EW2Fj{?OLs- z%1+5sQL+arc50O>k)5v@7f}PDOVYvN*uBziITePw9 zq}nV`u=&0EL$s^rjJht?tI6k7^$7d*+O}U%{=RmrpdVF~*bwSYyBa4qz^b2B{%CuB z+o#G_zpL8G_S~_vUwt#%ZfE`9 zYDui!u7*MNZM5BvwU5=R8KP^Ss#{{i1o!`|T;h0Rp_ks2Wv^F@z4da@{yrmh?_zu2 z)ILgIUTt^ahycB`!0s!51?rS`yAPQ`I_wv_n+qrEJ~8)i4$+so?D_unVcM(K?yfM0 z9xmp)@+Rx=#0CxcdWyby$zI=^5UvBR+TC&YRsGwqc00nR>5sDQuIrnoz4Gneb2`$k zKilz|HpF~ee3TwoV6Shz9i`t{Y4^6`>AGCJzgurj*Yf?|k`(PW-+V0EZJuDv&?>}! 
zy)&`yosjR|XFhlDS1!!ZjsW}kmABmci+|kv)_BuxePE=i$Bwj*kC|=iG2(hiNpzn( zm$=WbzH90fasBmXnfk1_o)_nux=Hk(1*RSp_k)x|Q?C{M&}r&fqC-D6^<(?bGd@^v z>U1%$-C*htF^|}2YOy_xmQquT?P1I+Gj*d_|K283SBRcpZtC|%Kejqq^p?%0miO0w zt6vxM1edA9MQ^q`RP-^c#pQ4GSUpzEA6YFGXwRpRYG>=aA=oKEF;nA5Mo#oNlRuBEVIhc!iSb%G= z5S>_r>#!J0uoTO%99>v}mADi+=KPF7x&?QY`_C}5D(#DY{Vnjgw1#q zkK+kEiKp;1wqPr^;W<2y7x6M)!FKGxPVB;NyoNn^9dF=G?8V!77w=&o-p77?fCKmt z2k{X;!Ke5PpQBOnV&4rzZ}dT5^h19fje$50gKz={V+e-gBy`{u495tJ#3+nL6Ju~D z#^M}|!?_rb^H5 z44lX@K|F+qu@R486E@>fJdP*uB%Z?4*n+LthUf4+ zUc}3I1>3O$JFyG9@f!Bvb-aN$u@`USUA%{Vcpv-m0S@3p9K=WX1fSwFe2xa6cMU^t z^g&Z~_Kn2!`S$bl?;W#|VtXD2zrEV{j(M;v9^_xfqZ0P+=m@#|4;# w3vm%9V+t_r>#!J0uoTO%99@?G0Xv~9-v9sr diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n100000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n100000_cpp.sk deleted file mode 100644 index 9616a819437daff31c00764fd811b24c8d9066d6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2480 zcmZA3Z)jC@9LMpWwykCUxtk;IgXOMbI3~;mF%~QRl+yM<>f$7Iu5p@oxOiV+mXBz@zMN7nZV9uV6Xbaj?R07b@6Z29@k|59-)a9~#)TZ=jJq^cI@g z+5LEy-SjS=VHaGqvA2GJL+tH2yu)rif)RH00N!SQatv45FCWLJ?92)D@&G^o8OqqN ze{MeiMCbe2+vp?b|Gk6Nbbi$R2Ts!a3nu==_w;;W=@`DHfy|z`i-V6X zynZ5>K!FC{AO0I{v_9&HP(!abR{sxDWL0tx?~~nY_i%)sucz-mJi0#RGxw39-{;KX zN#pq7)Fh75dbe{5{p8%}6e{U@=BuYsLHirFPvdLa|HSEO{6u!gr*Vna>zX10ny=iJ zB8F-HysjetBkNodKhk>ld=XdZxHoF9PyJQIn|yqxh~4C|6&lgPb||$<;}>#ry+%MT zG-{k7r=jsNIcm}9BPUi2*+>evp+PHt`V$=4kFgAUBRqbAF{QYoZ)AlOQ@;9+*`&7Yl6q~k3 zsePssmQ$7+EW>izvST@G*|qG&{{Qmkr9)C*8c0KFBvn0g=Ov^mX@eBfwA7JirLNSI u=A}bYUm8e5X(Uy>^7_)0v_T4KTIxu%QdjCp^U@)yFAb!jG?J?L|FH)-Sy@E@ diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10000_cpp.sk deleted file mode 100644 index 0958b673f9f6a9076953d0b8fae7c2ad15db29b2..0000000000000000000000000000000000000000 GIT binary patch literal 0 
HcmV?d00001 literal 8032 zcmZA2d;Hb&9l-Hd_xt^RQArX>B_+E2-W7>%y68swm6V7sq(b?M$V`hio0&+4H6z4G zew|b7*6d`aaM5d(Pu=zK?U>r)rhvv#ActfJ0R) z9}XV}%MNGFt~lx{M^27DM&7kc+_=QeOWeA|?MvLL#GOmrrNmuJ+^xjjZk5BJsPGiFONn!&x$6g*UgTmsSnPH=Bd}siX?>#;+pvX_y5=t zcg3({s63gtHddu>e@`q;eee2sHT9(XVngc2_s5RZ`yYtSsfRukHPau|{EhKiI#=B(zJ9?I@o_pIvpL>Pz3|B>ORso}-ifmw+hR@n{%>!KeW@ROIX0#) z+a8+|XK!wgwTYSeN~}(twWx>#iP={XkEdR>Bj;#Vh)2@-j6&Rd9tvu82!@uoyF*pI5rfcYvRJw#pskct0+dh#OyBSevjFn zVl+$U@m4V!C(hn0MuXJv7jye0J6O!^pTY;l+2@Tcc0`P?9A`WsylQ0 zHLI3)q596;elFC=JFB@f|9?;`@2tesE}55@IwkWGQ@3PZV(OL5OHBQed8H5E{su>y zSsH3fBW-D{ElsqgskSuJmgd^hLR(sDODk<@tu1Y|rLDHK)0Xzya-6nw(3Xzc(n(v6 z*Ot!Oa)P#W(UudnrK`4_q%GaFO8Mr+F$Z5gXA7i!BmZ5gjE6SQTbwp^qwleA^BwoK8MsoFA4TQ1g? zOSEOWwp^+$Gqj~#TV`s@EN!_=TV`v^<=QewTjpxZ71}aSTdvfWtF&dlwp^_(*J#TE zZCR)-i?n62wp^<%*J;ZVZMj}smTJo~ZCS1@H)zX^+H#Y&tk9N~+H$kDWZH6zwye^Y zTeamjZCR}?w`y-UTU*v@%jdLZownSgE%$26dTsf+Enm@=2eoB`wmhUQ4{OUtZFxjnHfhVF+VWLxc}!a#*On)= z*OnUE zQd3)MX-jQwsiQ4*wWXf6)Yp~<+R{*48fi;oZE2z{O|_+&wlvq47TVHMTUu#LYi((x zEp4@>owl^smgBUggSK?kmQLDoytZ`KmJ_t4i?*DoEnT(cByH)YEhlTsDcaIqTTa!M z9@=u6w)E7NUfR-ITTa)OKH74Iw)EAOe%f-Tw)EGQv$W-GZ5f~~1GQz4whY#mGHnTM z8KNyiwPl#LoTDx0YRhnKIZsBb zwdEphnWQa~wPlL7Ox2cY+H$eBT%s-0wdGQ6nV~J^+A>pHW@*c1+A>>PF4vYh+A>#L zuF#fw+H$3~T%|4ZwdHDUxkg(SXv;!vS)?tCwdGoExlUV_Xv_85vQ%4^Y0GkLxj|cQ p)RvpHWren^)RvpICDWE$v}KjH+^Q|NY0GMDxm{cC(3Um3;(x|G!3zKY diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1000_cpp.sk deleted file mode 100644 index 8a1bdff0c8e8ed96a6d84f94a9ae2288e0921de0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2736 zcmZA3KWGzC9Ki8c|3vnQarl)xCB@f(A9?HSJ(doBGZucTNeC1bpWWk2zo_G1b?29@${BnA zTVLJtv#{o#zwVlN_V2@W_x!ik<+wdQyWw8H@`ihTyBqHH&!z7ASWMmZlcw(a>Zb1c z8>jC2^lEM|)ZF#Gq`Fdb?~hdVR?WS?eyi@P{-b(WbMNnw>apsH>P)rSbhnpN?WxYI z_EiU}3#voaqB`1??aN=p62@4@3RW?}6zkZ)CbqDR9qeHr2ROtLj&Xt+nr}PDokI`v 
z=wpBd3{fz`B9<`5GFGsP38q-b1~##UZR}tV`#8WMj&O_<%+P#i{^(&IeGIUGAqqxV z#1h6>#tK$3!4&J*z$Uh^jUDV^9|t(Z5sqUW8OBiDrD_F$@Q>%B6hzU2h>&0c0s@2(LI@$ehxvH#?c^-vEv=Wt5(xf*PEOxf<70#Gc>kaL z4Exil>;3EK2~KjrDNb{Svz+5R7r4kJE^~#eT;n=7xXCSUbBDX!<30~~$RiHf^%iri znG?;NYUWHc=bE|D%%x_oG;^()8_nEm=1w#Bnt9O7qh_A^?pSQhvz^7pyx3W6%&VQn e#=O~CY|Oiz#m0QtS!~RwoyEp{*;#DNsQU%>HB3MN diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n10_cpp.sk deleted file mode 100644 index 1bab9718be88774b7566e69797fb6fa0ec8cb9cf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 112 xcmZQ#6iH!#0xlSX;e$Pdsp0^kVSENCp94w@Kxq*uEdixvptJ&%R)W$h4ghwf22KC~ diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_buf_n1_cpp.sk deleted file mode 100644 index 802a0bde3ac82cdf31c4fdb77cdf7a5f26ce821a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 RcmZQ%6iH!VVt|4V_5cWw0jvN3 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n0_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n0_cpp.sk deleted file mode 100644 index e84c2ea503d5ba25b64de9f767fb3490b28860f5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmZQ%6iH!VWMBXQ0)_y5 diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000000_cpp.sk deleted file mode 100644 index 6b87e1eeb12f364780e191ff2d2839f602da4862..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2224 zcmZA3X-HI26bJA-sV(HB`vEN<$}Yq8aE*RWt`( zQA2alv+5}W{aT|u^wMV9gI>0as?m$~Q4c!SM_bXY2k9Pqc^e%^mmZ;V9FR39=`p(d zG_|1l90jq#6&>UbQ44Cju23;p)Lf-TY_PmRU%^^S7rljgin~jD!qj?qcJ@&CNM*7; 
zq+}Qf`q)R~Z~%&)QTZ5Ef9JqUvcf<(yss(2rt-q4@2DN>=~jTwj#qi$;795K)5Oo@ zi&ME{-vC`qSFULLK{+mE?f4*drYbu&4pEU4?E6K27?|jR-=x9yVu^+83spU3g>oJ^ zeE0|sWU0KpZxlZ$QH~6=a_(B?N8cma)2Dn!+SvX=xnsiw{sMV;M-2BqSNYK@J6n9p z&+QHlfqZ{i9AAS0wVt2M_j^@+z&(Yn{mM^1P38BmlpXd&t}Iq={gTL*a^*wUlMFt* zcNz=GcVr}UaG|Qd{UMnb7bx%Wrf>s1-zI+wSk?6fexpzPG}<2}4fT;plE;ov*QcdRj)48(nQzQ1a*TQZ3dtVWe}O{D z-LRjt)<|v#e_Jbg5cYr5M#-ljkKHVJNPQi#u1az~)cdwdegb(yjbs=HakEx3jDwh7 zC;1%IudbK89o*U=xfA@dQSt%s8%?qod|{j9Rp9LHk~6`5x@Ur&O_K3^uhtFQU3BV( z_KWwr;W!bY`tNWAtT#z_IM}5dkE;c`p`9XEH?&i%)D7(t#k%3R6lEr_*A49vl_qb| Y4eb$LldDaxHM!2@29y8(5Bb0TAANiA%m4rY diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n100000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n100000_cpp.sk deleted file mode 100644 index fbb27272ea654dc17b107216bbe2e9ce2ac40c54..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2192 zcmZA3Ur19?90%~Lsaa|MNn#I{Q;I^t3PKDkzx^TiASEhUX0q0-84X4eBSMYtr>mn%vKy`pF59-?5!7Yk?m|pJqLLGB_y!-UzW$ee+52ncU(gnyZi<|usxlS z9+<8!XcSQ0dkYQ@Si@}`=K$v4!w^~b_23=VBfA$(6tKYQJ|t0Kknse$6fk^#hI9%L zT7wv(z}ahG;uzHjS_Y6zf%3n8gB-4}ev7!|=?S$5-=mWP=1B;nkopHaAK@nZe|BdfLC^#Qf>&rv`@+qQD1wf1rZujT!@4biKj+pRkg( zxDlKr>jNWbrtw_vQTS+n5+_FCq~FtVW=v{t8y~}2s@H4B(L(n2k0YJt)1MhZ8m(`* zI)Xd2{;Ag^ctO^OBIu!dkrhRd^5xwaMK9&i=ZNArS$Ly(PWAfDC>~I|uV1dWev9HD zw@*Z|gFKibU`#KErY#hBMYiM#1j(TSfevyR1RBXkiGZ7I*(|V)oLV7J!mbinMYdH7 zG9PpE+M@iJf`V!*Y298IB14xii2jYmu)jm%dNQ->!xWrHJ@U?;%>zO#X-ek#fDYW8E|jx>9zu{k+>IJ`X9BhC>K<%l zf6-`Rk0-H)U6jHecDN76*tm##cK9-q?BEp~WnUe@d-kqtxXCWQj!|}Y5JT)Ow~)sh zs^bpEIM2O@PwavFNV8)PP{QlukMN80rXhT1cRq&L&>x>*DXrJ-!^radlTWdoay9Y{ z%g8$Y96xw|dIW`>e|UjX_UD)IvGZRcMpn(Q5n>O%!9KE{c#8(IDt`w*SuaRqoS)}w z8V}jYQDOKE?s6V6xJ1^s4HE1LBi^S$1{XL_WpJLX#xm$;`#kiLO|6GcvhMN_Vc+o3 zOg5f}1~Rf9HjveC4|wgqu!#7-DJi7)~#0b0U}oPhuLpMzu=%1 zePrTR^p`$tMZcLNR`j1vSkaH>gcW_Nd#vo=UTJ76`WvSlC#BVCEB+tau{z`Aj#bLZ V9jmiW?pU33a>r`wzx@Am{{xGM!vX*R diff --git 
a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n1000_cpp.sk deleted file mode 100644 index 7832fc376c8e67faa721dbf0ab4e95953b079262..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1584 zcmZA2%}N4M6bJCptfp@ZD+?IPoj4oTu zBlHNfaM_{xi(!6qX6`-r&bfEoE?-J%A1A-3?$M0>T~e7kx0*3m zH`a-uRYH4(eZqd>fN)TFQ8*;LBpeoA7G4ovB{v_RzAn5Wyopw#)WhFbw$S=Ed3c<5 z(E2X*^0=~xuEwbk;tA^KuI`h{tsKw*x0a+qZsm}Mxa&tW%pE+YQEpA6G4Lskb5~L{ z!QDI~A9v+~rnt2D8MJm!3p^fwz2yMXaf5%aN&SRZPMMtHvGC9E%}g!QMISf2?K>(?-`z8w?mzhUC~@LK4Ih3hLR zd}ZPKG=yyn*Y}68Bm5;?vv7Z@3pa$D!YyH?jP0qyUSYp*P&h2SCL9%R{|El}_aCJV BY%c%+ diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n100_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n100_cpp.sk deleted file mode 100644 index 9d1d60a90feb1ee89c5f72fcaabfec50c18cd8c7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1248 zcmZ9}IZ8x96o%2;je(&-2L=WrA|fKrBl605YNvMW>;*2QmR3t~3B3e^26=TVDGonh zbo;%*Ps~INqx&Y!RO2-LCrz(AN2J=bcn%NnJYK+ycnL4#6}*bq@H*bWn|KRv;~l(< z_wYVGz=!wso)!WkDF@ANq1g5!f8XIyZ6(&LN^jxTzgal!FTk25Yf#{c~vmF`4X diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_double_n10_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_double_n10_cpp.sk deleted file mode 100644 index 2c1026eed9a55fb4a38dd5f9ac9bebf162eddfb2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 192 zcmZQ#6iH!VVPN25Kn5S|Av_fa2>rpH5m}mn!2y-efyNg=!xXA*jieb}e$;xsnmt zp#(SWVT2we!lqC#Qkb%b!X8efX(=|OKGaJiMEHkl2*D);`<}V8@Zr0^yYsu;Id>j* zQdj?W!OsCiUG_JepsDb^lWuO(TVEfq65RrZFO~cb1!OEOxLYx z%&5>$TaDHBAQrvcm}3v1a;PynlkPj#=(Pv&#rKVs{q)^G8z=4SwSpYB}N z+Dv_Su$4VScWi83c!~E}HPYI&lgsuUXnB{o{XL(w7GL3Z_MBbacD;|Pjrnaxk!kK{$JTkyE^mHps1F+Ro_(Z_p{w<7T<%OGuD~k94qF%m&cT6zrU%W z`i}2;{_Te 
z4sqmqs7}xuw?g#}b^Z<24*K=oP;I1@d!e$=tA9UKYw1K+TdlC|ZmR)0KDn*z*Kek@ zRUb{JwiUN2Mk;O9Lno+xyhf(Am3_PnwU1q;r>*SmNNOL)5z6g|@v<@HV~eD$DVvtf z%I0Oo^v?COsRI`h9()AQh!7)zL5d7H#EiC@4F@hHJopHp5g|qbgA^HZh+g*Lz=ebd z9|1HX#7JO}B0~-_lYKaFA>qMC0F4MS5*Vb&kVDL39}Zkdc<>QGBSMS>1}QS+5VP5b z0~Zn=d<4*l5F>#>iVQhKANz3NLc)WO02&cuBrr&kA%~d5J{-7^@ZckWMuZp%3{qss zA?C6V2QDN$_z0j8Aw~j&6d7`ee)i$Og@gwm0W>1SNMMj6Lk=;IeK>F-;lW1$jR-Lk z7^KLML(FF%4qQlh@DV^GLW~3kDKg{`3)qJP7ZM(P1ki{OBY{DR3^~L?_Tj*Vga;o1 zG$O=EV2~n14)GxSaNt71gO30j5n?1TNRc6jSj0XYxRCJRBY;MP7zqqgWXK^NVjm7% zNOQGBSMS>1}QS+5Dxor;6lQKj{q7GVk9s~ zks*gz%03*pknrFmfJTHE2@FzX$RU=o4+kzJJopHp5g|qbgA^HZh~@0VfeQ%_J_2Y& Ph>^e`MTQ*q|IYsbH?aMl diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n100000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n100000_cpp.sk deleted file mode 100644 index efe220d12d1d73109524bbb98fa8a84b3bcd484b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1248 zcmXZcSx6OO6bJB=nwd*(Q4(cl1`(2mWm=YVXFFyq)Ps-@(n=c%LHUw>v?ndMhzN>` zx==)h7^$>5%@l>vLO!&-)Ju!hgOCtJQ2+DY@4}xS=gc^F=06uxMvdPS@)4; z@yYgReZ11>+v$34vBzlJ?F?<)9;>}(XKMTGEbVJMTRUWr)22GP+CnE^RAo+q)^=uT zcRH1#b=s-cbEmUV`@&fy218DrHr-t)3(x|zn`}TyNBSm%g_@%NR z9Pra};DN9HEfl7zh->8d;Vj-EzMRO@a9mYP=LD{&HdXL|6kB2OGZd)?*YN%cP;KJ( zQv8J7yju3(_VZNyower}m&o-?;(S0%oZ}NX&)Rl@u}&L`zQ zM{e>dIo^Dm3*#0T1Ie8V-061Wk|mjo9;;Z$On2@)5B<s zn%PKoW`?vVVujFRh)_L5$btyPAVM}o$bksC5FrmD4`qfJ!vFvP diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n10000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_float_buf_n10000_cpp.sk deleted file mode 100644 index e7ceecf8db3b6dd363e1c924d5b789d7b36e2ec8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4024 zcmXZfLC7NK8Nl($x0{W->+UzZYqD-4tp^DnY%xR{A(oxhnL(%pX+22rAdL_Y9;8TO zi7Z2Ef+Qw*utb6hCb6v)Tck+wAnhD1NF&4+DMEypM2aCEWRXRJhyLH$5q|!8RB#^W zy?FMRD<@r)q_QMQ$1|QJ^3k6}#zs 
z*4Ig8yLG#5yM4Rny$jZ5EbSidak)IML?6`?ecYOZcJ=5bYtBZWuzEl0u-b@Tw)$!G ziq%o{s@2b_rlbDOY z(7q8hUe>G|oqtvPfp|akTkUJ{`u-j5=VI>sPWz$g;Y01A_4@C%%VN&_MY}25|C@Fu z8ZotBiPrw5JttcDw^k$4@xR&^qT7}Bu{D3CeJVQspZ1=p<2&t5(f(fBh%O&$e-xEZ zkNSb6e-sG*Ke(>TqJeY$nrP-+x6VoRT({28AI^2_JWbAZ>wK-w zb?f}??VicHbuMLDx6ZA{?z6IPo%bKx4avIrnWV7Yh+W05YBz2-VK-?vWjAd%V>fFz zC+kuk1r$+285J0)qJ}yeXrhHSI_RQ@J_Z02OM#Nbd&wzfD)X_i_Ews@=7d`Ybzz`#hVPb+QW|(7vC01BtgDrO0 z$Y_P=+dmM1Y36hKb;eZn|TyVn!FMRMLfFME$ zBLW2~;z%Hg6w=5biyZPOpokL6sK7uKHPq2S6D_pSK^HyrF~ATbjA3GeDQ1{sfhAU0 zV}mVr*yDgBPLQ5te>mWT3>Vz+zzZMz2q1_M!iYeDiZ~KTB84Q0jSRBLA&&xzD4~oB3{+7=9St(`u zIpk455havSfq^P&sH1@5rF~~aU_sL3Tb4JMGkosP(%r3RA8Wr8tQ1Ei5A-Epo<>* z7+{DI#xOC#6f?}Rz!EF0vB4HQ>~X*mCrHn*KOAsEh6`?Z;Drx<1Q0|BVML%nMH~qv zkwO|7WRXK21r$+285J0)qJ}yeXrhHSI_RQ@J_ZH>qei3As8KOQlOm0U5DQTd@x3#zdyFxzaUD(E_dP1esl~$1#^Qot zWocn~zz<+yV>#Ga3Vr~eIdcL(9{zXkGBY>aMLk3FvRGtRM7j{sy&|u2(;nZF+NP*R zyHShQQR%eLAB>JgC!#4j9j&4Zokjk6@1h^0U!vCO_2a*y|2mvDfBi+Ug=;*%3Qh8 z`?}oJG`%f{o7#tRq^W!^H#f1b&yQ7dLaAzrJ>xu z|8Vxtv(dc1d3`;e{d~Gxv!7qREe|%2+xG0|pLXPdW>>qiUtijt{d(&?*{?sP?DwN8 zvfrPSv)`|JU-tW#4rITtda!NxQ1<)XJ=|80WdB~$WZQZ&`}bHKZCgjHsdn2`ZvQ`y zwM|itT2!XvcvPbnmE&dp)kmGHWUWA!G^*hH`!1aL}ZA_5RpSf zhKLLi86q-7WYK6`G#ZV5qL26S-tbVd7XSG>*?dGljriS2l#HAn17abfK#4L56{^(9 zXwalZPMZ#0dh~hcg8`p>@lB)=BaAXefg+Y?HCWE_*cC=YT_+9C6GEElxS(oSX|TxuVTAH{8Qjt91APwso=mTWsYoLP(IUmTBh2y7%eMRms-&QUEM&9$UCfNQ3I^qs(%^UHmKUD z5%->|>+R^pZ|V?@&o0((<)An6wP)n3a;>)z^*3n4TX7lT%i5kTm}7%lxD-A8NISR@ z?fjs9!Nx?qKeZs;uS(O;dC{gKy$1C<%k}6kv~jmSb_m~7;?t{}a9Z0Xy=nr>KXFqp zc#q|rc%)Bb_68>P7w@oLLvy;N7u)rNK{DomoDL7Mp*}qup1;8N|1AXD8ysK138p47 z&y+)RJqtW(f*|x@C5hSx7mOpvetkY?tkuC0|qch`waMn)9UUTP>16kj}3T@+4<4{YD09= z0Bi&6n=;@wIz4N^1+*t-Ko|}EG2jY1mSVz5w9aP2J~WVJf`)o?OxT9{R+&KOJ+j&a z+V;Kz6R><{U2B3JbvR6*?lCC{AW0c33-i7UOo6x zm#Aj@ePT>hC&U{(F4qcepVrmRXjwQuKU*8rhQf*Jd3D-0?L}=goUF#(aFTLbyQuqV zMtppYu}qDZX&oa))eXjSWxJ88j2J7F_l-2=sFALmG%}QP#!6+9nWfA%15gy3IZDGU zQJyl(Airu>sCvNMrhIQ!L3hsFu1v9NVMCGC3;PVK4f5mG8#r^`YK4*8)@wL8WZi+m 
zm~{++W&gEOU_-jC=BF0g5!ElZ129r!e}m$%tv06Ztj!Rpb=^LMz}`KvI}lLI7keId z|FgfMpB8X>5m?1$Cj-IU+u@WTu+>MM90b(c?aU*%-`AXW^xLCOCW0Cmc8buCO*qM! z(ykv)zYI*2GYVn+Tsk$w536Xs92cb6-_ja;PcN= zE&8*av`*f~TQ5)$&kshZ3r26yB{`3uxJ429KK$ohYLx%Ok3?1N_CKJ@c+bU0bOqz} zk0}T5`F)TASnuZ{8ja)r*I*t0*7#)=I4wn|>G`8SUOybs!(-a&Yr-xFJ=ur=>_NW>C7LV#s-}H#! zs7IAj{>h`QQa0<+M)d!9v{vdS`&58B%coo^EAR>D(k=7}vGGkl;arL>J|T8q9`Cy$ zpK!i;l~34Sm-~cxgy|Eu-L-u}41#<@JZxuNUgHy%&3DCRyM2<&WJ1gK=vvG6>RQY8 L=~|0#2JZh4R0rPy diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_n10000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_float_n10000_cpp.sk deleted file mode 100644 index c89d39292141f81c865719c5c17d1f6d7d1ed322..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1000 zcmXZbO=uHA7zW_&woPqXYW~RIMh%ifgd(XH6pb|5nfXq7P%jb$gYn?8l&VpoE_$i8 zLNB67l%R+pQHqD+K{N*iQ9%(8IoYBfS}4*(DW&va-Y?%A_TilwcG;bG?XJiLt39sa z!owT03MsNIZ>*~`QRr9@J+Uq}#U;@ycJ>Fvw3rhM;+dk&_pgexVohv_&0;t2Yl-W{ z9?pKZW6JI2oD<3QaW1=k?3BBMopbxyr>>uU;|{Rj+(CB59b&uHPByH9(95V0dq_pt z<0=ZRii)w60Ezr3aye>pgy5* z!lh|_1Du^$T`O`|#n*1Lh-gJZ^r#2O0AY1`e7a)~m{FBafdl zf1p<}e9#v!o84F+y=8u5e(jDqfn2|5_Q3T=W`pOSnSSK6uS^Iw7K{z^O>-Vje>Nkq z_0v?~T1&6qQUmfCJ{&I+Wy6O=3cvG}fWaBx0F(k%Q bgwJH9#I%?Zv*NIr6Gz3vVqR?j%edcvW9p5A diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_n1000_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_float_n1000_cpp.sk deleted file mode 100644 index e5d099c7ffdcc00e99485010d633bb7e590d6e5b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 800 zcmXZbJ4-_`6bJCMKC1PtRZ-M-=;9#MMHj)`Y6t>j=vl(A_+_F-aq?B9EaD!&G=){)j z#UQGL#}A4nu_9`*CZ^(flEr$r;)D1s`lJp0pW<)QjyP}FDbs;N3GLs3_@!*J0S;Hn*qs=6JAS9T(*Z|r2$-`lCEda~0I z|6*t08>>ESF~<6fWBnuTiZH&EBg_wdgnYU{oPUzAo-fj9oFB@B{j_EY`wMe47~B0k zVgG)C=eZ>wuM@)e=ql$)S9o5j#(Bb;#Os_VY!K?9Q|gO(b)8VJuuZ6+-XYX8-6hml z?-A-9_6haZ2ZVERhivPJa9*ur)}0W}RX5nsAl!rWlnr9qMOIG}5wA1}G=efj; 
z8{Fa^3m);5J(lcqz#%Kv{Nz`=IsWH|znsidKgC(jae<3m<_cH2&P{f>!(Dc{&jTLv um?u2rd7gId@q(AUV##aX@Rohv@tzMH@R3h^=8!LZD6Igd E0i&!3<^TWy diff --git a/tests/serialization_test_data/cpp_generated_files/tdigest_float_n1_cpp.sk b/tests/serialization_test_data/cpp_generated_files/tdigest_float_n1_cpp.sk deleted file mode 100644 index 94e38d9dc40c413c7397d29ff299d0ec3a27f07c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 RcmZQ%6iH!VV*r8%djJL40YCr% diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n0_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n0_java.sk deleted file mode 100644 index e84c2ea503d5ba25b64de9f767fb3490b28860f5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmZQ%6iH!VWMBXQ0)_y5 diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n1000000_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n1000000_java.sk deleted file mode 100644 index 6b87e1eeb12f364780e191ff2d2839f602da4862..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2224 zcmZA3X-HI26bJA-sV(HB`vEN<$}Yq8aE*RWt`( zQA2alv+5}W{aT|u^wMV9gI>0as?m$~Q4c!SM_bXY2k9Pqc^e%^mmZ;V9FR39=`p(d zG_|1l90jq#6&>UbQ44Cju23;p)Lf-TY_PmRU%^^S7rljgin~jD!qj?qcJ@&CNM*7; zq+}Qf`q)R~Z~%&)QTZ5Ef9JqUvcf<(yss(2rt-q4@2DN>=~jTwj#qi$;795K)5Oo@ zi&ME{-vC`qSFULLK{+mE?f4*drYbu&4pEU4?E6K27?|jR-=x9yVu^+83spU3g>oJ^ zeE0|sWU0KpZxlZ$QH~6=a_(B?N8cma)2Dn!+SvX=xnsiw{sMV;M-2BqSNYK@J6n9p z&+QHlfqZ{i9AAS0wVt2M_j^@+z&(Yn{mM^1P38BmlpXd&t}Iq={gTL*a^*wUlMFt* zcNz=GcVr}UaG|Qd{UMnb7bx%Wrf>s1-zI+wSk?6fexpzPG}<2}4fT;plE;ov*QcdRj)48(nQzQ1a*TQZ3dtVWe}O{D z-LRjt)<|v#e_Jbg5cYr5M#-ljkKHVJNPQi#u1az~)cdwdegb(yjbs=HakEx3jDwh7 zC;1%IudbK89o*U=xfA@dQSt%s8%?qod|{j9Rp9LHk~6`5x@Ur&O_K3^uhtFQU3BV( z_KWwr;W!bY`tNWAtT#z_IM}5dkE;c`p`9XEH?&i%)D7(t#k%3R6lEr_*A49vl_qb| Y4eb$LldDaxHM!2@29y8(5Bb0TAANiA%m4rY diff --git 
a/tests/serialization_test_data/java_generated_files/tdigest_double_n100000_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n100000_java.sk deleted file mode 100644 index fbb27272ea654dc17b107216bbe2e9ce2ac40c54..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2192 zcmZA3Ur19?90%~Lsaa|MNn#I{Q;I^t3PKDkzx^TiASEhUX0q0-84X4eBSMYtr>mn%vKy`pF59-?5!7Yk?m|pJqLLGB_y!-UzW$ee+52ncU(gnyZi<|usxlS z9+<8!XcSQ0dkYQ@Si@}`=K$v4!w^~b_23=VBfA$(6tKYQJ|t0Kknse$6fk^#hI9%L zT7wv(z}ahG;uzHjS_Y6zf%3n8gB-4}ev7!|=?S$5-=mWP=1B;nkopHaAK@nZe|BdfLC^#Qf>&rv`@+qQD1wf1rZujT!@4biKj+pRkg( zxDlKr>jNWbrtw_vQTS+n5+_FCq~FtVW=v{t8y~}2s@H4B(L(n2k0YJt)1MhZ8m(`* zI)Xd2{;Ag^ctO^OBIu!dkrhRd^5xwaMK9&i=ZNArS$Ly(PWAfDC>~I|uV1dWev9HD zw@*Z|gFKibU`#KErY#hBMYiM#1j(TSfevyR1RBXkiGZ7I*(|V)oLV7J!mbinMYdH7 zG9PpE+M@iJf`V!*Y298IB14xii2jYmu)jm%dNQ->!xWrHJ@U?;%>zO#X-ek#fDYW8E|jx>9zu{k+>IJ`X9BhC>K<%l zf6-`Rk0-H)U6jHecDN76*tm##cK9-q?BEp~WnUe@d-kqtxXCWQj!|}Y5JT)Ow~)sh zs^bpEIM2O@PwavFNV8)PP{QlukMN80rXhT1cRq&L&>x>*DXrJ-!^radlTWdoay9Y{ z%g8$Y96xw|dIW`>e|UjX_UD)IvGZRcMpn(Q5n>O%!9KE{c#8(IDt`w*SuaRqoS)}w z8V}jYQDOKE?s6V6xJ1^s4HE1LBi^S$1{XL_WpJLX#xm$;`#kiLO|6GcvhMN_Vc+o3 zOg5f}1~Rf9HjveC4|wgqu!#7-DJi7)~#0b0U}oPhuLpMzu=%1 zePrTR^p`$tMZcLNR`j1vSkaH>gcW_Nd#vo=UTJ76`WvSlC#BVCEB+tau{z`Aj#bLZ V9jmiW?pU33a>r`wzx@Am{{xGM!vX*R diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n1000_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n1000_java.sk deleted file mode 100644 index 7832fc376c8e67faa721dbf0ab4e95953b079262..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1584 zcmZA2%}N4M6bJCptfp@ZD+?IPoj4oTu zBlHNfaM_{xi(!6qX6`-r&bfEoE?-J%A1A-3?$M0>T~e7kx0*3m zH`a-uRYH4(eZqd>fN)TFQ8*;LBpeoA7G4ovB{v_RzAn5Wyopw#)WhFbw$S=Ed3c<5 z(E2X*^0=~xuEwbk;tA^KuI`h{tsKw*x0a+qZsm}Mxa&tW%pE+YQEpA6G4Lskb5~L{ z!QDI~A9v+~rnt2D8MJm!3p^fwz2yMXaf5%aN&SRZPMMtHvGC9E%}g!QMISf2?K>(?-`z8w?mzhUC~@LK4Ih3hLR 
zd}ZPKG=yyn*Y}68Bm5;?vv7Z@3pa$D!YyH?jP0qyUSYp*P&h2SCL9%R{|El}_aCJV BY%c%+ diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n100_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n100_java.sk deleted file mode 100644 index 9d1d60a90feb1ee89c5f72fcaabfec50c18cd8c7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1248 zcmZ9}IZ8x96o%2;je(&-2L=WrA|fKrBl605YNvMW>;*2QmR3t~3B3e^26=TVDGonh zbo;%*Ps~INqx&Y!RO2-LCrz(AN2J=bcn%NnJYK+ycnL4#6}*bq@H*bWn|KRv;~l(< z_wYVGz=!wso)!WkDF@ANq1g5!f8XIyZ6(&LN^jxTzgal!FTk25Yf#{c~vmF`4X diff --git a/tests/serialization_test_data/java_generated_files/tdigest_double_n10_java.sk b/tests/serialization_test_data/java_generated_files/tdigest_double_n10_java.sk deleted file mode 100644 index 2c1026eed9a55fb4a38dd5f9ac9bebf162eddfb2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 192 zcmZQ#6iH!VVPN25Kn5S|Av_fa2>rpH5m}mn!2y-efyNg= Date: Fri, 19 Dec 2025 13:15:17 +0800 Subject: [PATCH 26/26] reuse test data loading logics Signed-off-by: tison --- tests/common.rs | 52 +++++++++++++++++++++++++++++ tests/hll_serialization_test.rs | 42 ++++++----------------- tests/tdigest_serialization_test.rs | 37 ++++++++------------ 3 files changed, 75 insertions(+), 56 deletions(-) create mode 100644 tests/common.rs diff --git a/tests/common.rs b/tests/common.rs new file mode 100644 index 0000000..e97b920 --- /dev/null +++ b/tests/common.rs @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::path::PathBuf; + +#[allow(dead_code)] // false-positive +pub fn test_data(name: &str) -> PathBuf { + const TEST_DATA_DIR: &str = "tests/test_data"; + + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join(TEST_DATA_DIR) + .join(name) +} + +pub fn serialization_test_data(sub_dir: &str, name: &str) -> PathBuf { + const SERDE_TEST_DATA_DIR: &str = "tests/serialization_test_data"; + + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join(SERDE_TEST_DATA_DIR) + .join(sub_dir) + .join(name); + + if !path.exists() { + panic!( + r#"serialization test data file not found: {} + + Please ensure test data files are present in the repository. Generally, you can + run the following commands from the project root to regenerate the test data files + if they are missing: + + $ ./tools/generate_serialization_test_data.py + "#, + path.display(), + ); + } + + path +} diff --git a/tests/hll_serialization_test.rs b/tests/hll_serialization_test.rs index a3c397b..fc1969c 100644 --- a/tests/hll_serialization_test.rs +++ b/tests/hll_serialization_test.rs @@ -24,36 +24,14 @@ //! Test data is generated by the reference implementations and stored in: //! 
`tests/serialization_test_data/` +mod common; + use std::fs; use std::path::PathBuf; +use common::serialization_test_data; use datasketches::hll::HllSketch; -const TEST_DATA_DIR: &str = "tests/serialization_test_data"; - -fn get_test_data_path(sub_dir: &str, name: &str) -> PathBuf { - let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join(TEST_DATA_DIR) - .join(sub_dir) - .join(name); - - if !path.exists() { - panic!( - r#"serialization test data file not found: {} - - Please ensure test data files are present in the repository. Generally, you can - run the following commands from the project root to regenerate the test data files - if they are missing: - - $ ./tools/generate_serialization_test_data.py - "#, - path.display(), - ); - } - - path -} - fn test_sketch_file(path: PathBuf, expected_cardinality: usize, expected_lg_k: u8) { let expected = expected_cardinality as f64; @@ -133,7 +111,7 @@ fn test_java_hll4_compatibility() { for n in test_cases { let filename = format!("hll4_n{}_java.sk", n); - let path = get_test_data_path("java_generated_files", &filename); + let path = serialization_test_data("java_generated_files", &filename); test_sketch_file(path, n, 12); } } @@ -144,7 +122,7 @@ fn test_java_hll6_compatibility() { for n in test_cases { let filename = format!("hll6_n{}_java.sk", n); - let path = get_test_data_path("java_generated_files", &filename); + let path = serialization_test_data("java_generated_files", &filename); test_sketch_file(path, n, 12); } } @@ -155,7 +133,7 @@ fn test_java_hll8_compatibility() { for n in test_cases { let filename = format!("hll8_n{}_java.sk", n); - let path = get_test_data_path("java_generated_files", &filename); + let path = serialization_test_data("java_generated_files", &filename); test_sketch_file(path, n, 12); } } @@ -166,7 +144,7 @@ fn test_cpp_hll4_compatibility() { for n in test_cases { let filename = format!("hll4_n{}_cpp.sk", n); - let path = get_test_data_path("cpp_generated_files", &filename); + let path = 
serialization_test_data("cpp_generated_files", &filename); test_sketch_file(path, n, 12); } } @@ -177,7 +155,7 @@ fn test_cpp_hll6_compatibility() { for n in test_cases { let filename = format!("hll6_n{}_cpp.sk", n); - let path = get_test_data_path("cpp_generated_files", &filename); + let path = serialization_test_data("cpp_generated_files", &filename); test_sketch_file(path, n, 12); } } @@ -188,7 +166,7 @@ fn test_cpp_hll8_compatibility() { for n in test_cases { let filename = format!("hll8_n{}_cpp.sk", n); - let path = get_test_data_path("cpp_generated_files", &filename); + let path = serialization_test_data("cpp_generated_files", &filename); test_sketch_file(path, n, 12); } } @@ -208,7 +186,7 @@ fn test_estimate_accuracy() { println!("{:-<40}", ""); for (dir, file, expected) in test_cases { - let path = get_test_data_path(dir, file); + let path = serialization_test_data(dir, file); let bytes = fs::read(&path).unwrap(); let sketch = HllSketch::deserialize(&bytes).unwrap(); let estimate = sketch.estimate(); diff --git a/tests/tdigest_serialization_test.rs b/tests/tdigest_serialization_test.rs index 74fbdb8..0ad68e4 100644 --- a/tests/tdigest_serialization_test.rs +++ b/tests/tdigest_serialization_test.rs @@ -15,27 +15,16 @@ // specific language governing permissions and limitations // under the License. 
-use datasketches::tdigest::TDigestMut; -use googletest::assert_that; -use googletest::prelude::{eq, near}; +mod common; + use std::fs; use std::path::PathBuf; -const TEST_DATA_DIR: &str = "tests/test_data"; -const SERDE_TEST_DATA_DIR: &str = "tests/serialization_test_data"; - -fn get_test_data_path(name: &str) -> PathBuf { - PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join(TEST_DATA_DIR) - .join(name) -} - -fn get_serde_test_data_path(sub_dir: &str, name: &str) -> PathBuf { - PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join(SERDE_TEST_DATA_DIR) - .join(sub_dir) - .join(name) -} +use common::serialization_test_data; +use common::test_data; +use datasketches::tdigest::TDigestMut; +use googletest::assert_that; +use googletest::prelude::{eq, near}; fn test_sketch_file(path: PathBuf, n: u64, with_buffer: bool, is_f32: bool) { let bytes = fs::read(&path).unwrap(); @@ -76,22 +65,22 @@ fn test_deserialize_from_cpp_snapshots() { let ns = [0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000]; for n in ns { let filename = format!("tdigest_double_n{}_cpp.sk", n); - let path = get_serde_test_data_path("cpp_generated_files", &filename); + let path = serialization_test_data("cpp_generated_files", &filename); test_sketch_file(path, n, false, false); } for n in ns { let filename = format!("tdigest_double_buf_n{}_cpp.sk", n); - let path = get_serde_test_data_path("cpp_generated_files", &filename); + let path = serialization_test_data("cpp_generated_files", &filename); test_sketch_file(path, n, true, false); } for n in ns { let filename = format!("tdigest_float_n{}_cpp.sk", n); - let path = get_serde_test_data_path("cpp_generated_files", &filename); + let path = serialization_test_data("cpp_generated_files", &filename); test_sketch_file(path, n, false, true); } for n in ns { let filename = format!("tdigest_float_buf_n{}_cpp.sk", n); - let path = get_serde_test_data_path("cpp_generated_files", &filename); + let path = serialization_test_data("cpp_generated_files", &filename); 
test_sketch_file(path, n, true, true); } } @@ -102,7 +91,7 @@ fn test_deserialize_from_reference_implementation() { "tdigest_ref_k100_n10000_double.sk", "tdigest_ref_k100_n10000_float.sk", ] { - let path = get_test_data_path(filename); + let path = test_data(filename); let bytes = fs::read(&path).unwrap(); let td = TDigestMut::deserialize(&bytes, false).unwrap(); let td = td.freeze(); @@ -138,7 +127,7 @@ fn test_deserialize_from_java_snapshots() { let ns = [0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000]; for n in ns { let filename = format!("tdigest_double_n{}_java.sk", n); - let path = get_serde_test_data_path("java_generated_files", &filename); + let path = serialization_test_data("java_generated_files", &filename); test_sketch_file(path, n, false, false); } }