From 24cc351546b82c0ee86561270fc6cfa103081185 Mon Sep 17 00:00:00 2001 From: Ed Schouten Date: Mon, 16 Feb 2026 15:25:27 +0100 Subject: [PATCH] Give some guidance on what the average chunk size of RepMaxCDC is MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even though the minimum and maximum chunk sizes are good to know, the most interesting piece of information is the average chunk size. That determines the number of chunks it takes to store a given file. For RepMaxCDC the chunk size distribution is analogous to that of the distance between cars in Rényi's parking problem. This means that in order to convert the minimum chunk size to the average, we only need to divide it by the expected density. --- build/bazel/remote/execution/v2/remote_execution.pb.go | 5 +++++ build/bazel/remote/execution/v2/remote_execution.proto | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/build/bazel/remote/execution/v2/remote_execution.pb.go b/build/bazel/remote/execution/v2/remote_execution.pb.go index ad7d498b..1ece63f9 100755 --- a/build/bazel/remote/execution/v2/remote_execution.pb.go +++ b/build/bazel/remote/execution/v2/remote_execution.pb.go @@ -4643,6 +4643,11 @@ func (x *FastCdc2020Params) GetSeed() uint32 { // selected where the Gear rolling hash is maximized within a lookahead // window of horizon_size_bytes. // +// For sufficiently large files, the average chunk size prior to +// deduplication will approximately be min_chunk_size_bytes divided by +// Rényi's parking constant (0.7475979202...). More details: +// https://mathworld.wolfram.com/RenyisParkingConstants.html +// // If any of the advertised parameters are not within the expected range, // the client SHOULD ignore RepMaxCDC chunking function support. 
type RepMaxCdcParams struct { diff --git a/build/bazel/remote/execution/v2/remote_execution.proto b/build/bazel/remote/execution/v2/remote_execution.proto index e9cab05d..8fa38597 100644 --- a/build/bazel/remote/execution/v2/remote_execution.proto +++ b/build/bazel/remote/execution/v2/remote_execution.proto @@ -2409,6 +2409,11 @@ message FastCdc2020Params { // selected where the Gear rolling hash is maximized within a lookahead // window of horizon_size_bytes. // +// For sufficiently large files, the average chunk size prior to +// deduplication will approximately be min_chunk_size_bytes divided by +// Rényi's parking constant (0.7475979202...). More details: +// https://mathworld.wolfram.com/RenyisParkingConstants.html +// // If any of the advertised parameters are not within the expected range, // the client SHOULD ignore RepMaxCDC chunking function support. message RepMaxCdcParams {