diff --git a/build/bazel/remote/execution/v2/fastcdc2020_test_vectors.txt b/build/bazel/remote/execution/v2/fastcdc2020_test_vectors.txt new file mode 100644 index 00000000..db89efd7 --- /dev/null +++ b/build/bazel/remote/execution/v2/fastcdc2020_test_vectors.txt @@ -0,0 +1,35 @@ +# Test vectors for the FastCDC 2020 content-defined chunking algorithm +# +# Reference implementations: +# - Rust: https://github.com/nlfiedler/fastcdc-rs +# - Go: https://github.com/buildbuddy-io/fastcdc2020 +# +# Test input: +# Image: https://github.com/nlfiedler/fastcdc-rs/blob/49c3d0b/test/fixtures/SekienAkashita.jpg +# SHA256: d9e749d9367fc908876749d6502eb212fee88c9a94892fb07da5ef3ba8bc39ed +# Size: 109466 bytes +# +# Parameters: +# MinSize: 4096 +# AvgSize: 16384 (must be a power of 2) +# MaxSize: 65535 (note: this is 4 * AvgSize - 1, not 4 * AvgSize; no chunk below reaches it) +# Normalization: 2 +# +# Format (one chunk per line, space-separated): offset length sha256 fingerprint +# The fingerprint is the 64-bit gear hash value at the chunk boundary, as an unsigned decimal integer. + +# Seed: 0 +0 19186 0f9efa589121d5d9e9e2c4ace91337d77cae866537143f6f15a0ffd525a77c2d 17583755766661134474 +19186 19279 c7c86a165573c16448cda35c9169742e85645af42be22889f8b96b8ee0ec7cb0 4098594969649699419 +38465 17354 bc88521e28a8b4479cdea5f75aa721a24f3a0a7d0be903aa6d505c574e51e89d 2365586132076908760 +55819 16387 4b8dac2652e4685c629d2bb1ae9d4448e676b86f2e67ca0b2fff3d9580184b79 16009206469796846404 +72206 19940 c0a7062da6f2386c28e086ee0cedd5732252741269838773cff1ddb05b2df6ed 2473608525189754172 +92146 17320 7fa5b12134dc75cd2ac8dc60d3a8f3c8d22f0ee9d4cf74a4aa937e2a0d2d79a5 2504464741100432583 + +# Seed: 666 +0 17635 cb3a9d80a3569772d4ed331ca37ab0c862c759897b890fc1aac90a4f2ea3a407 17021115692437263050 +17635 17334 d758c6b7b0b7eef1e996f8ccd17de6c645360b03a26c35541e7581348ac08944 8231525949846907466 +34969 19136 24846aefd89e510594bae3e9d7d5ea5012067601512610fed126a3c57ba993f5 10944310959829698982 +54105 17467 efa785e1fefb49f190e665f72fd246c1442079874508c312196da1fb3040d00b 13602876513398592944 +71572 23593 
a2f557bdd8d40d8faada963ad5f91ec54b10ccee7c5ae72754a65137592dc607 2945079350535657389 +95165 14301 e131100b4a7147ccad19dc63c4a2fac1f5d8b644e1373eeb6803825024234efc 8981594897574481255 diff --git a/build/bazel/remote/execution/v2/remote_execution.proto b/build/bazel/remote/execution/v2/remote_execution.proto index d649338e..2a26e42a 100644 --- a/build/bazel/remote/execution/v2/remote_execution.proto +++ b/build/bazel/remote/execution/v2/remote_execution.proto @@ -486,7 +486,7 @@ service ContentAddressableStorage { // // When blob splitting and splicing is used at the same time, the clients and // the server SHOULD agree out-of-band upon a chunking algorithm used by both - // parties to benefit from each others chunk data and avoid unnecessary data + // parties to benefit from each other's chunk data and avoid unnecessary data // duplication. // // Errors: @@ -1986,6 +1986,11 @@ message SplitBlobRequest { // length of the blob digest hashes and the digest functions announced // in the server's capabilities. DigestFunction.Value digest_function = 3; + + // The chunking function that the client prefers to use. + // + // The server MAY use a different chunking function. + ChunkingFunction.Value chunking_function = 4; } // A response message for @@ -1998,6 +2003,9 @@ message SplitBlobResponse { // The server MUST use the same digest function as the one explicitly or // implicitly (through hash length) specified in the split request. repeated Digest chunk_digests = 1; + + // The chunking function used to split the blob. + ChunkingFunction.Value chunking_function = 2; } // A request message for @@ -2036,6 +2044,9 @@ message SpliceBlobRequest { // server SHOULD infer the digest function using the length of the blob digest // hashes and the digest functions announced in the server's capabilities. DigestFunction.Value digest_function = 4; + + // The chunking function that the client used to split the blob. 
+ ChunkingFunction.Value chunking_function = 5; } // A response message for @@ -2178,6 +2189,34 @@ message DigestFunction { } } +// The chunking function is used to split a blob into chunks. +// +// The server advertises support for a chunking function by setting the +// corresponding params field in +// [CacheCapabilities][build.bazel.remote.execution.v2.CacheCapabilities]. +// For example, setting fast_cdc_2020_params advertises FAST_CDC_2020. +// +// For optimal deduplication, clients SHOULD use an advertised chunking +// function. When clients use UNKNOWN, the server chooses an algorithm for +// SplitBlob and only verifies chunk concatenation for SpliceBlob. +message ChunkingFunction { + enum Value { + // No specific algorithm. Servers MUST always accept this value. + // For SplitBlob, the server chooses the algorithm. For SpliceBlob, the + // server only verifies that chunks concatenate to form the expected blob. + UNKNOWN = 0; + + // The FastCDC chunking algorithm as described in the 2020 paper by + // Wen Xia, et al. See https://ieeexplore.ieee.org/document/9055082 + // for details. Parameters: fast_cdc_2020_params. + FAST_CDC_2020 = 1; + + // The RepMaxCDC chunking algorithm as implemented by buildbarn/go-cdc. + // See https://github.com/buildbarn/go-cdc. Parameters: rep_max_cdc_params. + REP_MAX_CDC = 2; + } +} + // Describes the server/instance capabilities for updating the action cache. message ActionCacheUpdateCapabilities { bool update_enabled = 1; @@ -2299,6 +2338,91 @@ message CacheCapabilities { // [ContentAddressableStorage.SpliceBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SpliceBlob] // operation. bool splice_blob_support = 10; + + // The parameters for the FastCDC 2020 chunking algorithm. + // If set, the server supports the FAST_CDC_2020 chunking function. + FastCdc2020Params fast_cdc_2020_params = 11; + + // The parameters for the RepMaxCDC chunking algorithm. + // If set, the server supports the REP_MAX_CDC chunking function. 
+ RepMaxCdcParams rep_max_cdc_params = 12; +} + +// Parameters for the FastCDC 2020 content-defined chunking algorithm. +// +// Implementations MUST follow the FastCDC 2020 paper by Wen Xia, et al.: +// https://ieeexplore.ieee.org/document/9055082 +// +// Reference implementations: +// - Rust: https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/index.html +// - Go: https://github.com/buildbuddy-io/fastcdc2020 +// +// Test vectors: see the accompanying fastcdc2020_test_vectors.txt file. +// +// Implementations MUST use normalization level 2, which has been found to +// work well for build artifacts with an average chunk size of 512 KiB. +// +// Key algorithm components from the paper: +// +// GEAR table: 256 64-bit integers for the rolling hash, computed as: +// GEAR[i] = high_64_bits(MD5(byte(i))) for i in 0..255 (inclusive) +// +// MASKS table: Bit patterns for chunk boundary detection, derived from +// the C reference implementation. The mask selection based on average +// chunk size SHOULD match the paper. +// +// The minimum and maximum chunk sizes MUST be derived from the average: +// - min_chunk_size = avg_chunk_size_bytes / 4 +// - max_chunk_size = avg_chunk_size_bytes * 4 +// +// Blobs smaller than max_chunk_size (avg_chunk_size_bytes * 4) SHOULD be +// uploaded without chunking. +// +// If any of the advertised parameters are not within the expected range, +// the client SHOULD ignore FastCDC chunking function support. +message FastCdc2020Params { + // The average (expected) chunk size, in bytes, for the FastCDC algorithm. + // The value MUST be between 1 KiB and 1 MiB. The recommended value is + // 524288 (512 KiB). NOTE(review): test vectors say power of 2 - confirm. + uint64 avg_chunk_size_bytes = 1; + + // The seed for the FastCDC mask generation. + // The recommended value is 0. + // + // All clients sharing a cache SHOULD use the same seed to maximize + // chunk reuse. + uint32 seed = 2; +} + +// Parameters for the RepMaxCDC content-defined chunking algorithm. 
+// +// Reference implementations: +// - Go: https://github.com/buildbarn/go-cdc +// +// Key algorithm components: +// +// GEAR table: 256 64-bit integers for the rolling hash, computed as: +// GEAR[i] = high_64_bits(MD5(byte(i))) for i in 0..255 (inclusive) +// +// The algorithm repeatedly applies chunking until all chunks are in the +// range [min_chunk_size_bytes, 2 * min_chunk_size_bytes). Cutting points are +// selected where the Gear rolling hash is maximized within a lookahead +// window of horizon_size_bytes. +// +// If any of the advertised parameters are not within the expected range, +// the client SHOULD ignore RepMaxCDC chunking function support. +message RepMaxCdcParams { + // The minimum chunk size, in bytes, for the RepMaxCDC chunking algorithm. + // The value MUST be at least 64 bytes (the Gear hash window size). + // Chunk sizes fall in [min_chunk_size_bytes, 2 * min_chunk_size_bytes). + // The recommended value is 262144 (256 KiB). + uint64 min_chunk_size_bytes = 1; + + // The size, in bytes, of the lookahead window used to find cutting points. + // Larger values improve deduplication quality with diminishing returns. + // Setting this to 0 produces uniform chunks of min_chunk_size_bytes. + // The recommended value is 8 * min_chunk_size_bytes. + uint64 horizon_size_bytes = 2; } // Capabilities of the remote execution system.