Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions build/bazel/remote/execution/v2/fastcdc2020_test_vectors.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Test vectors for the FastCDC 2020 content-defined chunking algorithm
#
# Reference implementations:
# - Rust: https://github.com/nlfiedler/fastcdc-rs
# - Go: https://github.com/buildbuddy-io/fastcdc2020
#
# Test input:
# Image: https://github.com/nlfiedler/fastcdc-rs/blob/49c3d0b/test/fixtures/SekienAkashita.jpg
# SHA256: d9e749d9367fc908876749d6502eb212fee88c9a94892fb07da5ef3ba8bc39ed
# Size: 109466 bytes
#
# Parameters:
# MinSize: 4096
# AvgSize: 16384 (must be power of 2)
# MaxSize: 65535
# Normalization: 2
#
# Format: one chunk per line, fields separated by single spaces:
#   offset length sha256 fingerprint
# The fingerprint is the 64-bit gear hash value at the chunk boundary,
# written as an unsigned decimal integer.

# Seed: 0
0 19186 0f9efa589121d5d9e9e2c4ace91337d77cae866537143f6f15a0ffd525a77c2d 17583755766661134474
19186 19279 c7c86a165573c16448cda35c9169742e85645af42be22889f8b96b8ee0ec7cb0 4098594969649699419
38465 17354 bc88521e28a8b4479cdea5f75aa721a24f3a0a7d0be903aa6d505c574e51e89d 2365586132076908760
55819 16387 4b8dac2652e4685c629d2bb1ae9d4448e676b86f2e67ca0b2fff3d9580184b79 16009206469796846404
72206 19940 c0a7062da6f2386c28e086ee0cedd5732252741269838773cff1ddb05b2df6ed 2473608525189754172
92146 17320 7fa5b12134dc75cd2ac8dc60d3a8f3c8d22f0ee9d4cf74a4aa937e2a0d2d79a5 2504464741100432583

# Seed: 666
0 17635 cb3a9d80a3569772d4ed331ca37ab0c862c759897b890fc1aac90a4f2ea3a407 17021115692437263050
17635 17334 d758c6b7b0b7eef1e996f8ccd17de6c645360b03a26c35541e7581348ac08944 8231525949846907466
34969 19136 24846aefd89e510594bae3e9d7d5ea5012067601512610fed126a3c57ba993f5 10944310959829698982
54105 17467 efa785e1fefb49f190e665f72fd246c1442079874508c312196da1fb3040d00b 13602876513398592944
71572 23593 a2f557bdd8d40d8faada963ad5f91ec54b10ccee7c5ae72754a65137592dc607 2945079350535657389
95165 14301 e131100b4a7147ccad19dc63c4a2fac1f5d8b644e1373eeb6803825024234efc 8981594897574481255
126 changes: 125 additions & 1 deletion build/bazel/remote/execution/v2/remote_execution.proto
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ service ContentAddressableStorage {
//
// When blob splitting and splicing is used at the same time, the clients and
// the server SHOULD agree out-of-band upon a chunking algorithm used by both
// parties to benefit from each others chunk data and avoid unnecessary data
// parties to benefit from each other's chunk data and avoid unnecessary data
// duplication.
//
// Errors:
Expand Down Expand Up @@ -1986,6 +1986,11 @@ message SplitBlobRequest {
// length of the blob digest hashes and the digest functions announced
// in the server's capabilities.
DigestFunction.Value digest_function = 3;

// The chunking function that the client prefers to use.
//
// The server MAY use a different chunking function.
ChunkingFunction.Value chunking_function = 4;
}

// A response message for
Expand All @@ -1998,6 +2003,9 @@ message SplitBlobResponse {
// The server MUST use the same digest function as the one explicitly or
// implicitly (through hash length) specified in the split request.
repeated Digest chunk_digests = 1;

// The chunking function used to split the blob.
ChunkingFunction.Value chunking_function = 2;
}

// A request message for
Expand Down Expand Up @@ -2036,6 +2044,9 @@ message SpliceBlobRequest {
// server SHOULD infer the digest function using the length of the blob digest
// hashes and the digest functions announced in the server's capabilities.
DigestFunction.Value digest_function = 4;

// The chunking function that the client used to split the blob.
ChunkingFunction.Value chunking_function = 5;
}

// A response message for
Expand Down Expand Up @@ -2178,6 +2189,34 @@ message DigestFunction {
}
}

// The chunking function is used to split a blob into chunks.
//
// The server advertises support for a chunking function by setting the
// corresponding params field in
// [CacheCapabilities][build.bazel.remote.execution.v2.CacheCapabilities]:
// - fast_cdc_2020_params is set: the server supports FAST_CDC_2020.
// - rep_max_cdc_params is set: the server supports REP_MAX_CDC.
//
// For optimal deduplication, clients SHOULD use an advertised chunking
// function. When clients use UNKNOWN, the server chooses an algorithm for
// SplitBlob and simply verifies chunk concatenation for SpliceBlob.
message ChunkingFunction {
// The chunking algorithms that can be named in SplitBlob and SpliceBlob
// messages.
enum Value {
// No specific algorithm. Servers MUST always accept this value.
// For SplitBlob, the server chooses the algorithm. For SpliceBlob, the
// server only verifies that chunks concatenate to form the expected blob.
UNKNOWN = 0;

// The FastCDC chunking algorithm as described in the 2020 paper by
// Wen Xia, et al. See https://ieeexplore.ieee.org/document/9055082
// for details. Its parameters are advertised in the fast_cdc_2020_params
// field of CacheCapabilities.
FAST_CDC_2020 = 1;

// The RepMaxCDC chunking algorithm as implemented by buildbarn/go-cdc.
// See https://github.com/buildbarn/go-cdc for details. Its parameters are
// advertised in the rep_max_cdc_params field of CacheCapabilities.
REP_MAX_CDC = 2;
}
}

// Describes the server/instance capabilities for updating the action cache.
message ActionCacheUpdateCapabilities {
bool update_enabled = 1;
Expand Down Expand Up @@ -2299,6 +2338,91 @@ message CacheCapabilities {
// [ContentAddressableStorage.SpliceBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SpliceBlob]
// operation.
bool splice_blob_support = 10;

// The parameters for the FastCDC 2020 chunking algorithm.
// If set, the server supports the FAST_CDC_2020 chunking function.
FastCdc2020Params fast_cdc_2020_params = 11;

// The parameters for the RepMaxCDC chunking algorithm.
// If set, the server supports the RepMaxCDC chunking algorithm.
RepMaxCdcParams rep_max_cdc_params = 12;
}

// Parameters for the FastCDC 2020 content-defined chunking algorithm.
//
// Implementations MUST follow the FastCDC 2020 paper by Wen Xia, et al.:
// https://ieeexplore.ieee.org/document/9055082
//
// Supported implementations:
// - Rust: https://docs.rs/fastcdc/3.2.1/fastcdc/v2020/index.html
// - Go: https://github.com/buildbuddy-io/fastcdc2020
//
// Test vectors can be found in the accompanying
// fastcdc2020_test_vectors.txt file.
//
// Implementations MUST use normalization level 2, which has been found
// successful for build artifacts with an average chunk size of 512 KiB.
//
// Key algorithm components from the paper:
//
// GEAR table: 256 64-bit integers for the rolling hash, computed as:
// GEAR[i] = high_64_bits(MD5(byte(i))) for i in 0..255
//
// MASKS table: Bit patterns for chunk boundary detection, derived from
// the C reference implementation. The mask selection based on average
// chunk size SHOULD match the paper.
//
// The minimum and maximum chunk sizes MUST be derived from the average:
// - min_chunk_size = avg_chunk_size_bytes / 4
// - max_chunk_size = avg_chunk_size_bytes * 4
//
// NOTE(review): the accompanying test vectors list MaxSize 65535 for
// AvgSize 16384, whereas avg_chunk_size_bytes * 4 is 65536 -- confirm which
// value is intended.
//
// Blobs smaller than max_chunk_size (avg_chunk_size_bytes * 4) SHOULD be
// uploaded without chunking.
//
// If any of the advertised parameters are not within the expected range,
// the client SHOULD ignore FastCDC chunking function support.
message FastCdc2020Params {
// The average (expected) chunk size, in bytes, for the FastCDC chunking
// algorithm. The value MUST be between 1 KiB (1024) and 1 MiB (1048576).
// The recommended value is 524288 (512 KiB).
//
// NOTE(review): the accompanying test vectors state that the average size
// must be a power of 2 -- confirm whether that constraint applies here.
uint64 avg_chunk_size_bytes = 1;

// The seed for the FastCDC mask generation.
// The recommended value is 0.
//
// All clients sharing a cache SHOULD use the same seed to maximize
// chunk reuse. The accompanying test vectors show the same input producing
// different chunk boundaries for seed 0 and seed 666.
uint32 seed = 2;
}

// Parameters for the RepMaxCDC content-defined chunking algorithm.
//
// Supported implementations:
// - Go: https://github.com/buildbarn/go-cdc
//
// Key algorithm components:
//
// GEAR table: 256 64-bit integers for the rolling hash, computed as:
// GEAR[i] = high_64_bits(MD5(byte(i))) for i in 0..255
//
// The algorithm repeatedly applies chunking until all chunks are in the
// range [min_chunk_size_bytes, 2*min_chunk_size_bytes). Cutting points are
// selected where the Gear rolling hash is maximized within a lookahead
// window of horizon_size_bytes.
//
// If any of the advertised parameters are not within the expected range,
// the client SHOULD ignore RepMaxCDC chunking function support.
message RepMaxCdcParams {
// The minimum chunk size, in bytes, for the RepMaxCDC chunking algorithm.
// The value MUST be at least 64 bytes (the Gear hash window size).
// All chunks will be in the range
// [min_chunk_size_bytes, 2*min_chunk_size_bytes).
// The recommended value is 262144 (256 KiB).
uint64 min_chunk_size_bytes = 1;

// The lookahead window, in bytes, for finding optimal cutting points.
// Larger values improve deduplication quality with diminishing returns.
// Setting this to 0 produces uniform chunks of min_chunk_size_bytes.
// The recommended value is 8 * min_chunk_size_bytes, i.e. 2097152 (2 MiB)
// for the recommended min_chunk_size_bytes of 262144.
uint64 horizon_size_bytes = 2;
}

// Capabilities of the remote execution system.
Expand Down