diff options
author | Klaus Aehlig <klaus.aehlig@huawei.com> | 2024-04-12 10:29:11 +0200 |
---|---|---|
committer | Klaus Aehlig <klaus.aehlig@huawei.com> | 2024-04-12 11:59:01 +0200 |
commit | 7ee931b51e4f02313c86e9c5f7c64aadbc4b03de (patch) | |
tree | 4c79b16277c092f4dc2d74ec977ebd985aa19b91 | |
parent | 920dbcad30333ea91e34c8c5da07bb6f499c4925 (diff) | |
download | justbuild-7ee931b51e4f02313c86e9c5f7c64aadbc4b03de.tar.gz |
file chunker: increase chunk sizes
As we use chunking also for reducing storage, we have to consider
the overhead of block devices which is in the order of kB per file.
So our target chunk size should be at least 2 orders of magnitude
above this. This suggests aiming for a chunk size of at least
128kB, a target size that also has the advantage that the maximal
chunk size associated with this size is 1MB which is still well
below the maximal transmission size of grpc allowing us to avoid
the streaming API.
As we're scaling everything up by a factor of 16, we also have
to increase the number of bits in the involved masks by 4. We use
this to also extend the window size by using the 2 most significant
octets. The paper proposing FastCDC advises spreading out the ones
roughly equally, which suggests 0x4444 as a suitable value for the
two most significant octets.
We also change the suggested extension of the remote-execution API
accordingly. As the precise parameters for FastCDC when announced
over the remote-execution APIs are still under discussion upstream,
we simplify the name to not mention the target size.
7 files changed, 26 insertions, 27 deletions
diff --git a/etc/patches/remote_execution.proto.diff b/etc/patches/remote_execution.proto.diff index 12e364e5..9a66debf 100644 --- a/etc/patches/remote_execution.proto.diff +++ b/etc/patches/remote_execution.proto.diff @@ -165,7 +165,7 @@ // [Capabilities.GetCapabilities][build.bazel.remote.execution.v2.Capabilities.GetCapabilities]. message GetCapabilitiesRequest { // The instance of the execution system to operate against. A server may -@@ -1723,6 +1874,36 @@ +@@ -1723,6 +1874,37 @@ } } @@ -189,20 +189,21 @@ + // the implementation can be found in algorithm 2 (FastCDC8KB) of + // https://ieeexplore.ieee.org/document/9055082. This algorithm has the + // following properties: -+ // - minimum chunk size: 2 KB -+ // - maximum chunk size: 64 KB -+ // - average chunk size: 8 KB ++ // - minimum chunk size: 32 KB ++ // - maximum chunk size: 1024 KB ++ // - average chunk size: 128 KB + // The 256 64-bit random numbers in the Gear table are created with the + // Mersenne Twister pseudo-random generator for 64-bit numbers with a state + // size of 19937 bits and a seed of 0. -+ FASTCDC_MT0_8KB = 2; ++ // The masks are extended by setting the 2 most significant bytes to 0x4444. ++ FASTCDC = 2; + } +} + // Capabilities of the remote cache system. message CacheCapabilities { // All the digest functions supported by the remote cache. -@@ -1751,6 +1932,25 @@ +@@ -1751,6 +1933,25 @@ // Note that this does not imply which if any compressors are supported by // the server at the gRPC level. 
repeated Compressor.Value supported_compressors = 6; diff --git a/src/buildtool/execution_api/execution_service/capabilities_server.cpp b/src/buildtool/execution_api/execution_service/capabilities_server.cpp index 1f34b388..71a1361f 100644 --- a/src/buildtool/execution_api/execution_service/capabilities_server.cpp +++ b/src/buildtool/execution_api/execution_service/capabilities_server.cpp @@ -41,8 +41,7 @@ auto CapabilitiesServiceImpl::GetCapabilities( static_assert(kMaxBatchTransferSize < GRPC_DEFAULT_MAX_RECV_MESSAGE_LENGTH, "Max batch transfer size too large."); cache.add_supported_chunking_algorithms( - ::bazel_re::ChunkingAlgorithm_Value:: - ChunkingAlgorithm_Value_FASTCDC_MT0_8KB); + ::bazel_re::ChunkingAlgorithm_Value::ChunkingAlgorithm_Value_FASTCDC); *(response->mutable_cache_capabilities()) = cache; exec.set_digest_function( diff --git a/src/buildtool/execution_api/execution_service/cas_server.cpp b/src/buildtool/execution_api/execution_service/cas_server.cpp index da585851..b9143daa 100644 --- a/src/buildtool/execution_api/execution_service/cas_server.cpp +++ b/src/buildtool/execution_api/execution_service/cas_server.cpp @@ -51,8 +51,8 @@ static auto ChunkingAlgorithmToString(::bazel_re::ChunkingAlgorithm_Value type) ChunkingAlgorithm_Value_RABINCDC_8KB: return "RABINCDC_8KB"; case ::bazel_re::ChunkingAlgorithm_Value:: - ChunkingAlgorithm_Value_FASTCDC_MT0_8KB: - return "FASTCDC_MT0_8KB"; + ChunkingAlgorithm_Value_FASTCDC: + return "FASTCDC"; default: return "[Unknown Chunking Algorithm Type]"; } @@ -254,16 +254,15 @@ auto CASServiceImpl::SplitBlob(::grpc::ServerContext* /*context*/, if (chunking_algorithm != ::bazel_re::ChunkingAlgorithm_Value:: ChunkingAlgorithm_Value_IDENTITY and chunking_algorithm != ::bazel_re::ChunkingAlgorithm_Value:: - ChunkingAlgorithm_Value_FASTCDC_MT0_8KB) { - logger_.Emit( - LogLevel::Warning, - fmt::format( - "SplitBlob: unsupported chunking algorithm {}, will use " - "default implementation {}", - 
ChunkingAlgorithmToString(chunking_algorithm), - ChunkingAlgorithmToString( - ::bazel_re::ChunkingAlgorithm_Value:: - ChunkingAlgorithm_Value_FASTCDC_MT0_8KB))); + ChunkingAlgorithm_Value_FASTCDC) { + logger_.Emit(LogLevel::Warning, + fmt::format("SplitBlob: unsupported chunking algorithm " + "{}, will use " + "default implementation {}", + ChunkingAlgorithmToString(chunking_algorithm), + ChunkingAlgorithmToString( + ::bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_FASTCDC))); } // Acquire garbage collection lock. diff --git a/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp b/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp index e7dea224..748dee6f 100644 --- a/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp +++ b/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp @@ -388,8 +388,8 @@ auto BazelCasClient::SplitBlob(std::string const& instance_name, bazel_re::SplitBlobRequest request{}; request.set_instance_name(instance_name); request.mutable_blob_digest()->CopyFrom(blob_digest); - request.set_chunking_algorithm(bazel_re::ChunkingAlgorithm_Value:: - ChunkingAlgorithm_Value_FASTCDC_MT0_8KB); + request.set_chunking_algorithm( + bazel_re::ChunkingAlgorithm_Value::ChunkingAlgorithm_Value_FASTCDC); bazel_re::SplitBlobResponse response{}; auto [ok, status] = WithRetry( [this, &response, &request]() { diff --git a/src/buildtool/storage/file_chunker.cpp b/src/buildtool/storage/file_chunker.cpp index 8e747900..b94f487a 100644 --- a/src/buildtool/storage/file_chunker.cpp +++ b/src/buildtool/storage/file_chunker.cpp @@ -23,8 +23,8 @@ namespace { // Mask values taken from algorithm 2 of the paper // https://ieeexplore.ieee.org/document/9055082. 
-constexpr std::uint64_t kMaskS{0x0000d9f003530000ULL}; // 15 '1' bits -constexpr std::uint64_t kMaskL{0x0000d90003530000ULL}; // 11 '1' bits +constexpr std::uint64_t kMaskS{0x4444d9f003530000ULL}; // 19 '1' bits +constexpr std::uint64_t kMaskL{0x4444d90003530000ULL}; // 15 '1' bits // Predefined array of 256 random 64-bit integers, needs to be initialized. constexpr std::uint32_t kRandomTableSize{256}; diff --git a/src/buildtool/storage/file_chunker.hpp b/src/buildtool/storage/file_chunker.hpp index f2aea001..914de3f0 100644 --- a/src/buildtool/storage/file_chunker.hpp +++ b/src/buildtool/storage/file_chunker.hpp @@ -32,7 +32,7 @@ /// A read buffer is used to progressively process the file content instead of /// reading the entire file content in memory. class FileChunker { - static constexpr std::uint32_t kAverageChunkSize{1024 * 8}; // 8 KB + static constexpr std::uint32_t kAverageChunkSize{1024 * 128}; // 128 KB static constexpr std::uint32_t kDefaultSeed{0}; public: diff --git a/test/buildtool/storage/large_object_cas.test.cpp b/test/buildtool/storage/large_object_cas.test.cpp index f9a427f8..9b0fc90b 100644 --- a/test/buildtool/storage/large_object_cas.test.cpp +++ b/test/buildtool/storage/large_object_cas.test.cpp @@ -67,8 +67,8 @@ using File = Blob<false>; class Tree final { public: - static constexpr auto kLargeId = std::string_view("tree_256"); - static constexpr auto kLargeSize = std::uintmax_t(256); + static constexpr auto kLargeId = std::string_view("tree_4096"); + static constexpr auto kLargeSize = std::uintmax_t(4096); static constexpr auto kSmallId = std::string_view("tree_1"); static constexpr auto kSmallSize = std::uintmax_t(1); |