From 7ee931b51e4f02313c86e9c5f7c64aadbc4b03de Mon Sep 17 00:00:00 2001 From: Klaus Aehlig Date: Fri, 12 Apr 2024 10:29:11 +0200 Subject: file chunker: increase chunk sizes As we use chunking also for reducing storage, we have to consider the overhead of block devices which is in the order of kB per file. So our target chunk size should be at least 2 orders of magnitude above this. This suggests to minimally aim for a chunk size of 128kB, a target size that also has the advantage that the maximal chunk size associated with this size is 1MB which is still well below the maximal transmission size of grpc allowing us to avoid the streaming API. As we're scaling everything up by a factor of 16, we also have to increase the number of bits in the involved masks by 4. We use this to also extend the window size by using the 2 most significant octets. Following the advice of the paper proposing FastCDC to spread out the ones roughly equally suggests 0x4444 as a suitable value for the two most significant octets. We also change the suggested extension of the remote-execution API accordingly. As the precise parameters for FastCDC when announced over the remote-execution APIs are still under discussion upstream, we simplify the name to not mention the target size. --- etc/patches/remote_execution.proto.diff | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'etc/patches') diff --git a/etc/patches/remote_execution.proto.diff b/etc/patches/remote_execution.proto.diff index 12e364e5..9a66debf 100644 --- a/etc/patches/remote_execution.proto.diff +++ b/etc/patches/remote_execution.proto.diff @@ -165,7 +165,7 @@ // [Capabilities.GetCapabilities][build.bazel.remote.execution.v2.Capabilities.GetCapabilities]. message GetCapabilitiesRequest { // The instance of the execution system to operate against. 
A server may -@@ -1723,6 +1874,36 @@ +@@ -1723,6 +1874,37 @@ } } @@ -189,20 +189,21 @@ + // the implementation can be found in algorithm 2 (FastCDC8KB) of + // https://ieeexplore.ieee.org/document/9055082. This algorithm has the + // following properties: -+ // - minimum chunk size: 2 KB -+ // - maximum chunk size: 64 KB -+ // - average chunk size: 8 KB ++ // - minimum chunk size: 32 KB ++ // - maximum chunk size: 1024 KB ++ // - average chunk size: 128 KB + // The 256 64-bit random numbers in the Gear table are created with the + // Mersenne Twister pseudo-random generator for 64-bit numbers with a state + // size of 19937 bits and a seed of 0. -+ FASTCDC_MT0_8KB = 2; ++ // The masks are extended by setting the 2 most significant bytes to 0x4444. ++ FASTCDC = 2; + } +} + // Capabilities of the remote cache system. message CacheCapabilities { // All the digest functions supported by the remote cache. -@@ -1751,6 +1932,25 @@ +@@ -1751,6 +1933,25 @@ // Note that this does not imply which if any compressors are supported by // the server at the gRPC level. repeated Compressor.Value supported_compressors = 6; -- cgit v1.2.3