diff options
author | Sascha Roloff <sascha.roloff@huawei.com> | 2024-02-23 15:46:48 +0100 |
---|---|---|
committer | Sascha Roloff <sascha.roloff@huawei.com> | 2024-02-26 17:16:21 +0100 |
commit | 25ef9672988f008e61193228756dcfed069bda57 (patch) | |
tree | 946dfd5472d4228832c019204304d004a097d936 /src | |
parent | 1debca0855d2e4ae8cf08498148831124b65bd9e (diff) | |
download | justbuild-25ef9672988f008e61193228756dcfed069bda57.tar.gz |
Implement blob chunking algorithm negotiation
Diffstat (limited to 'src')
6 files changed, 137 insertions, 16 deletions
diff --git a/src/buildtool/execution_api/execution_service/capabilities_server.cpp b/src/buildtool/execution_api/execution_service/capabilities_server.cpp index 35126bd3..22209524 100644 --- a/src/buildtool/execution_api/execution_service/capabilities_server.cpp +++ b/src/buildtool/execution_api/execution_service/capabilities_server.cpp @@ -37,6 +37,9 @@ auto CapabilitiesServiceImpl::GetCapabilities( cache.set_max_batch_total_size_bytes(kMaxBatchTransferSize); static_assert(kMaxBatchTransferSize < GRPC_DEFAULT_MAX_RECV_MESSAGE_LENGTH, "Max batch transfer size too large."); + cache.add_supported_chunking_algorithms( + ::bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_FASTCDC_MT0_8KB); *(response->mutable_cache_capabilities()) = cache; exec.set_digest_function( diff --git a/src/buildtool/execution_api/execution_service/cas_server.cpp b/src/buildtool/execution_api/execution_service/cas_server.cpp index 1e56eb53..064308e5 100644 --- a/src/buildtool/execution_api/execution_service/cas_server.cpp +++ b/src/buildtool/execution_api/execution_service/cas_server.cpp @@ -38,6 +38,23 @@ static auto IsValidHash(std::string const& x) -> bool { length == kGitSHA1Length); } +static auto ChunkingAlgorithmToString(::bazel_re::ChunkingAlgorithm_Value type) + -> std::string { + switch (type) { + case ::bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_IDENTITY: + return "IDENTITY"; + case ::bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_RABINCDC_8KB: + return "RABINCDC_8KB"; + case ::bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_FASTCDC_MT0_8KB: + return "FASTCDC_MT0_8KB"; + default: + return "[Unknown Chunking Algorithm Type]"; + } +} + auto CASServiceImpl::FindMissingBlobs( ::grpc::ServerContext* /*context*/, const ::bazel_re::FindMissingBlobsRequest* request, @@ -224,6 +241,28 @@ auto CASServiceImpl::SplitBlob(::grpc::ServerContext* /*context*/, return ::grpc::Status{grpc::StatusCode::INVALID_ARGUMENT, str}; } + auto chunking_algorithm = request->chunking_algorithm(); + logger_.Emit(LogLevel::Debug, + "SplitBlob({}, {})", + blob_digest.hash(), + ChunkingAlgorithmToString(chunking_algorithm)); + + // Print warning if unsupported chunking algorithm was requested. + if (chunking_algorithm != ::bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_IDENTITY and + chunking_algorithm != ::bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_FASTCDC_MT0_8KB) { + logger_.Emit( + LogLevel::Warning, + fmt::format( + "SplitBlob: unsupported chunking algorithm {}, will use " + "default implementation {}", + ChunkingAlgorithmToString(chunking_algorithm), + ChunkingAlgorithmToString( + ::bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_FASTCDC_MT0_8KB))); + } + // Acquire garbage collection lock. auto lock = GarbageCollector::SharedLock(); if (not lock) { @@ -233,16 +272,20 @@ auto CASServiceImpl::SplitBlob(::grpc::ServerContext* /*context*/, return ::grpc::Status{grpc::StatusCode::INTERNAL, str}; } - logger_.Emit(LogLevel::Info, "SplitBlob({})", blob_digest.hash()); - // Split blob into chunks. - auto split_result = CASUtils::SplitBlob(blob_digest, *storage_); + auto split_result = + chunking_algorithm == ::bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_IDENTITY + ? CASUtils::SplitBlobIdentity(blob_digest, *storage_) + : CASUtils::SplitBlobFastCDC(blob_digest, *storage_); + if (std::holds_alternative<grpc::Status>(split_result)) { auto status = std::get<grpc::Status>(split_result); auto str = fmt::format("SplitBlob: {}", status.error_message()); logger_.Emit(LogLevel::Error, str); return ::grpc::Status{status.error_code(), str}; } + auto chunk_digests = std::get<std::vector<bazel_re::Digest>>(split_result); logger_.Emit(LogLevel::Debug, [&blob_digest, &chunk_digests]() { std::stringstream ss{}; @@ -256,6 +299,7 @@ auto CASServiceImpl::SplitBlob(::grpc::ServerContext* /*context*/, ss << "]"; return ss.str(); }); + std::copy(chunk_digests.cbegin(), chunk_digests.cend(), pb::back_inserter(response->mutable_chunk_digests())); diff --git a/src/buildtool/execution_api/execution_service/cas_server.hpp b/src/buildtool/execution_api/execution_service/cas_server.hpp index 306c2833..fd77a03e 100644 --- a/src/buildtool/execution_api/execution_service/cas_server.hpp +++ b/src/buildtool/execution_api/execution_service/cas_server.hpp @@ -120,17 +120,44 @@ class CASServiceImpl final -> ::grpc::Status override; // Split a blob into chunks. // + // This splitting API aims to reduce download traffic between client and + // server, e.g., if a client needs to fetch a large blob that just has been + // modified slightly since the last built. In this case, there is no need to + // fetch the entire blob data, but just the binary differences between the + // two blob versions, which are typically determined by deduplication + // techniques such as content-defined chunking. + // // Clients can use this API before downloading a blob to determine which // parts of the blob are already present locally and do not need to be - // downloaded again. - // - // The blob is split into chunks which are individually stored in the CAS. A - // list of the chunk digests is returned in the order in which the chunks - // have to be concatenated to assemble the requested blob. - // - // Using this API is optional but it allows clients to download only the - // missing parts of a blob instead of the entire blob data, which in turn - // can considerably reduce network traffic. + // downloaded again. The server splits the blob into chunks according to a + // specified content-defined chunking algorithm and returns a list of the + // chunk digests in the order in which the chunks have to be concatenated to + // assemble the requested blob. + // + // A client can expect the following guarantees from the server if a split + // request is answered successfully: + // 1. The blob chunks are stored in CAS. + // 2. Concatenating the blob chunks in the order of the digest list + // returned by the server results in the original blob. + // + // The usage of this API is optional for clients but it allows them to + // download only the missing parts of a large blob instead of the entire + // blob data, which in turn can considerably reduce download network + // traffic. + // + // Since the generated chunks are stored as blobs, they underlie the same + // lifetimes as other blobs. However, their lifetime is extended if they are + // part of the result of a split blob request. + // + // For the client, it is recommended to verify whether the digest of the + // blob assembled by the fetched chunks results in the requested blob + // digest. + // + // If several clients use blob splitting, it is recommended that they + // request the same splitting algorithm to benefit from each others chunking + // data. In combination with blob splicing, an agreement about the chunking + // algorithm is recommended since both client as well as server side can + // benefit from each others chunking data. // // Errors: // diff --git a/src/buildtool/execution_api/execution_service/cas_utils.cpp b/src/buildtool/execution_api/execution_service/cas_utils.cpp index 3da56e28..48bbd124 100644 --- a/src/buildtool/execution_api/execution_service/cas_utils.cpp +++ b/src/buildtool/execution_api/execution_service/cas_utils.cpp @@ -63,8 +63,47 @@ auto CASUtils::EnsureTreeInvariant(std::string const& data, return std::nullopt; } -auto CASUtils::SplitBlob(bazel_re::Digest const& blob_digest, - Storage const& storage) noexcept +auto CASUtils::SplitBlobIdentity(bazel_re::Digest const& blob_digest, + Storage const& storage) noexcept + -> std::variant<std::vector<bazel_re::Digest>, grpc::Status> { + + // Check blob existence. + auto path = NativeSupport::IsTree(blob_digest.hash()) + ? storage.CAS().TreePath(blob_digest) + : storage.CAS().BlobPath(blob_digest, false); + if (not path) { + return grpc::Status{ + grpc::StatusCode::NOT_FOUND, + fmt::format("blob not found {}", blob_digest.hash())}; + } + + // The split protocol states that each chunk that is returned by the + // operation is stored in (file) CAS. This means for the native mode, if we + // return the identity of a tree, we need to put the tree data in file CAS + // and return the resulting digest. + auto chunk_digests = std::vector<bazel_re::Digest>{}; + if (NativeSupport::IsTree(blob_digest.hash())) { + auto tree_data = FileSystemManager::ReadFile(*path); + if (not tree_data) { + return grpc::Status{ + grpc::StatusCode::INTERNAL, + fmt::format("could read tree data {}", blob_digest.hash())}; + } + auto digest = storage.CAS().StoreBlob(*tree_data, false); + if (not digest) { + return grpc::Status{grpc::StatusCode::INTERNAL, + fmt::format("could not store tree as blob {}", + blob_digest.hash())}; + } + chunk_digests.emplace_back(*digest); + return chunk_digests; + } + chunk_digests.emplace_back(blob_digest); + return chunk_digests; +} + +auto CASUtils::SplitBlobFastCDC(bazel_re::Digest const& blob_digest, + Storage const& storage) noexcept -> std::variant<std::vector<bazel_re::Digest>, grpc::Status> { // Check blob existence. diff --git a/src/buildtool/execution_api/execution_service/cas_utils.hpp b/src/buildtool/execution_api/execution_service/cas_utils.hpp index b22260b8..c3155b17 100644 --- a/src/buildtool/execution_api/execution_service/cas_utils.hpp +++ b/src/buildtool/execution_api/execution_service/cas_utils.hpp @@ -31,8 +31,14 @@ class CASUtils { std::string const& hash, Storage const& storage) noexcept -> std::optional<std::string>; - [[nodiscard]] static auto SplitBlob(bazel_re::Digest const& blob_digest, - Storage const& storage) noexcept + [[nodiscard]] static auto SplitBlobIdentity( + bazel_re::Digest const& blob_digest, + Storage const& storage) noexcept + -> std::variant<std::vector<bazel_re::Digest>, grpc::Status>; + + [[nodiscard]] static auto SplitBlobFastCDC( + bazel_re::Digest const& blob_digest, + Storage const& storage) noexcept -> std::variant<std::vector<bazel_re::Digest>, grpc::Status>; [[nodiscard]] static auto SpliceBlob( diff --git a/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp b/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp index 473dd5ba..e6830261 100644 --- a/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp +++ b/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp @@ -320,6 +320,8 @@ auto BazelCasClient::SplitBlob(std::string const& instance_name, bazel_re::SplitBlobRequest request{}; request.set_instance_name(instance_name); request.mutable_blob_digest()->CopyFrom(blob_digest); + request.set_chunking_algorithm(bazel_re::ChunkingAlgorithm_Value:: + ChunkingAlgorithm_Value_FASTCDC_MT0_8KB); bazel_re::SplitBlobResponse response{}; auto [ok, status] = WithRetry( [this, &response, &request]() { |