summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorSascha Roloff <sascha.roloff@huawei.com>2024-02-23 15:46:48 +0100
committerSascha Roloff <sascha.roloff@huawei.com>2024-02-26 17:16:21 +0100
commit25ef9672988f008e61193228756dcfed069bda57 (patch)
tree946dfd5472d4228832c019204304d004a097d936 /src
parent1debca0855d2e4ae8cf08498148831124b65bd9e (diff)
downloadjustbuild-25ef9672988f008e61193228756dcfed069bda57.tar.gz
Implement blob chunking algorithm negotiation
Diffstat (limited to 'src')
-rw-r--r--src/buildtool/execution_api/execution_service/capabilities_server.cpp3
-rw-r--r--src/buildtool/execution_api/execution_service/cas_server.cpp50
-rw-r--r--src/buildtool/execution_api/execution_service/cas_server.hpp45
-rw-r--r--src/buildtool/execution_api/execution_service/cas_utils.cpp43
-rw-r--r--src/buildtool/execution_api/execution_service/cas_utils.hpp10
-rw-r--r--src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp2
6 files changed, 137 insertions, 16 deletions
diff --git a/src/buildtool/execution_api/execution_service/capabilities_server.cpp b/src/buildtool/execution_api/execution_service/capabilities_server.cpp
index 35126bd3..22209524 100644
--- a/src/buildtool/execution_api/execution_service/capabilities_server.cpp
+++ b/src/buildtool/execution_api/execution_service/capabilities_server.cpp
@@ -37,6 +37,9 @@ auto CapabilitiesServiceImpl::GetCapabilities(
cache.set_max_batch_total_size_bytes(kMaxBatchTransferSize);
static_assert(kMaxBatchTransferSize < GRPC_DEFAULT_MAX_RECV_MESSAGE_LENGTH,
"Max batch transfer size too large.");
+ cache.add_supported_chunking_algorithms(
+ ::bazel_re::ChunkingAlgorithm_Value::
+ ChunkingAlgorithm_Value_FASTCDC_MT0_8KB);
*(response->mutable_cache_capabilities()) = cache;
exec.set_digest_function(
diff --git a/src/buildtool/execution_api/execution_service/cas_server.cpp b/src/buildtool/execution_api/execution_service/cas_server.cpp
index 1e56eb53..064308e5 100644
--- a/src/buildtool/execution_api/execution_service/cas_server.cpp
+++ b/src/buildtool/execution_api/execution_service/cas_server.cpp
@@ -38,6 +38,23 @@ static auto IsValidHash(std::string const& x) -> bool {
length == kGitSHA1Length);
}
+static auto ChunkingAlgorithmToString(::bazel_re::ChunkingAlgorithm_Value type)
+ -> std::string {
+ switch (type) {
+ case ::bazel_re::ChunkingAlgorithm_Value::
+ ChunkingAlgorithm_Value_IDENTITY:
+ return "IDENTITY";
+ case ::bazel_re::ChunkingAlgorithm_Value::
+ ChunkingAlgorithm_Value_RABINCDC_8KB:
+ return "RABINCDC_8KB";
+ case ::bazel_re::ChunkingAlgorithm_Value::
+ ChunkingAlgorithm_Value_FASTCDC_MT0_8KB:
+ return "FASTCDC_MT0_8KB";
+ default:
+ return "[Unknown Chunking Algorithm Type]";
+ }
+}
+
auto CASServiceImpl::FindMissingBlobs(
::grpc::ServerContext* /*context*/,
const ::bazel_re::FindMissingBlobsRequest* request,
@@ -224,6 +241,28 @@ auto CASServiceImpl::SplitBlob(::grpc::ServerContext* /*context*/,
return ::grpc::Status{grpc::StatusCode::INVALID_ARGUMENT, str};
}
+ auto chunking_algorithm = request->chunking_algorithm();
+ logger_.Emit(LogLevel::Debug,
+ "SplitBlob({}, {})",
+ blob_digest.hash(),
+ ChunkingAlgorithmToString(chunking_algorithm));
+
+ // Print warning if unsupported chunking algorithm was requested.
+ if (chunking_algorithm != ::bazel_re::ChunkingAlgorithm_Value::
+ ChunkingAlgorithm_Value_IDENTITY and
+ chunking_algorithm != ::bazel_re::ChunkingAlgorithm_Value::
+ ChunkingAlgorithm_Value_FASTCDC_MT0_8KB) {
+ logger_.Emit(
+ LogLevel::Warning,
+ fmt::format(
+ "SplitBlob: unsupported chunking algorithm {}, will use "
+ "default implementation {}",
+ ChunkingAlgorithmToString(chunking_algorithm),
+ ChunkingAlgorithmToString(
+ ::bazel_re::ChunkingAlgorithm_Value::
+ ChunkingAlgorithm_Value_FASTCDC_MT0_8KB)));
+ }
+
// Acquire garbage collection lock.
auto lock = GarbageCollector::SharedLock();
if (not lock) {
@@ -233,16 +272,20 @@ auto CASServiceImpl::SplitBlob(::grpc::ServerContext* /*context*/,
return ::grpc::Status{grpc::StatusCode::INTERNAL, str};
}
- logger_.Emit(LogLevel::Info, "SplitBlob({})", blob_digest.hash());
-
// Split blob into chunks.
- auto split_result = CASUtils::SplitBlob(blob_digest, *storage_);
+ auto split_result =
+ chunking_algorithm == ::bazel_re::ChunkingAlgorithm_Value::
+ ChunkingAlgorithm_Value_IDENTITY
+ ? CASUtils::SplitBlobIdentity(blob_digest, *storage_)
+ : CASUtils::SplitBlobFastCDC(blob_digest, *storage_);
+
if (std::holds_alternative<grpc::Status>(split_result)) {
auto status = std::get<grpc::Status>(split_result);
auto str = fmt::format("SplitBlob: {}", status.error_message());
logger_.Emit(LogLevel::Error, str);
return ::grpc::Status{status.error_code(), str};
}
+
auto chunk_digests = std::get<std::vector<bazel_re::Digest>>(split_result);
logger_.Emit(LogLevel::Debug, [&blob_digest, &chunk_digests]() {
std::stringstream ss{};
@@ -256,6 +299,7 @@ auto CASServiceImpl::SplitBlob(::grpc::ServerContext* /*context*/,
ss << "]";
return ss.str();
});
+
std::copy(chunk_digests.cbegin(),
chunk_digests.cend(),
pb::back_inserter(response->mutable_chunk_digests()));
diff --git a/src/buildtool/execution_api/execution_service/cas_server.hpp b/src/buildtool/execution_api/execution_service/cas_server.hpp
index 306c2833..fd77a03e 100644
--- a/src/buildtool/execution_api/execution_service/cas_server.hpp
+++ b/src/buildtool/execution_api/execution_service/cas_server.hpp
@@ -120,17 +120,44 @@ class CASServiceImpl final
-> ::grpc::Status override;
// Split a blob into chunks.
//
+ // This splitting API aims to reduce download traffic between client and
+ // server, e.g., if a client needs to fetch a large blob that just has been
+ // modified slightly since the last built. In this case, there is no need to
+ // fetch the entire blob data, but just the binary differences between the
+ // two blob versions, which are typically determined by deduplication
+ // techniques such as content-defined chunking.
+ //
// Clients can use this API before downloading a blob to determine which
// parts of the blob are already present locally and do not need to be
- // downloaded again.
- //
- // The blob is split into chunks which are individually stored in the CAS. A
- // list of the chunk digests is returned in the order in which the chunks
- // have to be concatenated to assemble the requested blob.
- //
- // Using this API is optional but it allows clients to download only the
- // missing parts of a blob instead of the entire blob data, which in turn
- // can considerably reduce network traffic.
+ // downloaded again. The server splits the blob into chunks according to a
+ // specified content-defined chunking algorithm and returns a list of the
+ // chunk digests in the order in which the chunks have to be concatenated to
+ // assemble the requested blob.
+ //
+ // A client can expect the following guarantees from the server if a split
+ // request is answered successfully:
+ // 1. The blob chunks are stored in CAS.
+ // 2. Concatenating the blob chunks in the order of the digest list
+ // returned by the server results in the original blob.
+ //
+ // The usage of this API is optional for clients but it allows them to
+ // download only the missing parts of a large blob instead of the entire
+ // blob data, which in turn can considerably reduce download network
+ // traffic.
+ //
+ // Since the generated chunks are stored as blobs, they underlie the same
+ // lifetimes as other blobs. However, their lifetime is extended if they are
+ // part of the result of a split blob request.
+ //
+ // For the client, it is recommended to verify whether the digest of the
+ // blob assembled by the fetched chunks results in the requested blob
+ // digest.
+ //
+ // If several clients use blob splitting, it is recommended that they
+ // request the same splitting algorithm to benefit from each others chunking
+ // data. In combination with blob splicing, an agreement about the chunking
+ // algorithm is recommended since both client as well as server side can
+ // benefit from each others chunking data.
//
// Errors:
//
diff --git a/src/buildtool/execution_api/execution_service/cas_utils.cpp b/src/buildtool/execution_api/execution_service/cas_utils.cpp
index 3da56e28..48bbd124 100644
--- a/src/buildtool/execution_api/execution_service/cas_utils.cpp
+++ b/src/buildtool/execution_api/execution_service/cas_utils.cpp
@@ -63,8 +63,47 @@ auto CASUtils::EnsureTreeInvariant(std::string const& data,
return std::nullopt;
}
-auto CASUtils::SplitBlob(bazel_re::Digest const& blob_digest,
- Storage const& storage) noexcept
+auto CASUtils::SplitBlobIdentity(bazel_re::Digest const& blob_digest,
+ Storage const& storage) noexcept
+ -> std::variant<std::vector<bazel_re::Digest>, grpc::Status> {
+
+ // Check blob existence.
+ auto path = NativeSupport::IsTree(blob_digest.hash())
+ ? storage.CAS().TreePath(blob_digest)
+ : storage.CAS().BlobPath(blob_digest, false);
+ if (not path) {
+ return grpc::Status{
+ grpc::StatusCode::NOT_FOUND,
+ fmt::format("blob not found {}", blob_digest.hash())};
+ }
+
+ // The split protocol states that each chunk that is returned by the
+ // operation is stored in (file) CAS. This means for the native mode, if we
+ // return the identity of a tree, we need to put the tree data in file CAS
+ // and return the resulting digest.
+ auto chunk_digests = std::vector<bazel_re::Digest>{};
+ if (NativeSupport::IsTree(blob_digest.hash())) {
+ auto tree_data = FileSystemManager::ReadFile(*path);
+ if (not tree_data) {
+ return grpc::Status{
+ grpc::StatusCode::INTERNAL,
+ fmt::format("could read tree data {}", blob_digest.hash())};
+ }
+ auto digest = storage.CAS().StoreBlob(*tree_data, false);
+ if (not digest) {
+ return grpc::Status{grpc::StatusCode::INTERNAL,
+ fmt::format("could not store tree as blob {}",
+ blob_digest.hash())};
+ }
+ chunk_digests.emplace_back(*digest);
+ return chunk_digests;
+ }
+ chunk_digests.emplace_back(blob_digest);
+ return chunk_digests;
+}
+
+auto CASUtils::SplitBlobFastCDC(bazel_re::Digest const& blob_digest,
+ Storage const& storage) noexcept
-> std::variant<std::vector<bazel_re::Digest>, grpc::Status> {
// Check blob existence.
diff --git a/src/buildtool/execution_api/execution_service/cas_utils.hpp b/src/buildtool/execution_api/execution_service/cas_utils.hpp
index b22260b8..c3155b17 100644
--- a/src/buildtool/execution_api/execution_service/cas_utils.hpp
+++ b/src/buildtool/execution_api/execution_service/cas_utils.hpp
@@ -31,8 +31,14 @@ class CASUtils {
std::string const& hash,
Storage const& storage) noexcept -> std::optional<std::string>;
- [[nodiscard]] static auto SplitBlob(bazel_re::Digest const& blob_digest,
- Storage const& storage) noexcept
+ [[nodiscard]] static auto SplitBlobIdentity(
+ bazel_re::Digest const& blob_digest,
+ Storage const& storage) noexcept
+ -> std::variant<std::vector<bazel_re::Digest>, grpc::Status>;
+
+ [[nodiscard]] static auto SplitBlobFastCDC(
+ bazel_re::Digest const& blob_digest,
+ Storage const& storage) noexcept
-> std::variant<std::vector<bazel_re::Digest>, grpc::Status>;
[[nodiscard]] static auto SpliceBlob(
diff --git a/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp b/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp
index 473dd5ba..e6830261 100644
--- a/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp
+++ b/src/buildtool/execution_api/remote/bazel/bazel_cas_client.cpp
@@ -320,6 +320,8 @@ auto BazelCasClient::SplitBlob(std::string const& instance_name,
bazel_re::SplitBlobRequest request{};
request.set_instance_name(instance_name);
request.mutable_blob_digest()->CopyFrom(blob_digest);
+ request.set_chunking_algorithm(bazel_re::ChunkingAlgorithm_Value::
+ ChunkingAlgorithm_Value_FASTCDC_MT0_8KB);
bazel_re::SplitBlobResponse response{};
auto [ok, status] = WithRetry(
[this, &response, &request]() {