diff options
author | Sascha Roloff <sascha.roloff@huawei.com> | 2023-10-31 20:48:32 +0100 |
---|---|---|
committer | Sascha Roloff <sascha.roloff@huawei.com> | 2023-11-22 16:18:17 +0100 |
commit | 44a7c680289ba6812583746013f350d63942c894 (patch) | |
tree | b6482b81c89768fcae95363080e0c9dc765b33be /src/buildtool/execution_api/execution_service | |
parent | 9dc43626cf863ecfee29ef36fe3637e52b876f85 (diff) | |
download | justbuild-44a7c680289ba6812583746013f350d63942c894.tar.gz |
Implement blob splitting protocol on just server side
Diffstat (limited to 'src/buildtool/execution_api/execution_service')
6 files changed, 347 insertions, 1 deletions
diff --git a/src/buildtool/execution_api/execution_service/TARGETS b/src/buildtool/execution_api/execution_service/TARGETS index 56aa04d5..d73c0387 100644 --- a/src/buildtool/execution_api/execution_service/TARGETS +++ b/src/buildtool/execution_api/execution_service/TARGETS @@ -50,13 +50,14 @@ [ ["src/buildtool/logging", "logging"] , ["src/buildtool/common", "bazel_types"] , ["src/buildtool/storage", "storage"] + , ["@", "gsl", "", "gsl"] ] , "private-deps": [ ["src/buildtool/compatibility", "compatibility"] , ["@", "fmt", "", "fmt"] , ["src/buildtool/storage", "storage"] - , ["src/buildtool/execution_api/local", "local"] , ["src/utils/cpp", "verify_hash"] + , "file_chunker" ] } , "server_implementation": @@ -72,6 +73,7 @@ , "bytestream_server" , "capabilities_server" , "operations_server" + , "file_chunker" , ["src/buildtool/execution_api/remote", "config"] , ["src/buildtool/auth", "auth"] , ["@", "json", "", "json"] @@ -136,4 +138,12 @@ , "stage": ["src", "buildtool", "execution_api", "execution_service"] , "private-deps": ["operation_cache", ["src/utils/cpp", "verify_hash"]] } +, "file_chunker": + { "type": ["@", "rules", "CC", "library"] + , "name": ["file_chunker"] + , "hdrs": ["file_chunker.hpp"] + , "srcs": ["file_chunker.cpp"] + , "stage": ["src", "buildtool", "execution_api", "execution_service"] + , "private-deps": [["@", "gsl", "", "gsl"]] + } } diff --git a/src/buildtool/execution_api/execution_service/cas_server.cpp b/src/buildtool/execution_api/execution_service/cas_server.cpp index abdab5b1..87faee66 100644 --- a/src/buildtool/execution_api/execution_service/cas_server.cpp +++ b/src/buildtool/execution_api/execution_service/cas_server.cpp @@ -14,8 +14,15 @@ #include "src/buildtool/execution_api/execution_service/cas_server.hpp" +#include <algorithm> +#include <filesystem> +#include <fstream> +#include <sstream> +#include <vector> + #include "fmt/core.h" #include "src/buildtool/compatibility/native_support.hpp" +#include "src/buildtool/execution_api/execution_service/file_chunker.hpp" #include "src/buildtool/storage/garbage_collector.hpp" #include "src/utils/cpp/verify_hash.hpp" @@ -176,3 +183,87 @@ auto CASServiceImpl::GetTree( logger_.Emit(LogLevel::Error, str); return ::grpc::Status{grpc::StatusCode::UNIMPLEMENTED, str}; } + +auto CASServiceImpl::SplitBlob(::grpc::ServerContext* /*context*/, + const ::bazel_re::SplitBlobRequest* request, + ::bazel_re::SplitBlobResponse* response) + -> ::grpc::Status { + if (not request->has_blob_digest()) { + auto str = fmt::format("SplitBlob: no blob digest provided"); + logger_.Emit(LogLevel::Error, str); + return ::grpc::Status{grpc::StatusCode::INVALID_ARGUMENT, str}; + } + + // Acquire garbage collection lock. + auto lock = GarbageCollector::SharedLock(); + if (not lock) { + auto str = + fmt::format("SplitBlob: could not acquire garbage collection lock"); + logger_.Emit(LogLevel::Error, str); + return ::grpc::Status{grpc::StatusCode::INTERNAL, str}; + } + + auto const& blob_digest = request->blob_digest(); + logger_.Emit(LogLevel::Info, "SplitBlob({})", blob_digest.hash()); + + // Check blob existence. + auto path = std::optional<std::filesystem::path>{}; + if (NativeSupport::IsTree(blob_digest.hash())) { + path = storage_->CAS().TreePath(blob_digest); + } + else { + path = storage_->CAS().BlobPath(blob_digest, true); + if (not path) { + path = storage_->CAS().BlobPath(blob_digest, false); + } + } + if (not path) { + auto str = + fmt::format("SplitBlob: blob not found {}", blob_digest.hash()); + logger_.Emit(LogLevel::Error, str); + return ::grpc::Status{grpc::StatusCode::NOT_FOUND, str}; + } + + // Split blob into chunks, store each chunk in CAS, and collect chunk + // digests. + auto chunker = FileChunker{*path}; + if (not chunker.IsOpen()) { + auto str = fmt::format("SplitBlob: could not open blob for reading"); + logger_.Emit(LogLevel::Error, str); + return ::grpc::Status{grpc::StatusCode::INTERNAL, str}; + } + + auto chunk_digests = std::vector<bazel_re::Digest>{}; + while (auto chunk = chunker.NextChunk()) { + auto chunk_digest = storage_->CAS().StoreBlob(*chunk, false); + if (not chunk_digest) { + auto str = + fmt::format("SplitBlob: could not store chunk of blob {}", + blob_digest.hash()); + logger_.Emit(LogLevel::Error, str); + return ::grpc::Status{grpc::StatusCode::RESOURCE_EXHAUSTED, str}; + } + chunk_digests.emplace_back(*chunk_digest); + } + if (not chunker.Finished()) { + auto str = + fmt::format("SplitBlob: could split blob {}", blob_digest.hash()); + logger_.Emit(LogLevel::Error, str); + return ::grpc::Status{grpc::StatusCode::INTERNAL, str}; + } + logger_.Emit(LogLevel::Debug, [&blob_digest, &chunk_digests]() { + std::stringstream ss{}; + ss << "Split blob " << blob_digest.hash() << ":" + << blob_digest.size_bytes() << " into [ "; + for (auto const& chunk_digest : chunk_digests) { + ss << chunk_digest.hash() << ":" << chunk_digest.size_bytes() + << " "; + } + ss << "]"; + return ss.str(); + }); + std::copy(chunk_digests.cbegin(), + chunk_digests.cend(), + pb::back_inserter(response->mutable_chunk_digests())); + return ::grpc::Status::OK; +} diff --git a/src/buildtool/execution_api/execution_service/cas_server.hpp b/src/buildtool/execution_api/execution_service/cas_server.hpp index 520263b6..afea2658 100644 --- a/src/buildtool/execution_api/execution_service/cas_server.hpp +++ b/src/buildtool/execution_api/execution_service/cas_server.hpp @@ -14,7 +14,12 @@ #ifndef CAS_SERVER_HPP #define CAS_SERVER_HPP + +#include <optional> +#include <string> + #include "build/bazel/remote/execution/v2/remote_execution.grpc.pb.h" +#include "gsl/gsl" #include "src/buildtool/common/bazel_types.hpp" #include "src/buildtool/logging/logger.hpp" #include "src/buildtool/storage/storage.hpp" @@ -113,6 +118,29 @@ class CASServiceImpl final const ::bazel_re::GetTreeRequest* request, ::grpc::ServerWriter< ::bazel_re::GetTreeResponse>* writer) -> ::grpc::Status override; + // Split a blob into chunks. + // + // Clients can use this API before downloading a blob to determine which + // parts of the blob are already present locally and do not need to be + // downloaded again. + // + // The blob is split into chunks which are individually stored in the CAS. A + // list of the chunk digests is returned in the order in which the chunks + // have to be concatenated to assemble the requested blob. + // + // Using this API is optional but it allows clients to download only the + // missing parts of a blob instead of the entire blob data, which in turn + // can considerably reduce network traffic. + // + // Errors: + // + // * `NOT_FOUND`: The requested blob is not present in the CAS. + // * `RESOURCE_EXHAUSTED`: There is insufficient disk quota to store the + // blob chunks. + auto SplitBlob(::grpc::ServerContext* context, + const ::bazel_re::SplitBlobRequest* request, + ::bazel_re::SplitBlobResponse* response) + -> ::grpc::Status override; private: [[nodiscard]] auto CheckDigestConsistency(std::string const& ref, diff --git a/src/buildtool/execution_api/execution_service/file_chunker.cpp b/src/buildtool/execution_api/execution_service/file_chunker.cpp new file mode 100644 index 00000000..280b79a2 --- /dev/null +++ b/src/buildtool/execution_api/execution_service/file_chunker.cpp @@ -0,0 +1,118 @@ +// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/buildtool/execution_api/execution_service/file_chunker.hpp" + +#include <array> +#include <random> + +#include "gsl/gsl" + +namespace { + +// Mask values taken from algorithm 2 of the paper +// https://ieeexplore.ieee.org/document/9055082. +constexpr std::uint64_t kMaskS{0x0000d9f003530000ULL}; // 15 '1' bits +constexpr std::uint64_t kMaskL{0x0000d90003530000ULL}; // 11 '1' bits + +// Predefined array of 256 random 64-bit integers, needs to be initialized. +constexpr std::uint32_t kRandomTableSize{256}; +constexpr std::uint64_t kLowerBound{0x1000000000000000ULL}; +constexpr std::uint64_t kUpperBound{0xffffffffffffffffULL}; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +std::array<std::uint64_t, kRandomTableSize> gear_table{}; + +} // namespace + +auto FileChunker::Initialize(std::uint32_t seed) noexcept -> void { + std::mt19937_64 gen64(seed); + std::uniform_int_distribution<std::uint64_t> dist(kLowerBound, kUpperBound); + for (auto& item : gear_table) { + item = dist(gen64); + } +} + +auto FileChunker::IsOpen() const noexcept -> bool { + return stream_.is_open(); +} + +auto FileChunker::Finished() const noexcept -> bool { + return stream_.eof() && pos_ == size_; +} + +auto FileChunker::NextChunk() noexcept -> std::optional<std::string> { + // Handle failed past read attempts from the stream. + if (not stream_.good() and not stream_.eof()) { + return std::nullopt; + } + + // Ensure that at least max_chunk_size bytes are in the buffer, except if + // end-of-file is reached. + auto remaining = size_ - pos_; + if (remaining < max_chunk_size_ and not stream_.eof()) { + // Move the remaining bytes of the buffer to the front. + buffer_.copy(&buffer_[0], remaining, pos_); + auto ssize = static_cast<std::streamsize>(buffer_.size() - remaining); + // Fill the buffer with stream content. + stream_.read(&buffer_[remaining], ssize); + if (not stream_.good() and not stream_.eof()) { + return std::nullopt; + } + size_ = static_cast<std::size_t>(stream_.gcount()) + remaining; + pos_ = 0; + } + + // Handle finished chunking. + if (pos_ == size_) { + return std::nullopt; + } + + auto off = NextChunkBoundary(); + auto chunk = buffer_.substr(pos_, off); + pos_ += off; + return chunk; +} + +// Implementation of the FastCDC data deduplication algorithm described in +// algorithm 2 of the paper https://ieeexplore.ieee.org/document/9055082. +auto FileChunker::NextChunkBoundary() noexcept -> std::size_t { + auto n = size_ - pos_; + auto fp = 0ULL; + auto i = min_chunk_size_; + auto normal_size = average_chunk_size_; + if (n <= min_chunk_size_) { + return n; + } + if (n >= max_chunk_size_) { + n = max_chunk_size_; + } + else if (n <= normal_size) { + normal_size = n; + } + for (; i < normal_size; i++) { + fp = (fp << 1U) + + gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i])); + if ((fp & kMaskS) == 0) { + return i; // if the masked bits are all '0' + } + } + for (; i < n; i++) { + fp = (fp << 1U) + + gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i])); + if ((fp & kMaskL) == 0) { + return i; // if the masked bits are all '0' + } + } + return i; +} diff --git a/src/buildtool/execution_api/execution_service/file_chunker.hpp b/src/buildtool/execution_api/execution_service/file_chunker.hpp new file mode 100644 index 00000000..6cad28aa --- /dev/null +++ b/src/buildtool/execution_api/execution_service/file_chunker.hpp @@ -0,0 +1,97 @@ +// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP +#define INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP + +#include <cstdint> +#include <filesystem> +#include <fstream> +#include <optional> +#include <string> + +/// @brief This class provides content-defined chunking for a file stream. It +/// allows to split a file stream into variable-sized chunks based on its data +/// content. In contrast to fixed-sized chunking, which splits a data stream +/// into chunks of fixed size, it is not prone to the data-shifting problem. In +/// order to assemble the resulting file, the delivered chunks have to be +/// concatenated in order. +/// +/// A read buffer is used to progressively process the file content instead of +/// reading the entire file content in memory. +class FileChunker { + static constexpr std::uint32_t kDefaultChunkSize{1024 * 8}; // 8 KB + static constexpr std::uint32_t kDefaultSeed{0}; + + public: + /// @brief Create an instance of the file chunker for a given file. + /// @param path The path to the file to be splitted. + /// @param average_chunk_size Targeted average chunk size in bytes + /// (default: 8 KB). + explicit FileChunker(std::filesystem::path const& path, + std::uint32_t average_chunk_size = kDefaultChunkSize) + // According to section 4.1 of the paper + // https://ieeexplore.ieee.org/document/9055082, maximum and minimum + // chunk sizes are configured to the 8x and the 1/4x of the average + // chunk size. + : min_chunk_size_(average_chunk_size >> 2U), + average_chunk_size_(average_chunk_size), + max_chunk_size_(average_chunk_size << 3U), + stream_{path, std::ios::in | std::ios::binary} { + // The buffer size needs to be at least max_chunk_size_ large, otherwise + // max_chunk_size_ is not fully exhausted and the buffer size determines + // the maximum chunk size. + buffer_.resize(max_chunk_size_ << 4U); + } + + FileChunker() noexcept = delete; + ~FileChunker() noexcept = default; + FileChunker(FileChunker const& other) noexcept = delete; + FileChunker(FileChunker&& other) noexcept = delete; + auto operator=(FileChunker const& other) noexcept = delete; + auto operator=(FileChunker&& other) noexcept = delete; + + /// @brief Check if the underlying file is open. + /// @return True if the file was opened successfully, false otherwise. + [[nodiscard]] auto IsOpen() const noexcept -> bool; + + /// @brief Check if chunking of the file stream was done successfully. + /// @return True if chunking was successful, false otherwise. + [[nodiscard]] auto Finished() const noexcept -> bool; + + /// @brief Fetch the next chunk from the file stream. + /// @return The next chunk of the file stream. + [[nodiscard]] auto NextChunk() noexcept -> std::optional<std::string>; + + /// @brief Initialize random number table used by the chunking algorithm. + /// @param seed Some random seed. + static auto Initialize(std::uint32_t seed = kDefaultSeed) noexcept -> void; + + private: + // Different chunk size parameters, defined in number of bytes. + const std::uint32_t min_chunk_size_{}; + const std::uint32_t average_chunk_size_{}; + const std::uint32_t max_chunk_size_{}; + std::ifstream stream_{}; // File stream to be splitted. + std::string buffer_{}; // Buffer for the file content. + std::size_t size_{0}; // Current size of the buffer. + std::size_t pos_{0}; // Current read position within the buffer. + + /// @brief Find the next chunk boundary from the current read position + /// within the buffer. + /// @return The position of the next chunk boundary. + [[nodiscard]] auto NextChunkBoundary() noexcept -> std::size_t; +}; + +#endif // INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP diff --git a/src/buildtool/execution_api/execution_service/server_implementation.cpp b/src/buildtool/execution_api/execution_service/server_implementation.cpp index 9804c53e..8c4e77f3 100644 --- a/src/buildtool/execution_api/execution_service/server_implementation.cpp +++ b/src/buildtool/execution_api/execution_service/server_implementation.cpp @@ -30,6 +30,7 @@ #include "src/buildtool/execution_api/execution_service/capabilities_server.hpp" #include "src/buildtool/execution_api/execution_service/cas_server.hpp" #include "src/buildtool/execution_api/execution_service/execution_server.hpp" +#include "src/buildtool/execution_api/execution_service/file_chunker.hpp" #include "src/buildtool/execution_api/execution_service/operations_server.hpp" #include "src/buildtool/execution_api/remote/config.hpp" #include "src/buildtool/logging/logger.hpp" @@ -116,6 +117,7 @@ auto ServerImpl::Run() -> bool { } } + FileChunker::Initialize(); server->Wait(); return true; } |