summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/buildtool/execution_api/execution_service/TARGETS12
-rw-r--r--src/buildtool/execution_api/execution_service/cas_server.cpp91
-rw-r--r--src/buildtool/execution_api/execution_service/cas_server.hpp28
-rw-r--r--src/buildtool/execution_api/execution_service/file_chunker.cpp118
-rw-r--r--src/buildtool/execution_api/execution_service/file_chunker.hpp97
-rw-r--r--src/buildtool/execution_api/execution_service/server_implementation.cpp2
6 files changed, 347 insertions, 1 deletions
diff --git a/src/buildtool/execution_api/execution_service/TARGETS b/src/buildtool/execution_api/execution_service/TARGETS
index 56aa04d5..d73c0387 100644
--- a/src/buildtool/execution_api/execution_service/TARGETS
+++ b/src/buildtool/execution_api/execution_service/TARGETS
@@ -50,13 +50,14 @@
[ ["src/buildtool/logging", "logging"]
, ["src/buildtool/common", "bazel_types"]
, ["src/buildtool/storage", "storage"]
+ , ["@", "gsl", "", "gsl"]
]
, "private-deps":
[ ["src/buildtool/compatibility", "compatibility"]
, ["@", "fmt", "", "fmt"]
, ["src/buildtool/storage", "storage"]
- , ["src/buildtool/execution_api/local", "local"]
, ["src/utils/cpp", "verify_hash"]
+ , "file_chunker"
]
}
, "server_implementation":
@@ -72,6 +73,7 @@
, "bytestream_server"
, "capabilities_server"
, "operations_server"
+ , "file_chunker"
, ["src/buildtool/execution_api/remote", "config"]
, ["src/buildtool/auth", "auth"]
, ["@", "json", "", "json"]
@@ -136,4 +138,12 @@
, "stage": ["src", "buildtool", "execution_api", "execution_service"]
, "private-deps": ["operation_cache", ["src/utils/cpp", "verify_hash"]]
}
+, "file_chunker":
+ { "type": ["@", "rules", "CC", "library"]
+ , "name": ["file_chunker"]
+ , "hdrs": ["file_chunker.hpp"]
+ , "srcs": ["file_chunker.cpp"]
+ , "stage": ["src", "buildtool", "execution_api", "execution_service"]
+ , "private-deps": [["@", "gsl", "", "gsl"]]
+ }
}
diff --git a/src/buildtool/execution_api/execution_service/cas_server.cpp b/src/buildtool/execution_api/execution_service/cas_server.cpp
index abdab5b1..87faee66 100644
--- a/src/buildtool/execution_api/execution_service/cas_server.cpp
+++ b/src/buildtool/execution_api/execution_service/cas_server.cpp
@@ -14,8 +14,15 @@
#include "src/buildtool/execution_api/execution_service/cas_server.hpp"
+#include <algorithm>
+#include <filesystem>
+#include <fstream>
+#include <sstream>
+#include <vector>
+
#include "fmt/core.h"
#include "src/buildtool/compatibility/native_support.hpp"
+#include "src/buildtool/execution_api/execution_service/file_chunker.hpp"
#include "src/buildtool/storage/garbage_collector.hpp"
#include "src/utils/cpp/verify_hash.hpp"
@@ -176,3 +183,87 @@ auto CASServiceImpl::GetTree(
logger_.Emit(LogLevel::Error, str);
return ::grpc::Status{grpc::StatusCode::UNIMPLEMENTED, str};
}
+
+auto CASServiceImpl::SplitBlob(::grpc::ServerContext* /*context*/,
+ const ::bazel_re::SplitBlobRequest* request,
+ ::bazel_re::SplitBlobResponse* response)
+ -> ::grpc::Status {
+ if (not request->has_blob_digest()) {
+ auto str = fmt::format("SplitBlob: no blob digest provided");
+ logger_.Emit(LogLevel::Error, str);
+ return ::grpc::Status{grpc::StatusCode::INVALID_ARGUMENT, str};
+ }
+
+ // Acquire garbage collection lock.
+ auto lock = GarbageCollector::SharedLock();
+ if (not lock) {
+ auto str =
+ fmt::format("SplitBlob: could not acquire garbage collection lock");
+ logger_.Emit(LogLevel::Error, str);
+ return ::grpc::Status{grpc::StatusCode::INTERNAL, str};
+ }
+
+ auto const& blob_digest = request->blob_digest();
+ logger_.Emit(LogLevel::Info, "SplitBlob({})", blob_digest.hash());
+
+ // Check blob existence.
+ auto path = std::optional<std::filesystem::path>{};
+ if (NativeSupport::IsTree(blob_digest.hash())) {
+ path = storage_->CAS().TreePath(blob_digest);
+ }
+ else {
+ path = storage_->CAS().BlobPath(blob_digest, true);
+ if (not path) {
+ path = storage_->CAS().BlobPath(blob_digest, false);
+ }
+ }
+ if (not path) {
+ auto str =
+ fmt::format("SplitBlob: blob not found {}", blob_digest.hash());
+ logger_.Emit(LogLevel::Error, str);
+ return ::grpc::Status{grpc::StatusCode::NOT_FOUND, str};
+ }
+
+ // Split blob into chunks, store each chunk in CAS, and collect chunk
+ // digests.
+ auto chunker = FileChunker{*path};
+ if (not chunker.IsOpen()) {
+ auto str = fmt::format("SplitBlob: could not open blob for reading");
+ logger_.Emit(LogLevel::Error, str);
+ return ::grpc::Status{grpc::StatusCode::INTERNAL, str};
+ }
+
+ auto chunk_digests = std::vector<bazel_re::Digest>{};
+ while (auto chunk = chunker.NextChunk()) {
+ auto chunk_digest = storage_->CAS().StoreBlob(*chunk, false);
+ if (not chunk_digest) {
+ auto str =
+ fmt::format("SplitBlob: could not store chunk of blob {}",
+ blob_digest.hash());
+ logger_.Emit(LogLevel::Error, str);
+ return ::grpc::Status{grpc::StatusCode::RESOURCE_EXHAUSTED, str};
+ }
+ chunk_digests.emplace_back(*chunk_digest);
+ }
+ if (not chunker.Finished()) {
+ auto str =
+ fmt::format("SplitBlob: could split blob {}", blob_digest.hash());
+ logger_.Emit(LogLevel::Error, str);
+ return ::grpc::Status{grpc::StatusCode::INTERNAL, str};
+ }
+ logger_.Emit(LogLevel::Debug, [&blob_digest, &chunk_digests]() {
+ std::stringstream ss{};
+ ss << "Split blob " << blob_digest.hash() << ":"
+ << blob_digest.size_bytes() << " into [ ";
+ for (auto const& chunk_digest : chunk_digests) {
+ ss << chunk_digest.hash() << ":" << chunk_digest.size_bytes()
+ << " ";
+ }
+ ss << "]";
+ return ss.str();
+ });
+ std::copy(chunk_digests.cbegin(),
+ chunk_digests.cend(),
+ pb::back_inserter(response->mutable_chunk_digests()));
+ return ::grpc::Status::OK;
+}
diff --git a/src/buildtool/execution_api/execution_service/cas_server.hpp b/src/buildtool/execution_api/execution_service/cas_server.hpp
index 520263b6..afea2658 100644
--- a/src/buildtool/execution_api/execution_service/cas_server.hpp
+++ b/src/buildtool/execution_api/execution_service/cas_server.hpp
@@ -14,7 +14,12 @@
#ifndef CAS_SERVER_HPP
#define CAS_SERVER_HPP
+
+#include <optional>
+#include <string>
+
#include "build/bazel/remote/execution/v2/remote_execution.grpc.pb.h"
+#include "gsl/gsl"
#include "src/buildtool/common/bazel_types.hpp"
#include "src/buildtool/logging/logger.hpp"
#include "src/buildtool/storage/storage.hpp"
@@ -113,6 +118,29 @@ class CASServiceImpl final
const ::bazel_re::GetTreeRequest* request,
::grpc::ServerWriter< ::bazel_re::GetTreeResponse>* writer)
-> ::grpc::Status override;
+ // Split a blob into chunks.
+ //
+ // Clients can use this API before downloading a blob to determine which
+ // parts of the blob are already present locally and do not need to be
+ // downloaded again.
+ //
+ // The blob is split into chunks which are individually stored in the CAS. A
+ // list of the chunk digests is returned in the order in which the chunks
+ // have to be concatenated to assemble the requested blob.
+ //
+ // Using this API is optional but it allows clients to download only the
+ // missing parts of a blob instead of the entire blob data, which in turn
+ // can considerably reduce network traffic.
+ //
+ // Errors:
+ //
+ // * `NOT_FOUND`: The requested blob is not present in the CAS.
+ // * `RESOURCE_EXHAUSTED`: There is insufficient disk quota to store the
+ // blob chunks.
+ auto SplitBlob(::grpc::ServerContext* context,
+ const ::bazel_re::SplitBlobRequest* request,
+ ::bazel_re::SplitBlobResponse* response)
+ -> ::grpc::Status override;
private:
[[nodiscard]] auto CheckDigestConsistency(std::string const& ref,
diff --git a/src/buildtool/execution_api/execution_service/file_chunker.cpp b/src/buildtool/execution_api/execution_service/file_chunker.cpp
new file mode 100644
index 00000000..280b79a2
--- /dev/null
+++ b/src/buildtool/execution_api/execution_service/file_chunker.cpp
@@ -0,0 +1,118 @@
+// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buildtool/execution_api/execution_service/file_chunker.hpp"
+
+#include <array>
+#include <random>
+
+#include "gsl/gsl"
+
+namespace {
+
+// Mask values taken from algorithm 2 of the paper
+// https://ieeexplore.ieee.org/document/9055082.
+constexpr std::uint64_t kMaskS{0x0000d9f003530000ULL}; // 15 '1' bits
+constexpr std::uint64_t kMaskL{0x0000d90003530000ULL}; // 11 '1' bits
+
+// Predefined array of 256 random 64-bit integers, needs to be initialized.
+constexpr std::uint32_t kRandomTableSize{256};
+constexpr std::uint64_t kLowerBound{0x1000000000000000ULL};
+constexpr std::uint64_t kUpperBound{0xffffffffffffffffULL};
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+std::array<std::uint64_t, kRandomTableSize> gear_table{};
+
+} // namespace
+
+auto FileChunker::Initialize(std::uint32_t seed) noexcept -> void {
+ std::mt19937_64 gen64(seed);
+ std::uniform_int_distribution<std::uint64_t> dist(kLowerBound, kUpperBound);
+ for (auto& item : gear_table) {
+ item = dist(gen64);
+ }
+}
+
+auto FileChunker::IsOpen() const noexcept -> bool {
+ return stream_.is_open();
+}
+
+auto FileChunker::Finished() const noexcept -> bool {
+ return stream_.eof() && pos_ == size_;
+}
+
+auto FileChunker::NextChunk() noexcept -> std::optional<std::string> {
+ // Handle failed past read attempts from the stream.
+ if (not stream_.good() and not stream_.eof()) {
+ return std::nullopt;
+ }
+
+ // Ensure that at least max_chunk_size bytes are in the buffer, except if
+ // end-of-file is reached.
+ auto remaining = size_ - pos_;
+ if (remaining < max_chunk_size_ and not stream_.eof()) {
+ // Move the remaining bytes of the buffer to the front.
+ buffer_.copy(&buffer_[0], remaining, pos_);
+ auto ssize = static_cast<std::streamsize>(buffer_.size() - remaining);
+ // Fill the buffer with stream content.
+ stream_.read(&buffer_[remaining], ssize);
+ if (not stream_.good() and not stream_.eof()) {
+ return std::nullopt;
+ }
+ size_ = static_cast<std::size_t>(stream_.gcount()) + remaining;
+ pos_ = 0;
+ }
+
+ // Handle finished chunking.
+ if (pos_ == size_) {
+ return std::nullopt;
+ }
+
+ auto off = NextChunkBoundary();
+ auto chunk = buffer_.substr(pos_, off);
+ pos_ += off;
+ return chunk;
+}
+
+// Implementation of the FastCDC data deduplication algorithm described in
+// algorithm 2 of the paper https://ieeexplore.ieee.org/document/9055082.
+auto FileChunker::NextChunkBoundary() noexcept -> std::size_t {
+ auto n = size_ - pos_;
+ auto fp = 0ULL;
+ auto i = min_chunk_size_;
+ auto normal_size = average_chunk_size_;
+ if (n <= min_chunk_size_) {
+ return n;
+ }
+ if (n >= max_chunk_size_) {
+ n = max_chunk_size_;
+ }
+ else if (n <= normal_size) {
+ normal_size = n;
+ }
+ for (; i < normal_size; i++) {
+ fp = (fp << 1U) +
+ gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i]));
+ if ((fp & kMaskS) == 0) {
+ return i; // if the masked bits are all '0'
+ }
+ }
+ for (; i < n; i++) {
+ fp = (fp << 1U) +
+ gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i]));
+ if ((fp & kMaskL) == 0) {
+ return i; // if the masked bits are all '0'
+ }
+ }
+ return i;
+}
diff --git a/src/buildtool/execution_api/execution_service/file_chunker.hpp b/src/buildtool/execution_api/execution_service/file_chunker.hpp
new file mode 100644
index 00000000..6cad28aa
--- /dev/null
+++ b/src/buildtool/execution_api/execution_service/file_chunker.hpp
@@ -0,0 +1,97 @@
+// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP
+#define INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP
+
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <optional>
+#include <string>
+
+/// @brief This class provides content-defined chunking for a file stream. It
+/// allows to split a file stream into variable-sized chunks based on its data
+/// content. In contrast to fixed-sized chunking, which splits a data stream
+/// into chunks of fixed size, it is not prone to the data-shifting problem. In
+/// order to assemble the resulting file, the delivered chunks have to be
+/// concatenated in order.
+///
+/// A read buffer is used to progressively process the file content instead of
+/// reading the entire file content in memory.
+class FileChunker {
+ static constexpr std::uint32_t kDefaultChunkSize{1024 * 8}; // 8 KB
+ static constexpr std::uint32_t kDefaultSeed{0};
+
+ public:
+ /// @brief Create an instance of the file chunker for a given file.
+ /// @param path The path to the file to be splitted.
+ /// @param average_chunk_size Targeted average chunk size in bytes
+ /// (default: 8 KB).
+ explicit FileChunker(std::filesystem::path const& path,
+ std::uint32_t average_chunk_size = kDefaultChunkSize)
+ // According to section 4.1 of the paper
+ // https://ieeexplore.ieee.org/document/9055082, maximum and minimum
+ // chunk sizes are configured to the 8x and the 1/4x of the average
+ // chunk size.
+ : min_chunk_size_(average_chunk_size >> 2U),
+ average_chunk_size_(average_chunk_size),
+ max_chunk_size_(average_chunk_size << 3U),
+ stream_{path, std::ios::in | std::ios::binary} {
+ // The buffer size needs to be at least max_chunk_size_ large, otherwise
+ // max_chunk_size_ is not fully exhausted and the buffer size determines
+ // the maximum chunk size.
+ buffer_.resize(max_chunk_size_ << 4U);
+ }
+
+ FileChunker() noexcept = delete;
+ ~FileChunker() noexcept = default;
+ FileChunker(FileChunker const& other) noexcept = delete;
+ FileChunker(FileChunker&& other) noexcept = delete;
+ auto operator=(FileChunker const& other) noexcept = delete;
+ auto operator=(FileChunker&& other) noexcept = delete;
+
+ /// @brief Check if the underlying file is open.
+ /// @return True if the file was opened successfully, false otherwise.
+ [[nodiscard]] auto IsOpen() const noexcept -> bool;
+
+ /// @brief Check if chunking of the file stream was done successfully.
+ /// @return True if chunking was successful, false otherwise.
+ [[nodiscard]] auto Finished() const noexcept -> bool;
+
+ /// @brief Fetch the next chunk from the file stream.
+ /// @return The next chunk of the file stream.
+ [[nodiscard]] auto NextChunk() noexcept -> std::optional<std::string>;
+
+ /// @brief Initialize random number table used by the chunking algorithm.
+ /// @param seed Some random seed.
+ static auto Initialize(std::uint32_t seed = kDefaultSeed) noexcept -> void;
+
+ private:
+ // Different chunk size parameters, defined in number of bytes.
+ const std::uint32_t min_chunk_size_{};
+ const std::uint32_t average_chunk_size_{};
+ const std::uint32_t max_chunk_size_{};
+ std::ifstream stream_{}; // File stream to be splitted.
+ std::string buffer_{}; // Buffer for the file content.
+ std::size_t size_{0}; // Current size of the buffer.
+ std::size_t pos_{0}; // Current read position within the buffer.
+
+ /// @brief Find the next chunk boundary from the current read position
+ /// within the buffer.
+ /// @return The position of the next chunk boundary.
+ [[nodiscard]] auto NextChunkBoundary() noexcept -> std::size_t;
+};
+
+#endif // INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP
diff --git a/src/buildtool/execution_api/execution_service/server_implementation.cpp b/src/buildtool/execution_api/execution_service/server_implementation.cpp
index 9804c53e..8c4e77f3 100644
--- a/src/buildtool/execution_api/execution_service/server_implementation.cpp
+++ b/src/buildtool/execution_api/execution_service/server_implementation.cpp
@@ -30,6 +30,7 @@
#include "src/buildtool/execution_api/execution_service/capabilities_server.hpp"
#include "src/buildtool/execution_api/execution_service/cas_server.hpp"
#include "src/buildtool/execution_api/execution_service/execution_server.hpp"
+#include "src/buildtool/execution_api/execution_service/file_chunker.hpp"
#include "src/buildtool/execution_api/execution_service/operations_server.hpp"
#include "src/buildtool/execution_api/remote/config.hpp"
#include "src/buildtool/logging/logger.hpp"
@@ -116,6 +117,7 @@ auto ServerImpl::Run() -> bool {
}
}
+ FileChunker::Initialize();
server->Wait();
return true;
}