diff options
author | Sascha Roloff <sascha.roloff@huawei.com> | 2023-10-31 20:48:32 +0100 |
---|---|---|
committer | Sascha Roloff <sascha.roloff@huawei.com> | 2023-11-22 16:18:17 +0100 |
commit | 44a7c680289ba6812583746013f350d63942c894 (patch) | |
tree | b6482b81c89768fcae95363080e0c9dc765b33be /src/buildtool/execution_api/execution_service/file_chunker.cpp | |
parent | 9dc43626cf863ecfee29ef36fe3637e52b876f85 (diff) | |
download | justbuild-44a7c680289ba6812583746013f350d63942c894.tar.gz |
Implement blob splitting protocol on just server side
Diffstat (limited to 'src/buildtool/execution_api/execution_service/file_chunker.cpp')
-rw-r--r-- | src/buildtool/execution_api/execution_service/file_chunker.cpp | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/src/buildtool/execution_api/execution_service/file_chunker.cpp b/src/buildtool/execution_api/execution_service/file_chunker.cpp new file mode 100644 index 00000000..280b79a2 --- /dev/null +++ b/src/buildtool/execution_api/execution_service/file_chunker.cpp @@ -0,0 +1,118 @@ +// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/buildtool/execution_api/execution_service/file_chunker.hpp" + +#include <array> +#include <random> + +#include "gsl/gsl" + +namespace { + +// Mask values taken from algorithm 2 of the paper +// https://ieeexplore.ieee.org/document/9055082. +constexpr std::uint64_t kMaskS{0x0000d9f003530000ULL}; // 15 '1' bits +constexpr std::uint64_t kMaskL{0x0000d90003530000ULL}; // 11 '1' bits + +// Predefined array of 256 random 64-bit integers, needs to be initialized. +constexpr std::uint32_t kRandomTableSize{256}; +constexpr std::uint64_t kLowerBound{0x1000000000000000ULL}; +constexpr std::uint64_t kUpperBound{0xffffffffffffffffULL}; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +std::array<std::uint64_t, kRandomTableSize> gear_table{}; + +} // namespace + +auto FileChunker::Initialize(std::uint32_t seed) noexcept -> void { + std::mt19937_64 gen64(seed); + std::uniform_int_distribution<std::uint64_t> dist(kLowerBound, kUpperBound); + for (auto& item : gear_table) { + item = dist(gen64); + } +} + +auto FileChunker::IsOpen() const noexcept -> bool { + return stream_.is_open(); +} + +auto FileChunker::Finished() const noexcept -> bool { + return stream_.eof() && pos_ == size_; +} + +auto FileChunker::NextChunk() noexcept -> std::optional<std::string> { + // Handle failed past read attempts from the stream. + if (not stream_.good() and not stream_.eof()) { + return std::nullopt; + } + + // Ensure that at least max_chunk_size bytes are in the buffer, except if + // end-of-file is reached. + auto remaining = size_ - pos_; + if (remaining < max_chunk_size_ and not stream_.eof()) { + // Move the remaining bytes of the buffer to the front. + buffer_.copy(&buffer_[0], remaining, pos_); + auto ssize = static_cast<std::streamsize>(buffer_.size() - remaining); + // Fill the buffer with stream content. + stream_.read(&buffer_[remaining], ssize); + if (not stream_.good() and not stream_.eof()) { + return std::nullopt; + } + size_ = static_cast<std::size_t>(stream_.gcount()) + remaining; + pos_ = 0; + } + + // Handle finished chunking. + if (pos_ == size_) { + return std::nullopt; + } + + auto off = NextChunkBoundary(); + auto chunk = buffer_.substr(pos_, off); + pos_ += off; + return chunk; +} + +// Implementation of the FastCDC data deduplication algorithm described in +// algorithm 2 of the paper https://ieeexplore.ieee.org/document/9055082. +auto FileChunker::NextChunkBoundary() noexcept -> std::size_t { + auto n = size_ - pos_; + auto fp = 0ULL; + auto i = min_chunk_size_; + auto normal_size = average_chunk_size_; + if (n <= min_chunk_size_) { + return n; + } + if (n >= max_chunk_size_) { + n = max_chunk_size_; + } + else if (n <= normal_size) { + normal_size = n; + } + for (; i < normal_size; i++) { + fp = (fp << 1U) + + gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i])); + if ((fp & kMaskS) == 0) { + return i; // if the masked bits are all '0' + } + } + for (; i < n; i++) { + fp = (fp << 1U) + + gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i])); + if ((fp & kMaskL) == 0) { + return i; // if the masked bits are all '0' + } + } + return i; +} |