summaryrefslogtreecommitdiff
path: root/src/buildtool/execution_api/execution_service/file_chunker.cpp
diff options
context:
space:
mode:
authorSascha Roloff <sascha.roloff@huawei.com>2023-10-31 20:48:32 +0100
committerSascha Roloff <sascha.roloff@huawei.com>2023-11-22 16:18:17 +0100
commit44a7c680289ba6812583746013f350d63942c894 (patch)
treeb6482b81c89768fcae95363080e0c9dc765b33be /src/buildtool/execution_api/execution_service/file_chunker.cpp
parent9dc43626cf863ecfee29ef36fe3637e52b876f85 (diff)
downloadjustbuild-44a7c680289ba6812583746013f350d63942c894.tar.gz
Implement blob splitting protocol on just server side
Diffstat (limited to 'src/buildtool/execution_api/execution_service/file_chunker.cpp')
-rw-r--r--src/buildtool/execution_api/execution_service/file_chunker.cpp118
1 files changed, 118 insertions, 0 deletions
diff --git a/src/buildtool/execution_api/execution_service/file_chunker.cpp b/src/buildtool/execution_api/execution_service/file_chunker.cpp
new file mode 100644
index 00000000..280b79a2
--- /dev/null
+++ b/src/buildtool/execution_api/execution_service/file_chunker.cpp
@@ -0,0 +1,118 @@
+// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buildtool/execution_api/execution_service/file_chunker.hpp"
+
+#include <array>
+#include <random>
+
+#include "gsl/gsl"
+
+namespace {
+
+// Mask values taken from algorithm 2 of the paper
+// https://ieeexplore.ieee.org/document/9055082.
+constexpr std::uint64_t kMaskS{0x0000d9f003530000ULL}; // 15 '1' bits
+constexpr std::uint64_t kMaskL{0x0000d90003530000ULL}; // 11 '1' bits
+
+// Predefined array of 256 random 64-bit integers, needs to be initialized.
+constexpr std::uint32_t kRandomTableSize{256};
+constexpr std::uint64_t kLowerBound{0x1000000000000000ULL};
+constexpr std::uint64_t kUpperBound{0xffffffffffffffffULL};
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+std::array<std::uint64_t, kRandomTableSize> gear_table{};
+
+} // namespace
+
+auto FileChunker::Initialize(std::uint32_t seed) noexcept -> void {
+ std::mt19937_64 gen64(seed);
+ std::uniform_int_distribution<std::uint64_t> dist(kLowerBound, kUpperBound);
+ for (auto& item : gear_table) {
+ item = dist(gen64);
+ }
+}
+
+auto FileChunker::IsOpen() const noexcept -> bool {
+ return stream_.is_open();
+}
+
+auto FileChunker::Finished() const noexcept -> bool {
+ return stream_.eof() && pos_ == size_;
+}
+
+auto FileChunker::NextChunk() noexcept -> std::optional<std::string> {
+ // Handle failed past read attempts from the stream.
+ if (not stream_.good() and not stream_.eof()) {
+ return std::nullopt;
+ }
+
+ // Ensure that at least max_chunk_size bytes are in the buffer, except if
+ // end-of-file is reached.
+ auto remaining = size_ - pos_;
+ if (remaining < max_chunk_size_ and not stream_.eof()) {
+ // Move the remaining bytes of the buffer to the front.
+ buffer_.copy(&buffer_[0], remaining, pos_);
+ auto ssize = static_cast<std::streamsize>(buffer_.size() - remaining);
+ // Fill the buffer with stream content.
+ stream_.read(&buffer_[remaining], ssize);
+ if (not stream_.good() and not stream_.eof()) {
+ return std::nullopt;
+ }
+ size_ = static_cast<std::size_t>(stream_.gcount()) + remaining;
+ pos_ = 0;
+ }
+
+ // Handle finished chunking.
+ if (pos_ == size_) {
+ return std::nullopt;
+ }
+
+ auto off = NextChunkBoundary();
+ auto chunk = buffer_.substr(pos_, off);
+ pos_ += off;
+ return chunk;
+}
+
+// Implementation of the FastCDC data deduplication algorithm described in
+// algorithm 2 of the paper https://ieeexplore.ieee.org/document/9055082.
+auto FileChunker::NextChunkBoundary() noexcept -> std::size_t {
+ auto n = size_ - pos_;
+ auto fp = 0ULL;
+ auto i = min_chunk_size_;
+ auto normal_size = average_chunk_size_;
+ if (n <= min_chunk_size_) {
+ return n;
+ }
+ if (n >= max_chunk_size_) {
+ n = max_chunk_size_;
+ }
+ else if (n <= normal_size) {
+ normal_size = n;
+ }
+ for (; i < normal_size; i++) {
+ fp = (fp << 1U) +
+ gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i]));
+ if ((fp & kMaskS) == 0) {
+ return i; // if the masked bits are all '0'
+ }
+ }
+ for (; i < n; i++) {
+ fp = (fp << 1U) +
+ gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i]));
+ if ((fp & kMaskL) == 0) {
+ return i; // if the masked bits are all '0'
+ }
+ }
+ return i;
+}