summaryrefslogtreecommitdiff
path: root/src/buildtool/execution_api/execution_service
diff options
context:
space:
mode:
Diffstat (limited to 'src/buildtool/execution_api/execution_service')
-rw-r--r--src/buildtool/execution_api/execution_service/TARGETS12
-rw-r--r--src/buildtool/execution_api/execution_service/cas_utils.cpp2
-rw-r--r--src/buildtool/execution_api/execution_service/file_chunker.cpp115
-rw-r--r--src/buildtool/execution_api/execution_service/file_chunker.hpp98
4 files changed, 3 insertions, 224 deletions
diff --git a/src/buildtool/execution_api/execution_service/TARGETS b/src/buildtool/execution_api/execution_service/TARGETS
index 3e195136..632e8d0b 100644
--- a/src/buildtool/execution_api/execution_service/TARGETS
+++ b/src/buildtool/execution_api/execution_service/TARGETS
@@ -151,14 +151,6 @@
, ["src/utils/cpp", "verify_hash"]
]
}
-, "file_chunker":
- { "type": ["@", "rules", "CC", "library"]
- , "name": ["file_chunker"]
- , "hdrs": ["file_chunker.hpp"]
- , "srcs": ["file_chunker.cpp"]
- , "stage": ["src", "buildtool", "execution_api", "execution_service"]
- , "private-deps": [["@", "gsl", "", "gsl"]]
- }
, "cas_utils":
{ "type": ["@", "rules", "CC", "library"]
, "name": ["cas_utils"]
@@ -171,8 +163,7 @@
, ["src/buildtool/storage", "storage"]
]
, "private-deps":
- [ "file_chunker"
- , ["@", "fmt", "", "fmt"]
+ [ ["@", "fmt", "", "fmt"]
, ["src/buildtool/common", "common"]
, ["src/buildtool/compatibility", "compatibility"]
, ["src/buildtool/file_system", "git_repo"]
@@ -180,6 +171,7 @@
, ["src/buildtool/file_system", "file_system_manager"]
, ["src/buildtool/storage", "config"]
, ["src/utils/cpp", "hex_string"]
+ , ["src/buildtool/storage", "file_chunker"]
]
}
}
diff --git a/src/buildtool/execution_api/execution_service/cas_utils.cpp b/src/buildtool/execution_api/execution_service/cas_utils.cpp
index e2c4a418..9b7ef336 100644
--- a/src/buildtool/execution_api/execution_service/cas_utils.cpp
+++ b/src/buildtool/execution_api/execution_service/cas_utils.cpp
@@ -19,11 +19,11 @@
#include "fmt/core.h"
#include "src/buildtool/common/artifact_digest.hpp"
#include "src/buildtool/compatibility/native_support.hpp"
-#include "src/buildtool/execution_api/execution_service/file_chunker.hpp"
#include "src/buildtool/file_system/file_system_manager.hpp"
#include "src/buildtool/file_system/git_repo.hpp"
#include "src/buildtool/file_system/object_type.hpp"
#include "src/buildtool/storage/config.hpp"
+#include "src/buildtool/storage/file_chunker.hpp"
#include "src/utils/cpp/hex_string.hpp"
auto CASUtils::EnsureTreeInvariant(std::string const& data,
diff --git a/src/buildtool/execution_api/execution_service/file_chunker.cpp b/src/buildtool/execution_api/execution_service/file_chunker.cpp
deleted file mode 100644
index a4332110..00000000
--- a/src/buildtool/execution_api/execution_service/file_chunker.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "src/buildtool/execution_api/execution_service/file_chunker.hpp"
-
-#include <array>
-#include <random>
-
-#include "gsl/gsl"
-
-namespace {
-
-// Mask values taken from algorithm 2 of the paper
-// https://ieeexplore.ieee.org/document/9055082.
-constexpr std::uint64_t kMaskS{0x0000d9f003530000ULL}; // 15 '1' bits
-constexpr std::uint64_t kMaskL{0x0000d90003530000ULL}; // 11 '1' bits
-
-// Predefined array of 256 random 64-bit integers, needs to be initialized.
-constexpr std::uint32_t kRandomTableSize{256};
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-std::array<std::uint64_t, kRandomTableSize> gear_table{};
-
-} // namespace
-
-auto FileChunker::Initialize(std::uint32_t seed) noexcept -> void {
- std::mt19937_64 gen64(seed);
- for (auto& item : gear_table) {
- item = gen64();
- }
-}
-
-auto FileChunker::IsOpen() const noexcept -> bool {
- return stream_.is_open();
-}
-
-auto FileChunker::Finished() const noexcept -> bool {
- return stream_.eof() && pos_ == size_;
-}
-
-auto FileChunker::NextChunk() noexcept -> std::optional<std::string> {
- // Handle failed past read attempts from the stream.
- if (not stream_.good() and not stream_.eof()) {
- return std::nullopt;
- }
-
- // Ensure that at least max_chunk_size bytes are in the buffer, except if
- // end-of-file is reached.
- auto remaining = size_ - pos_;
- if (remaining < max_chunk_size_ and not stream_.eof()) {
- // Move the remaining bytes of the buffer to the front.
- buffer_.copy(&buffer_[0], remaining, pos_);
- auto ssize = static_cast<std::streamsize>(buffer_.size() - remaining);
- // Fill the buffer with stream content.
- stream_.read(&buffer_[remaining], ssize);
- if (not stream_.good() and not stream_.eof()) {
- return std::nullopt;
- }
- size_ = static_cast<std::size_t>(stream_.gcount()) + remaining;
- pos_ = 0;
- }
-
- // Handle finished chunking.
- if (pos_ == size_) {
- return std::nullopt;
- }
-
- auto off = NextChunkBoundary();
- auto chunk = buffer_.substr(pos_, off);
- pos_ += off;
- return chunk;
-}
-
-// Implementation of the FastCDC data deduplication algorithm described in
-// algorithm 2 of the paper https://ieeexplore.ieee.org/document/9055082.
-auto FileChunker::NextChunkBoundary() noexcept -> std::size_t {
- auto n = size_ - pos_;
- auto fp = 0ULL;
- auto i = min_chunk_size_;
- auto normal_size = average_chunk_size_;
- if (n <= min_chunk_size_) {
- return n;
- }
- if (n >= max_chunk_size_) {
- n = max_chunk_size_;
- }
- else if (n <= normal_size) {
- normal_size = n;
- }
- for (; i < normal_size; i++) {
- fp = (fp << 1U) +
- gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i]));
- if ((fp & kMaskS) == 0) {
- return i; // if the masked bits are all '0'
- }
- }
- for (; i < n; i++) {
- fp = (fp << 1U) +
- gsl::at(gear_table, static_cast<uint8_t>(buffer_[pos_ + i]));
- if ((fp & kMaskL) == 0) {
- return i; // if the masked bits are all '0'
- }
- }
- return i;
-}
diff --git a/src/buildtool/execution_api/execution_service/file_chunker.hpp b/src/buildtool/execution_api/execution_service/file_chunker.hpp
deleted file mode 100644
index 5c4e8e89..00000000
--- a/src/buildtool/execution_api/execution_service/file_chunker.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP
-#define INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP
-
-#include <cstddef>
-#include <cstdint>
-#include <filesystem>
-#include <fstream>
-#include <optional>
-#include <string>
-
-/// @brief This class provides content-defined chunking for a file stream. It
-/// allows to split a file stream into variable-sized chunks based on its data
-/// content. In contrast to fixed-sized chunking, which splits a data stream
-/// into chunks of fixed size, it is not prone to the data-shifting problem. In
-/// order to assemble the resulting file, the delivered chunks have to be
-/// concatenated in order.
-///
-/// A read buffer is used to progressively process the file content instead of
-/// reading the entire file content in memory.
-class FileChunker {
- static constexpr std::uint32_t kDefaultChunkSize{1024 * 8}; // 8 KB
- static constexpr std::uint32_t kDefaultSeed{0};
-
- public:
- /// @brief Create an instance of the file chunker for a given file.
- /// @param path The path to the file to be splitted.
- /// @param average_chunk_size Targeted average chunk size in bytes
- /// (default: 8 KB).
- explicit FileChunker(std::filesystem::path const& path,
- std::uint32_t average_chunk_size = kDefaultChunkSize)
- // According to section 4.1 of the paper
- // https://ieeexplore.ieee.org/document/9055082, maximum and minimum
- // chunk sizes are configured to the 8x and the 1/4x of the average
- // chunk size.
- : min_chunk_size_(average_chunk_size >> 2U),
- average_chunk_size_(average_chunk_size),
- max_chunk_size_(average_chunk_size << 3U),
- stream_{path, std::ios::in | std::ios::binary} {
- // The buffer size needs to be at least max_chunk_size_ large, otherwise
- // max_chunk_size_ is not fully exhausted and the buffer size determines
- // the maximum chunk size.
- buffer_.resize(max_chunk_size_ << 4U);
- }
-
- FileChunker() noexcept = delete;
- ~FileChunker() noexcept = default;
- FileChunker(FileChunker const& other) noexcept = delete;
- FileChunker(FileChunker&& other) noexcept = delete;
- auto operator=(FileChunker const& other) noexcept = delete;
- auto operator=(FileChunker&& other) noexcept = delete;
-
- /// @brief Check if the underlying file is open.
- /// @return True if the file was opened successfully, false otherwise.
- [[nodiscard]] auto IsOpen() const noexcept -> bool;
-
- /// @brief Check if chunking of the file stream was done successfully.
- /// @return True if chunking was successful, false otherwise.
- [[nodiscard]] auto Finished() const noexcept -> bool;
-
- /// @brief Fetch the next chunk from the file stream.
- /// @return The next chunk of the file stream.
- [[nodiscard]] auto NextChunk() noexcept -> std::optional<std::string>;
-
- /// @brief Initialize random number table used by the chunking algorithm.
- /// @param seed Some random seed.
- static auto Initialize(std::uint32_t seed = kDefaultSeed) noexcept -> void;
-
- private:
- // Different chunk size parameters, defined in number of bytes.
- const std::uint32_t min_chunk_size_{};
- const std::uint32_t average_chunk_size_{};
- const std::uint32_t max_chunk_size_{};
- std::ifstream stream_{}; // File stream to be splitted.
- std::string buffer_{}; // Buffer for the file content.
- std::size_t size_{0}; // Current size of the buffer.
- std::size_t pos_{0}; // Current read position within the buffer.
-
- /// @brief Find the next chunk boundary from the current read position
- /// within the buffer.
- /// @return The position of the next chunk boundary.
- [[nodiscard]] auto NextChunkBoundary() noexcept -> std::size_t;
-};
-
-#endif // INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP