From fbd7eb02efc6a541a79360490e940bad4387c12c Mon Sep 17 00:00:00 2001 From: Maksim Denisov Date: Thu, 14 Mar 2024 14:50:38 +0100 Subject: Move file chunker to storage. --- .../execution_api/execution_service/TARGETS | 12 +-- .../execution_api/execution_service/cas_utils.cpp | 2 +- .../execution_service/file_chunker.cpp | 115 --------------------- .../execution_service/file_chunker.hpp | 98 ------------------ src/buildtool/main/TARGETS | 2 +- src/buildtool/main/main.cpp | 2 +- src/buildtool/storage/TARGETS | 8 ++ src/buildtool/storage/file_chunker.cpp | 115 +++++++++++++++++++++ src/buildtool/storage/file_chunker.hpp | 98 ++++++++++++++++++ 9 files changed, 226 insertions(+), 226 deletions(-) delete mode 100644 src/buildtool/execution_api/execution_service/file_chunker.cpp delete mode 100644 src/buildtool/execution_api/execution_service/file_chunker.hpp create mode 100644 src/buildtool/storage/file_chunker.cpp create mode 100644 src/buildtool/storage/file_chunker.hpp (limited to 'src') diff --git a/src/buildtool/execution_api/execution_service/TARGETS b/src/buildtool/execution_api/execution_service/TARGETS index 3e195136..632e8d0b 100644 --- a/src/buildtool/execution_api/execution_service/TARGETS +++ b/src/buildtool/execution_api/execution_service/TARGETS @@ -151,14 +151,6 @@ , ["src/utils/cpp", "verify_hash"] ] } -, "file_chunker": - { "type": ["@", "rules", "CC", "library"] - , "name": ["file_chunker"] - , "hdrs": ["file_chunker.hpp"] - , "srcs": ["file_chunker.cpp"] - , "stage": ["src", "buildtool", "execution_api", "execution_service"] - , "private-deps": [["@", "gsl", "", "gsl"]] - } , "cas_utils": { "type": ["@", "rules", "CC", "library"] , "name": ["cas_utils"] @@ -171,8 +163,7 @@ , ["src/buildtool/storage", "storage"] ] , "private-deps": - [ "file_chunker" - , ["@", "fmt", "", "fmt"] + [ ["@", "fmt", "", "fmt"] , ["src/buildtool/common", "common"] , ["src/buildtool/compatibility", "compatibility"] , ["src/buildtool/file_system", "git_repo"] @@ -180,6 +171,7 @@ , ["src/buildtool/file_system", "file_system_manager"] , ["src/buildtool/storage", "config"] , ["src/utils/cpp", "hex_string"] + , ["src/buildtool/storage", "file_chunker"] ] } } diff --git a/src/buildtool/execution_api/execution_service/cas_utils.cpp b/src/buildtool/execution_api/execution_service/cas_utils.cpp index e2c4a418..9b7ef336 100644 --- a/src/buildtool/execution_api/execution_service/cas_utils.cpp +++ b/src/buildtool/execution_api/execution_service/cas_utils.cpp @@ -19,11 +19,11 @@ #include "fmt/core.h" #include "src/buildtool/common/artifact_digest.hpp" #include "src/buildtool/compatibility/native_support.hpp" -#include "src/buildtool/execution_api/execution_service/file_chunker.hpp" #include "src/buildtool/file_system/file_system_manager.hpp" #include "src/buildtool/file_system/git_repo.hpp" #include "src/buildtool/file_system/object_type.hpp" #include "src/buildtool/storage/config.hpp" +#include "src/buildtool/storage/file_chunker.hpp" #include "src/utils/cpp/hex_string.hpp" auto CASUtils::EnsureTreeInvariant(std::string const& data, diff --git a/src/buildtool/execution_api/execution_service/file_chunker.cpp b/src/buildtool/execution_api/execution_service/file_chunker.cpp deleted file mode 100644 index a4332110..00000000 --- a/src/buildtool/execution_api/execution_service/file_chunker.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "src/buildtool/execution_api/execution_service/file_chunker.hpp" - -#include -#include - -#include "gsl/gsl" - -namespace { - -// Mask values taken from algorithm 2 of the paper -// https://ieeexplore.ieee.org/document/9055082. -constexpr std::uint64_t kMaskS{0x0000d9f003530000ULL}; // 15 '1' bits -constexpr std::uint64_t kMaskL{0x0000d90003530000ULL}; // 11 '1' bits - -// Predefined array of 256 random 64-bit integers, needs to be initialized. -constexpr std::uint32_t kRandomTableSize{256}; -// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -std::array gear_table{}; - -} // namespace - -auto FileChunker::Initialize(std::uint32_t seed) noexcept -> void { - std::mt19937_64 gen64(seed); - for (auto& item : gear_table) { - item = gen64(); - } -} - -auto FileChunker::IsOpen() const noexcept -> bool { - return stream_.is_open(); -} - -auto FileChunker::Finished() const noexcept -> bool { - return stream_.eof() && pos_ == size_; -} - -auto FileChunker::NextChunk() noexcept -> std::optional { - // Handle failed past read attempts from the stream. - if (not stream_.good() and not stream_.eof()) { - return std::nullopt; - } - - // Ensure that at least max_chunk_size bytes are in the buffer, except if - // end-of-file is reached. - auto remaining = size_ - pos_; - if (remaining < max_chunk_size_ and not stream_.eof()) { - // Move the remaining bytes of the buffer to the front. - buffer_.copy(&buffer_[0], remaining, pos_); - auto ssize = static_cast(buffer_.size() - remaining); - // Fill the buffer with stream content. - stream_.read(&buffer_[remaining], ssize); - if (not stream_.good() and not stream_.eof()) { - return std::nullopt; - } - size_ = static_cast(stream_.gcount()) + remaining; - pos_ = 0; - } - - // Handle finished chunking. - if (pos_ == size_) { - return std::nullopt; - } - - auto off = NextChunkBoundary(); - auto chunk = buffer_.substr(pos_, off); - pos_ += off; - return chunk; -} - -// Implementation of the FastCDC data deduplication algorithm described in -// algorithm 2 of the paper https://ieeexplore.ieee.org/document/9055082. -auto FileChunker::NextChunkBoundary() noexcept -> std::size_t { - auto n = size_ - pos_; - auto fp = 0ULL; - auto i = min_chunk_size_; - auto normal_size = average_chunk_size_; - if (n <= min_chunk_size_) { - return n; - } - if (n >= max_chunk_size_) { - n = max_chunk_size_; - } - else if (n <= normal_size) { - normal_size = n; - } - for (; i < normal_size; i++) { - fp = (fp << 1U) + - gsl::at(gear_table, static_cast(buffer_[pos_ + i])); - if ((fp & kMaskS) == 0) { - return i; // if the masked bits are all '0' - } - } - for (; i < n; i++) { - fp = (fp << 1U) + - gsl::at(gear_table, static_cast(buffer_[pos_ + i])); - if ((fp & kMaskL) == 0) { - return i; // if the masked bits are all '0' - } - } - return i; -} diff --git a/src/buildtool/execution_api/execution_service/file_chunker.hpp b/src/buildtool/execution_api/execution_service/file_chunker.hpp deleted file mode 100644 index 5c4e8e89..00000000 --- a/src/buildtool/execution_api/execution_service/file_chunker.hpp +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP -#define INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP - -#include -#include -#include -#include -#include -#include - -/// @brief This class provides content-defined chunking for a file stream. It -/// allows to split a file stream into variable-sized chunks based on its data -/// content. In contrast to fixed-sized chunking, which splits a data stream -/// into chunks of fixed size, it is not prone to the data-shifting problem. In -/// order to assemble the resulting file, the delivered chunks have to be -/// concatenated in order. -/// -/// A read buffer is used to progressively process the file content instead of -/// reading the entire file content in memory. -class FileChunker { - static constexpr std::uint32_t kDefaultChunkSize{1024 * 8}; // 8 KB - static constexpr std::uint32_t kDefaultSeed{0}; - - public: - /// @brief Create an instance of the file chunker for a given file. - /// @param path The path to the file to be splitted. - /// @param average_chunk_size Targeted average chunk size in bytes - /// (default: 8 KB). - explicit FileChunker(std::filesystem::path const& path, - std::uint32_t average_chunk_size = kDefaultChunkSize) - // According to section 4.1 of the paper - // https://ieeexplore.ieee.org/document/9055082, maximum and minimum - // chunk sizes are configured to the 8x and the 1/4x of the average - // chunk size. - : min_chunk_size_(average_chunk_size >> 2U), - average_chunk_size_(average_chunk_size), - max_chunk_size_(average_chunk_size << 3U), - stream_{path, std::ios::in | std::ios::binary} { - // The buffer size needs to be at least max_chunk_size_ large, otherwise - // max_chunk_size_ is not fully exhausted and the buffer size determines - // the maximum chunk size. - buffer_.resize(max_chunk_size_ << 4U); - } - - FileChunker() noexcept = delete; - ~FileChunker() noexcept = default; - FileChunker(FileChunker const& other) noexcept = delete; - FileChunker(FileChunker&& other) noexcept = delete; - auto operator=(FileChunker const& other) noexcept = delete; - auto operator=(FileChunker&& other) noexcept = delete; - - /// @brief Check if the underlying file is open. - /// @return True if the file was opened successfully, false otherwise. - [[nodiscard]] auto IsOpen() const noexcept -> bool; - - /// @brief Check if chunking of the file stream was done successfully. - /// @return True if chunking was successful, false otherwise. - [[nodiscard]] auto Finished() const noexcept -> bool; - - /// @brief Fetch the next chunk from the file stream. - /// @return The next chunk of the file stream. - [[nodiscard]] auto NextChunk() noexcept -> std::optional; - - /// @brief Initialize random number table used by the chunking algorithm. - /// @param seed Some random seed. - static auto Initialize(std::uint32_t seed = kDefaultSeed) noexcept -> void; - - private: - // Different chunk size parameters, defined in number of bytes. - const std::uint32_t min_chunk_size_{}; - const std::uint32_t average_chunk_size_{}; - const std::uint32_t max_chunk_size_{}; - std::ifstream stream_{}; // File stream to be splitted. - std::string buffer_{}; // Buffer for the file content. - std::size_t size_{0}; // Current size of the buffer. - std::size_t pos_{0}; // Current read position within the buffer. - - /// @brief Find the next chunk boundary from the current read position - /// within the buffer. - /// @return The position of the next chunk boundary. - [[nodiscard]] auto NextChunkBoundary() noexcept -> std::size_t; -}; - -#endif // INCLUDED_SRC_EXECUTION_API_EXECUTION_SERVICE_FILE_CHUNKER_HPP diff --git a/src/buildtool/main/TARGETS b/src/buildtool/main/TARGETS index b15ef25a..602e31b8 100644 --- a/src/buildtool/main/TARGETS +++ b/src/buildtool/main/TARGETS @@ -32,7 +32,7 @@ , ["src/buildtool/serve_api/remote", "config"] , ["src/buildtool/serve_api/serve_service", "serve_server_implementation"] , ["src/buildtool/common/remote", "retry_parameters"] - , ["src/buildtool/execution_api/execution_service", "file_chunker"] + , ["src/buildtool/storage", "file_chunker"] , "common" , "cli" , "version" diff --git a/src/buildtool/main/main.cpp b/src/buildtool/main/main.cpp index 895fd517..18beff58 100644 --- a/src/buildtool/main/main.cpp +++ b/src/buildtool/main/main.cpp @@ -33,7 +33,6 @@ #include "src/buildtool/common/repository_config.hpp" #include "src/buildtool/common/statistics.hpp" #include "src/buildtool/compatibility/compatibility.hpp" -#include "src/buildtool/execution_api/execution_service/file_chunker.hpp" #include "src/buildtool/execution_api/local/config.hpp" #include "src/buildtool/file_system/file_root.hpp" #include "src/buildtool/logging/log_config.hpp" @@ -55,6 +54,7 @@ #include "src/buildtool/multithreading/task_system.hpp" #include "src/buildtool/progress_reporting/progress.hpp" #include "src/buildtool/storage/config.hpp" +#include "src/buildtool/storage/file_chunker.hpp" #include "src/buildtool/storage/garbage_collector.hpp" #include "src/buildtool/storage/storage.hpp" #include "src/buildtool/storage/target_cache.hpp" diff --git a/src/buildtool/storage/TARGETS b/src/buildtool/storage/TARGETS index a3e06628..38180055 100644 --- a/src/buildtool/storage/TARGETS +++ b/src/buildtool/storage/TARGETS @@ -81,4 +81,12 @@ , "storage" ] } +, "file_chunker": + { "type": ["@", "rules", "CC", "library"] + , "name": ["file_chunker"] + , "hdrs": ["file_chunker.hpp"] + , "srcs": ["file_chunker.cpp"] + , "stage": ["src", "buildtool", "storage"] + , "private-deps": [["@", "gsl", "", "gsl"]] + } } diff --git a/src/buildtool/storage/file_chunker.cpp b/src/buildtool/storage/file_chunker.cpp new file mode 100644 index 00000000..af318501 --- /dev/null +++ b/src/buildtool/storage/file_chunker.cpp @@ -0,0 +1,115 @@ +// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/buildtool/storage/file_chunker.hpp" + +#include +#include + +#include "gsl/gsl" + +namespace { + +// Mask values taken from algorithm 2 of the paper +// https://ieeexplore.ieee.org/document/9055082. +constexpr std::uint64_t kMaskS{0x0000d9f003530000ULL}; // 15 '1' bits +constexpr std::uint64_t kMaskL{0x0000d90003530000ULL}; // 11 '1' bits + +// Predefined array of 256 random 64-bit integers, needs to be initialized. +constexpr std::uint32_t kRandomTableSize{256}; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +std::array gear_table{}; + +} // namespace + +auto FileChunker::Initialize(std::uint32_t seed) noexcept -> void { + std::mt19937_64 gen64(seed); + for (auto& item : gear_table) { + item = gen64(); + } +} + +auto FileChunker::IsOpen() const noexcept -> bool { + return stream_.is_open(); +} + +auto FileChunker::Finished() const noexcept -> bool { + return stream_.eof() && pos_ == size_; +} + +auto FileChunker::NextChunk() noexcept -> std::optional { + // Handle failed past read attempts from the stream. + if (not stream_.good() and not stream_.eof()) { + return std::nullopt; + } + + // Ensure that at least max_chunk_size bytes are in the buffer, except if + // end-of-file is reached. + auto remaining = size_ - pos_; + if (remaining < max_chunk_size_ and not stream_.eof()) { + // Move the remaining bytes of the buffer to the front. + buffer_.copy(&buffer_[0], remaining, pos_); + auto ssize = static_cast(buffer_.size() - remaining); + // Fill the buffer with stream content. + stream_.read(&buffer_[remaining], ssize); + if (not stream_.good() and not stream_.eof()) { + return std::nullopt; + } + size_ = static_cast(stream_.gcount()) + remaining; + pos_ = 0; + } + + // Handle finished chunking. + if (pos_ == size_) { + return std::nullopt; + } + + auto off = NextChunkBoundary(); + auto chunk = buffer_.substr(pos_, off); + pos_ += off; + return chunk; +} + +// Implementation of the FastCDC data deduplication algorithm described in +// algorithm 2 of the paper https://ieeexplore.ieee.org/document/9055082. +auto FileChunker::NextChunkBoundary() noexcept -> std::size_t { + auto n = size_ - pos_; + auto fp = 0ULL; + auto i = min_chunk_size_; + auto normal_size = average_chunk_size_; + if (n <= min_chunk_size_) { + return n; + } + if (n >= max_chunk_size_) { + n = max_chunk_size_; + } + else if (n <= normal_size) { + normal_size = n; + } + for (; i < normal_size; i++) { + fp = (fp << 1U) + + gsl::at(gear_table, static_cast(buffer_[pos_ + i])); + if ((fp & kMaskS) == 0) { + return i; // if the masked bits are all '0' + } + } + for (; i < n; i++) { + fp = (fp << 1U) + + gsl::at(gear_table, static_cast(buffer_[pos_ + i])); + if ((fp & kMaskL) == 0) { + return i; // if the masked bits are all '0' + } + } + return i; +} diff --git a/src/buildtool/storage/file_chunker.hpp b/src/buildtool/storage/file_chunker.hpp new file mode 100644 index 00000000..c4611c48 --- /dev/null +++ b/src/buildtool/storage/file_chunker.hpp @@ -0,0 +1,98 @@ +// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef INCLUDED_SRC_BUILDTOOL_STORAGE_FILE_CHUNKER_HPP +#define INCLUDED_SRC_BUILDTOOL_STORAGE_FILE_CHUNKER_HPP + +#include +#include +#include +#include +#include +#include + +/// @brief This class provides content-defined chunking for a file stream. It +/// allows to split a file stream into variable-sized chunks based on its data +/// content. In contrast to fixed-sized chunking, which splits a data stream +/// into chunks of fixed size, it is not prone to the data-shifting problem. In +/// order to assemble the resulting file, the delivered chunks have to be +/// concatenated in order. +/// +/// A read buffer is used to progressively process the file content instead of +/// reading the entire file content in memory. +class FileChunker { + static constexpr std::uint32_t kDefaultChunkSize{1024 * 8}; // 8 KB + static constexpr std::uint32_t kDefaultSeed{0}; + + public: + /// @brief Create an instance of the file chunker for a given file. + /// @param path The path to the file to be splitted. + /// @param average_chunk_size Targeted average chunk size in bytes + /// (default: 8 KB). + explicit FileChunker(std::filesystem::path const& path, + std::uint32_t average_chunk_size = kDefaultChunkSize) + // According to section 4.1 of the paper + // https://ieeexplore.ieee.org/document/9055082, maximum and minimum + // chunk sizes are configured to the 8x and the 1/4x of the average + // chunk size. + : min_chunk_size_(average_chunk_size >> 2U), + average_chunk_size_(average_chunk_size), + max_chunk_size_(average_chunk_size << 3U), + stream_{path, std::ios::in | std::ios::binary} { + // The buffer size needs to be at least max_chunk_size_ large, otherwise + // max_chunk_size_ is not fully exhausted and the buffer size determines + // the maximum chunk size. + buffer_.resize(max_chunk_size_ << 4U); + } + + FileChunker() noexcept = delete; + ~FileChunker() noexcept = default; + FileChunker(FileChunker const& other) noexcept = delete; + FileChunker(FileChunker&& other) noexcept = delete; + auto operator=(FileChunker const& other) noexcept = delete; + auto operator=(FileChunker&& other) noexcept = delete; + + /// @brief Check if the underlying file is open. + /// @return True if the file was opened successfully, false otherwise. + [[nodiscard]] auto IsOpen() const noexcept -> bool; + + /// @brief Check if chunking of the file stream was done successfully. + /// @return True if chunking was successful, false otherwise. + [[nodiscard]] auto Finished() const noexcept -> bool; + + /// @brief Fetch the next chunk from the file stream. + /// @return The next chunk of the file stream. + [[nodiscard]] auto NextChunk() noexcept -> std::optional; + + /// @brief Initialize random number table used by the chunking algorithm. + /// @param seed Some random seed. + static auto Initialize(std::uint32_t seed = kDefaultSeed) noexcept -> void; + + private: + // Different chunk size parameters, defined in number of bytes. + const std::uint32_t min_chunk_size_{}; + const std::uint32_t average_chunk_size_{}; + const std::uint32_t max_chunk_size_{}; + std::ifstream stream_{}; // File stream to be splitted. + std::string buffer_{}; // Buffer for the file content. + std::size_t size_{0}; // Current size of the buffer. + std::size_t pos_{0}; // Current read position within the buffer. + + /// @brief Find the next chunk boundary from the current read position + /// within the buffer. + /// @return The position of the next chunk boundary. + [[nodiscard]] auto NextChunkBoundary() noexcept -> std::size_t; +}; + +#endif // INCLUDED_SRC_BUILDTOOL_STORAGE_FILE_CHUNKER_HPP -- cgit v1.2.3