diff options
author | Maksim Denisov <denisov.maksim@huawei.com> | 2024-03-14 14:50:38 +0100 |
---|---|---|
committer | Maksim Denisov <denisov.maksim@huawei.com> | 2024-04-02 15:30:03 +0200 |
commit | fbd7eb02efc6a541a79360490e940bad4387c12c (patch) | |
tree | cc728b2a611d32bbf4a65db6016e20b7a7382d2b /src/buildtool/storage/file_chunker.hpp | |
parent | 60142c07ff866fc18ea3e497ab30d2292ff8fd2c (diff) | |
download | justbuild-fbd7eb02efc6a541a79360490e940bad4387c12c.tar.gz |
Move file chunker to storage.
Diffstat (limited to 'src/buildtool/storage/file_chunker.hpp')
-rw-r--r-- | src/buildtool/storage/file_chunker.hpp | 98 |
1 files changed, 98 insertions, 0 deletions
// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef INCLUDED_SRC_BUILDTOOL_STORAGE_FILE_CHUNKER_HPP
#define INCLUDED_SRC_BUILDTOOL_STORAGE_FILE_CHUNKER_HPP

#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <optional>
#include <string>

/// @brief This class provides content-defined chunking for a file stream. It
/// allows splitting a file stream into variable-sized chunks based on its data
/// content. In contrast to fixed-sized chunking, which splits a data stream
/// into chunks of fixed size, it is not prone to the data-shifting problem. In
/// order to assemble the resulting file, the delivered chunks have to be
/// concatenated in order.
///
/// A read buffer is used to progressively process the file content instead of
/// reading the entire file content in memory.
///
/// Instances are neither default-constructible, copyable, nor movable.
class FileChunker {
    static constexpr std::uint32_t kDefaultChunkSize{1024 * 8};  // 8 KB
    static constexpr std::uint32_t kDefaultSeed{0};

  public:
    /// @brief Create an instance of the file chunker for a given file. The
    /// file is opened for binary reading and the read buffer is allocated
    /// right away; use IsOpen() to check whether opening succeeded.
    /// @param path The path to the file to be split.
    /// @param average_chunk_size Targeted average chunk size in bytes
    /// (default: 8 KB). The minimum and maximum chunk sizes are derived from
    /// it in 32-bit arithmetic, so values whose 8x multiple does not fit
    /// into 32 bits wrap around.
    explicit FileChunker(std::filesystem::path const& path,
                         std::uint32_t average_chunk_size = kDefaultChunkSize)
        // According to section 4.1 of the paper
        // https://ieeexplore.ieee.org/document/9055082, maximum and minimum
        // chunk sizes are configured to the 8x and the 1/4x of the average
        // chunk size.
        : min_chunk_size_(average_chunk_size >> 2U),
          average_chunk_size_(average_chunk_size),
          max_chunk_size_(average_chunk_size << 3U),
          stream_{path, std::ios::in | std::ios::binary} {
        // The buffer size needs to be at least max_chunk_size_ large, otherwise
        // max_chunk_size_ is not fully exhausted and the buffer size determines
        // the maximum chunk size. The buffer is sized to 16x max_chunk_size_.
        // Widen to std::size_t before shifting so that the product is not
        // truncated to 32 bits (which could even yield an empty buffer).
        buffer_.resize(static_cast<std::size_t>(max_chunk_size_) << 4U);
    }

    FileChunker() noexcept = delete;
    ~FileChunker() noexcept = default;
    FileChunker(FileChunker const& other) noexcept = delete;
    FileChunker(FileChunker&& other) noexcept = delete;
    auto operator=(FileChunker const& other) noexcept -> FileChunker& = delete;
    auto operator=(FileChunker&& other) noexcept -> FileChunker& = delete;

    /// @brief Check if the underlying file is open.
    /// @return True if the file was opened successfully, false otherwise.
    [[nodiscard]] auto IsOpen() const noexcept -> bool;

    /// @brief Check if chunking of the file stream was done successfully.
    /// @return True if chunking was successful, false otherwise.
    [[nodiscard]] auto Finished() const noexcept -> bool;

    /// @brief Fetch the next chunk from the file stream.
    /// @return The next chunk of the file stream.
    /// NOTE(review): presumably std::nullopt signals that no further chunk
    /// could be delivered -- confirm against the implementation.
    [[nodiscard]] auto NextChunk() noexcept -> std::optional<std::string>;

    /// @brief Initialize random number table used by the chunking algorithm.
    /// @param seed Some random seed (default: 0).
    static auto Initialize(std::uint32_t seed = kDefaultSeed) noexcept -> void;

  private:
    // Different chunk size parameters, defined in number of bytes.
    const std::uint32_t min_chunk_size_{};      // 1/4x of the average size.
    const std::uint32_t average_chunk_size_{};  // As passed to the constructor.
    const std::uint32_t max_chunk_size_{};      // 8x of the average size.
    std::ifstream stream_{};  // File stream to be split.
    std::string buffer_{};    // Buffer for the file content.
    std::size_t size_{0};     // Current size of the buffer.
    std::size_t pos_{0};      // Current read position within the buffer.

    /// @brief Find the next chunk boundary from the current read position
    /// within the buffer.
    /// @return The position of the next chunk boundary.
    [[nodiscard]] auto NextChunkBoundary() noexcept -> std::size_t;
};

#endif  // INCLUDED_SRC_BUILDTOOL_STORAGE_FILE_CHUNKER_HPP