summaryrefslogtreecommitdiff
path: root/src/buildtool/storage/file_chunker.hpp
blob: 4a7e99e413aff9b948f7d6938e42e8448d68e045 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef INCLUDED_SRC_BUILDTOOL_STORAGE_FILE_CHUNKER_HPP
#define INCLUDED_SRC_BUILDTOOL_STORAGE_FILE_CHUNKER_HPP

#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <optional>
#include <string>

/// @brief Content-defined chunker operating on a file stream. The stream is
/// cut into variable-sized chunks whose boundaries are derived from the data
/// content itself. Unlike fixed-size chunking, which cuts a data stream into
/// chunks of fixed size, this approach is not prone to the data-shifting
/// problem. Concatenating the delivered chunks in the order of their delivery
/// reassembles the file.
///
/// The file content is processed progressively through a read buffer, so the
/// entire file content is never read into memory at once.
class FileChunker {
    static constexpr std::uint32_t kAverageChunkSize{1024 * 128};  // 128 KB
    static constexpr std::uint32_t kDefaultSeed{0};

  public:
    /// @brief Create a file chunker for the file at the given path. The
    /// targeted average chunk size is fixed to kAverageChunkSize (128 KB).
    /// @param path     The path to the file to be split.
    explicit FileChunker(std::filesystem::path const& path)
        : stream_{path, std::ios::in | std::ios::binary} {
        // The buffer must hold at least max_chunk_size_ bytes; a smaller
        // buffer would never let a chunk grow up to max_chunk_size_ and the
        // buffer size would dictate the maximum chunk size instead. It is
        // sized to twice the maximum chunk size.
        buffer_.resize(static_cast<std::size_t>(max_chunk_size_) * 2U);
    }

    // A chunker is bound to its file stream: it can neither be created without
    // a path nor be copied or moved.
    FileChunker() noexcept = delete;
    FileChunker(FileChunker const&) noexcept = delete;
    FileChunker(FileChunker&&) noexcept = delete;
    auto operator=(FileChunker const&) noexcept = delete;
    auto operator=(FileChunker&&) noexcept = delete;
    ~FileChunker() noexcept = default;

    /// @brief Initialize the random number table used by the chunking
    /// algorithm.
    /// @param seed     Some random seed (defaults to kDefaultSeed).
    static auto Initialize(std::uint32_t seed = kDefaultSeed) noexcept -> void;

    /// @brief Check whether the underlying file is open.
    /// @return True if the file was opened successfully, false otherwise.
    [[nodiscard]] auto IsOpen() const noexcept -> bool;

    /// @brief Check whether chunking of the file stream was done successfully.
    /// @return True if chunking was successful, false otherwise.
    [[nodiscard]] auto Finished() const noexcept -> bool;

    /// @brief Fetch the next chunk from the file stream.
    /// @return The next chunk of the file stream.
    /// NOTE(review): presumably std::nullopt signals that no further chunk
    /// could be delivered -- confirm against the implementation.
    [[nodiscard]] auto NextChunk() noexcept -> std::optional<std::string>;

  private:
    /// @brief Find the next chunk boundary, starting from the current read
    /// position within the buffer.
    /// @return The position of the next chunk boundary.
    [[nodiscard]] auto NextChunkBoundary() noexcept -> std::size_t;

    // Chunk size parameters, given in number of bytes. According to section
    // 4.1 of the paper https://ieeexplore.ieee.org/document/9055082, the
    // minimum and maximum chunk sizes are configured to 1/4x and 8x of the
    // average chunk size.
    const std::uint32_t min_chunk_size_{kAverageChunkSize / 4U};
    const std::uint32_t average_chunk_size_{kAverageChunkSize};
    const std::uint32_t max_chunk_size_{kAverageChunkSize * 8U};
    std::ifstream stream_;  // File stream to be split.
    std::string buffer_;    // Buffer for the file content.
    std::size_t size_{0};   // Current size of the buffer.
    std::size_t pos_{0};    // Current read position within the buffer.
};

#endif  // INCLUDED_SRC_BUILDTOOL_STORAGE_FILE_CHUNKER_HPP