summaryrefslogtreecommitdiff
path: root/src/buildtool/storage/file_chunker.cpp
blob: b94f487a479dd418b2863e3ed08a7c5be990163b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/buildtool/storage/file_chunker.hpp"

#include <array>
#include <random>
#include <string>

#include "gsl/gsl"

namespace {

// Boundary-detection masks, taken from algorithm 2 of the paper
// https://ieeexplore.ieee.org/document/9055082.
// kMaskS is tested for offsets below the normal chunk size, kMaskL for the
// offsets after it (see FileChunker::NextChunkBoundary).
constexpr std::uint64_t kMaskS = 0x4444d9f003530000ULL;  // 19 bits set
constexpr std::uint64_t kMaskL = 0x4444d90003530000ULL;  // 15 bits set

// Number of entries in the gear table: one per possible byte value.
constexpr std::uint32_t kRandomTableSize = 256;

// Gear table of 64-bit values indexed by byte value. It is zero-initialized
// here and filled with random numbers by FileChunker::Initialize.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
std::array<std::uint64_t, kRandomTableSize> gear_table = {};

}  // namespace

// Fills the file-local gear table with the output of a 64-bit Mersenne
// Twister seeded with `seed`. The same seed always yields the same table.
auto FileChunker::Initialize(std::uint32_t seed) noexcept -> void {
    std::mt19937_64 engine(seed);
    for (auto& entry : gear_table) {
        entry = engine();
    }
}

// Reports whether the underlying file stream is currently open.
auto FileChunker::IsOpen() const noexcept -> bool {
    auto const opened = stream_.is_open();
    return opened;
}

// Reports whether chunking is complete: the stream has signalled end-of-file
// and every buffered byte has been consumed.
auto FileChunker::Finished() const noexcept -> bool {
    if (not stream_.eof()) {
        return false;
    }
    return pos_ == size_;
}

// Returns the next chunk of the file, or std::nullopt if either a stream read
// failed for a reason other than end-of-file or all data has been consumed.
// Refills the internal buffer from the stream whenever fewer than
// max_chunk_size_ unconsumed bytes are buffered and end-of-file has not been
// reached yet.
auto FileChunker::NextChunk() noexcept -> std::optional<std::string> {
    // Handle failed past read attempts from the stream. A set eof flag is not
    // treated as a failure, since a short final read sets it as well.
    if (not stream_.good() and not stream_.eof()) {
        return std::nullopt;
    }

    // Ensure that at least max_chunk_size_ bytes are in the buffer, except if
    // end-of-file is reached.
    auto remaining = size_ - pos_;
    if (remaining < max_chunk_size_ and not stream_.eof()) {
        // Move the remaining bytes of the buffer to the front. Source and
        // destination both live in buffer_ and overlap whenever
        // pos_ < remaining; std::string::copy (traits::copy) must not be used
        // on overlapping ranges, whereas traits::move is specified for them.
        std::char_traits<char>::move(
            buffer_.data(), buffer_.data() + pos_, remaining);
        auto ssize = static_cast<std::streamsize>(buffer_.size() - remaining);
        // Fill the rest of the buffer with stream content.
        stream_.read(&buffer_[remaining], ssize);
        if (not stream_.good() and not stream_.eof()) {
            return std::nullopt;
        }
        // gcount() is the number of bytes actually read, which may be less
        // than requested when end-of-file was hit.
        size_ = static_cast<std::size_t>(stream_.gcount()) + remaining;
        pos_ = 0;
    }

    // Handle finished chunking.
    if (pos_ == size_) {
        return std::nullopt;
    }

    // Cut the chunk at the next content-defined boundary and advance.
    auto off = NextChunkBoundary();
    auto chunk = buffer_.substr(pos_, off);
    pos_ += off;
    return chunk;
}

// FastCDC content-defined chunking, following algorithm 2 of the paper
// https://ieeexplore.ieee.org/document/9055082.
// Returns the length of the next chunk, measured from pos_ within the
// currently buffered data.
auto FileChunker::NextChunkBoundary() noexcept -> std::size_t {
    auto limit = size_ - pos_;
    // Not more than a minimal chunk is left: it is emitted as-is.
    if (limit <= min_chunk_size_) {
        return limit;
    }

    // Cap the scan at the maximum chunk size; if less than the average chunk
    // size is available, the first scan phase ends at the available data.
    auto normal_limit = average_chunk_size_;
    if (limit >= max_chunk_size_) {
        limit = max_chunk_size_;
    }
    else if (limit <= normal_limit) {
        normal_limit = limit;
    }

    auto fingerprint = 0ULL;
    // The first min_chunk_size_ bytes are skipped and never fingerprinted.
    auto offset = min_chunk_size_;

    // Rolls the byte at the current offset into the fingerprint and reports
    // whether all bits selected by the mask are '0'.
    auto const cut_here = [&](std::uint64_t mask) -> bool {
        fingerprint = (fingerprint << 1U) +
                      gsl::at(gear_table,
                              static_cast<std::uint8_t>(buffer_[pos_ + offset]));
        return (fingerprint & mask) == 0;
    };

    // Phase 1: below the normal size, test against kMaskS.
    while (offset < normal_limit) {
        if (cut_here(kMaskS)) {
            return offset;
        }
        ++offset;
    }
    // Phase 2: from the normal size up to the limit, test against kMaskL.
    while (offset < limit) {
        if (cut_here(kMaskL)) {
            return offset;
        }
        ++offset;
    }
    // No boundary found: cut at the limit.
    return offset;
}