summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlberto Sartori <alberto.sartori@huawei.com>2023-12-04 16:37:25 +0100
committerAlberto Sartori <alberto.sartori@huawei.com>2023-12-13 23:04:36 +0100
commit9b3eb0e818f25be8a49b441a55991bcf86697548 (patch)
tree875637c9be4416baed52daff0492bd1884a79475 /src
parent2c60618bc6a49ea3b88e65991b14243f98555193 (diff)
downloadjustbuild-9b3eb0e818f25be8a49b441a55991bcf86697548.tar.gz
Add a retry lib that provides useful wrappers to allow to retry...
...the provided function until the function succeeds, the maximum number of attempts is reached, or the function returns an error indicating that simply retrying cannot help. It is meant to be used with RPC calls, but the logic can be applied to other scenarios.
Diffstat (limited to 'src')
-rw-r--r--src/buildtool/common/remote/TARGETS18
-rw-r--r--src/buildtool/common/remote/retry.hpp146
-rw-r--r--src/buildtool/common/remote/retry_parameters.hpp127
3 files changed, 291 insertions, 0 deletions
diff --git a/src/buildtool/common/remote/TARGETS b/src/buildtool/common/remote/TARGETS
index 97966431..087a0ea4 100644
--- a/src/buildtool/common/remote/TARGETS
+++ b/src/buildtool/common/remote/TARGETS
@@ -29,4 +29,22 @@
, "deps": [["@", "fmt", "", "fmt"], ["@", "json", "", "json"], "port"]
, "stage": ["src", "buildtool", "common", "remote"]
}
+, "retry_parameters":
+ { "type": ["@", "rules", "CC", "library"]
+ , "name": ["retry_parameters"]
+ , "hdrs": ["retry_parameters.hpp"]
+ , "deps": [["src/buildtool/logging", "logging"]]
+ , "stage": ["src", "buildtool", "common", "remote"]
+ }
+, "retry":
+ { "type": ["@", "rules", "CC", "library"]
+ , "name": ["retry"]
+ , "hdrs": ["retry.hpp"]
+ , "deps":
+ [ ["src/buildtool/logging", "logging"]
+ , ["@", "grpc", "", "grpc++"]
+ , "retry_parameters"
+ ]
+ , "stage": ["src", "buildtool", "common", "remote"]
+ }
}
diff --git a/src/buildtool/common/remote/retry.hpp b/src/buildtool/common/remote/retry.hpp
new file mode 100644
index 00000000..7abfcc2e
--- /dev/null
+++ b/src/buildtool/common/remote/retry.hpp
@@ -0,0 +1,146 @@
+// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <chrono>
+#include <exception>
+#include <optional>
+#include <string>
+#include <thread>
+#include <utility>
+
+#include <grpcpp/grpcpp.h>
+
+#include "src/buildtool/common/remote/retry_parameters.hpp"
+#include "src/buildtool/logging/logger.hpp"
+
+/// \brief Outcome reported by a callable that is driven by WithRetry.
+///
+/// Useful when a failure cannot be detected directly from the return value of
+/// the underlying operation, e.g., when a grpc stream is involved.
+///
+/// At most one of the two flags is expected to be true; it is the
+/// responsibility of the code filling in the response not to set both.
+///
+/// Design note: a single flag (e.g., "exit") would be enough to leave the
+/// retry loop, but then the reason for leaving (success or failure) would have
+/// to be determined twice: once inside the retry loop and once more by the
+/// caller.
+struct RetryResponse {
+    /// True when the callable ran successfully.
+    bool ok = false;
+    /// True when the failure is such that retrying is not worthwhile.
+    bool exit_retry_loop = false;
+    /// Error message that is logged when exit_retry_loop is true or when the
+    /// last retry attempt failed.
+    std::optional<std::string> error_msg = std::nullopt;
+};
+
+/// \brief Satisfied by callables that can be invoked without arguments through
+/// a const reference and whose result can brace-initialize a RetryResponse.
+template <typename F>
+concept CallableReturningRetryResponse = requires(F const& callable) {
+    RetryResponse{callable()};
+};
+
+/// \brief Invoke \p f until it succeeds, reports that retrying is pointless,
+/// or the configured maximum number of attempts (Retry::GetMaxAttempts()) is
+/// reached.
+///
+/// Between two attempts the calling thread sleeps for
+/// Retry::GetSleepTimeSeconds(attempt) seconds; there is no sleep after the
+/// last attempt.
+///
+/// \param f       Callable returning a RetryResponse. The loop is left as soon
+///                as one of the two flags of the response is true.
+/// \param logger  Receives the retry notices (at kRetryLogLevel) and, if the
+///                response carries one, the final error message (at
+///                LogLevel::Error).
+/// \returns true if an attempt reported ok; false if an attempt reported
+/// exit_retry_loop, if all attempts failed, or if an exception was caught.
+template <CallableReturningRetryResponse F>
+[[nodiscard]] auto WithRetry(F const& f, Logger const& logger) noexcept
+    -> bool {
+    try {
+        auto const attempts = Retry::GetMaxAttempts();
+        for (auto attempt = 1U; attempt <= attempts; ++attempt) {
+            auto [ok, fatal, error_msg] = f();
+            if (ok) {
+                return true;
+            }
+            if (fatal) {
+                if (error_msg) {
+                    logger.Emit(LogLevel::Error, *error_msg);
+                }
+                return false;
+            }
+            // don't wait if it was the last attempt
+            if (attempt < attempts) {
+                auto const sleep_for_seconds =
+                    Retry::GetSleepTimeSeconds(attempt);
+                logger.Emit(kRetryLogLevel,
+                            "Attempt {}/{} failed{} Retrying in {} seconds.",
+                            attempt,
+                            attempts,
+                            error_msg ? fmt::format(": {}", *error_msg) : ".",
+                            sleep_for_seconds);
+                std::this_thread::sleep_for(
+                    std::chrono::seconds(sleep_for_seconds));
+            }
+            else {
+                if (error_msg) {
+                    logger.Emit(LogLevel::Error,
+                                "After {} attempts: {}",
+                                attempt,
+                                *error_msg);
+                }
+            }
+        }
+    } catch (std::exception const& e) {
+        // Report the reason instead of discarding it: it is usually the only
+        // hint about what went wrong inside the callable.
+        logger.Emit(
+            LogLevel::Error, "WithRetry: caught exception: {}", e.what());
+    } catch (...) {
+        logger.Emit(LogLevel::Error, "WithRetry: caught unknown exception");
+    }
+    return false;
+}
+
+/// \brief Satisfied by callables that can be invoked without arguments through
+/// a const reference and whose result can brace-initialize a grpc::Status.
+template <typename F>
+concept CallableReturningGrpcStatus = requires(F const& callable) {
+    grpc::Status{callable()};
+};
+
+template <CallableReturningGrpcStatus F>
+// F is the function to be invoked with a back off algorithm
+[[nodiscard]] auto WithRetry(F const& f, Logger const& logger) noexcept
+ -> std::pair<bool, grpc::Status> {
+ grpc::Status status{};
+ try {
+ auto attempts = Retry::GetMaxAttempts();
+ for (auto attempt = 1U; attempt <= attempts; ++attempt) {
+ status = f();
+ if (status.ok() or
+ status.error_code() != grpc::StatusCode::UNAVAILABLE) {
+ return {status.ok(), std::move(status)};
+ }
+ // don't wait if it was the last attempt
+ if (attempt < attempts) {
+ auto const sleep_for_seconds =
+ Retry::GetSleepTimeSeconds(attempt);
+ logger.Emit(
+ kRetryLogLevel,
+ "Attempt {}/{} failed: {}: {}: Retrying in {} seconds.",
+ attempt,
+ attempts,
+ static_cast<int>(status.error_code()),
+ status.error_message(),
+ sleep_for_seconds);
+ std::this_thread::sleep_for(
+ std::chrono::seconds(sleep_for_seconds));
+ }
+ else {
+ // The caller performs a second check on the
+ // status.error_code(), and, eventually, emits to Error level
+ // there.
+ //
+ // To avoid duplication of similar errors, we emit to Debug
+ // level.
+ logger.Emit(LogLevel::Debug,
+ "After {} attempts: {}: {}",
+ attempt,
+ static_cast<int>(status.error_code()),
+ status.error_message());
+ }
+ }
+ } catch (...) {
+ logger.Emit(LogLevel::Error, "WithRetry: caught unknown exception");
+ }
+ return {false, std::move(status)};
+}
diff --git a/src/buildtool/common/remote/retry_parameters.hpp b/src/buildtool/common/remote/retry_parameters.hpp
new file mode 100644
index 00000000..73b92db7
--- /dev/null
+++ b/src/buildtool/common/remote/retry_parameters.hpp
@@ -0,0 +1,127 @@
+// Copyright 2023 Huawei Cloud Computing Technology Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <mutex>
+#include <random>
+
+#include "src/buildtool/logging/logger.hpp"
+
+constexpr unsigned int kDefaultInitialBackoffSeconds{1};
+constexpr unsigned int kDefaultMaxBackoffSeconds{60};
+constexpr unsigned int kDefaultAttempts{1};
+constexpr auto kRetryLogLevel = LogLevel::Progress;
+/// \brief Retry configuration (backoff times and maximum number of attempts)
+/// together with the computation of the sleep time between two attempts.
+///
+/// The static member functions operate on the single instance returned by
+/// Instance(). Only Jitter() takes the internal mutex; the setters and getters
+/// access the configuration values without synchronization.
+class Retry {
+    using dist_type = std::uniform_int_distribution<std::mt19937::result_type>;
+
+  public:
+    Retry() = default;
+
+    /// \brief Access the shared instance, which is created on first use.
+    [[nodiscard]] static auto Instance() -> Retry& {
+        static Retry instance{};
+        return instance;
+    }
+
+    /// \brief Set the backoff, in seconds, used after the first failed
+    /// attempt.
+    /// \returns false (after logging an error) if \p x is 0, true otherwise.
+    [[nodiscard]] static auto SetInitialBackoffSeconds(unsigned int x) noexcept
+        -> bool {
+        if (x < 1) {
+            Logger::Log(
+                LogLevel::Error,
+                "Invalid initial amount of seconds provided: {}. Value must "
+                "be strictly greater than 0.",
+                x);
+            return false;
+        }
+        Instance().initial_backoff_seconds_ = x;
+        return true;
+    }
+
+    /// \brief Set the upper bound, in seconds, of the backoff (the jitter is
+    /// added on top of it).
+    /// \returns false (after logging an error) if \p x is 0, true otherwise.
+    [[nodiscard]] static auto SetMaxBackoffSeconds(unsigned int x) noexcept
+        -> bool {
+        if (x < 1) {
+            Logger::Log(LogLevel::Error,
+                        "Invalid max backoff provided: {}. Value must be "
+                        "strictly greater than 0.",
+                        x);
+            return false;
+        }
+        Instance().max_backoff_seconds_ = x;
+        return true;
+    }
+
+    [[nodiscard]] static auto GetMaxBackoffSeconds() noexcept -> unsigned int {
+        return Instance().max_backoff_seconds_;
+    }
+
+    /// \brief Set the maximum number of attempts.
+    /// \returns false (after logging an error) if \p x is 0, true otherwise.
+    [[nodiscard]] static auto SetMaxAttempts(unsigned int x) noexcept -> bool {
+        if (x < 1) {
+            Logger::Log(LogLevel::Error,
+                        "Invalid number of max number of attempts provided: "
+                        "{}. Value must be strictly greater than 0",
+                        x);
+            return false;
+        }
+        Instance().attempts_ = x;
+        return true;
+    }
+
+    [[nodiscard]] static auto GetInitialBackoffSeconds() noexcept
+        -> unsigned int {
+        return Instance().initial_backoff_seconds_;
+    }
+
+    [[nodiscard]] static auto GetMaxAttempts() noexcept -> unsigned int {
+        return Instance().attempts_;
+    }
+
+    /// \brief Random value drawn from a uniform distribution over
+    /// [0, 3 * \p backoff].
+    ///
+    /// The random number generator is shared, hence it is only used while
+    /// holding the mutex.
+    /// \returns the random value, or 0 if drawing it raised an exception.
+    [[nodiscard]] static auto Jitter(unsigned int backoff) noexcept ->
+        typename dist_type::result_type {
+        auto& inst = Instance();
+        try {
+            dist_type dist{0, backoff * 3};
+            std::unique_lock lock(inst.mutex_);
+            return dist(inst.rng_);
+        } catch (...) {
+            return 0;
+        }
+    }
+
+    /// \brief Time to wait, in seconds, after the failed attempt number
+    /// \p attempt (counted from 1).
+    ///
+    /// The backoff starts at the initial backoff and is doubled for every
+    /// attempt after the first one, but it never exceeds the maximum backoff.
+    ///
+    /// To avoid overloading the contacted resource, a jitter (i.e., a random
+    /// value, see Jitter()) is added to the backoff in order to spread the
+    /// workload.
+    [[nodiscard]] static auto GetSleepTimeSeconds(unsigned int attempt) noexcept
+        -> unsigned int {
+        auto const max_backoff = Retry::GetMaxBackoffSeconds();
+        auto backoff = Retry::GetInitialBackoffSeconds();
+        // Both values are set independently of each other, so the initial
+        // backoff may be larger than the maximum; the maximum wins, also for
+        // the first attempt.
+        if (backoff > max_backoff) {
+            backoff = max_backoff;
+        }
+        // on the first attempt, we don't double the backoff time
+        // also we do it in a for loop so that we can stop at the maximum
+        for (auto x = 1U; x < attempt; ++x) {
+            // backoff <= max_backoff holds here, so the subtraction cannot
+            // wrap around. The condition is "2 * backoff >= max_backoff",
+            // written such that the doubling itself can never overflow.
+            if (backoff >= max_backoff - backoff) {
+                backoff = max_backoff;
+                break;
+            }
+            backoff <<= 1U;
+        }
+        return backoff + Retry::Jitter(backoff);
+    }
+
+  private:
+    unsigned int initial_backoff_seconds_{kDefaultInitialBackoffSeconds};
+    unsigned int max_backoff_seconds_{kDefaultMaxBackoffSeconds};
+    unsigned int attempts_{kDefaultAttempts};
+    // Protects rng_, which is shared by all callers of Jitter().
+    std::mutex mutex_;
+    // dev_ has to be declared before rng_, as it provides the seed for it.
+    std::random_device dev_;
+    std::mt19937 rng_{dev_()};
+};