From 08bf03382124fe5fbe58dd068d4ebb546b5c3e87 Mon Sep 17 00:00:00 2001 From: Klaus Aehlig Date: Mon, 28 Oct 2024 14:23:03 +0100 Subject: Retry Execution on FAILED_PRECONDITION The specification for this status code is as follows. One or more errors occurred in setting up the action requested, such as a missing input or command or no worker being available. The client may be able to fix the errors and retry. We routinely ensure all inputs are available to the remote execution before we start an action, so all prerequisites will be there on a compliant server; however, they might not actually be there on a server where the CAS only has eventual consistency or is incorrect (due to old cache entries on CAS purge) in its answer to FindMissingBlobs. While we have no guarantee that a retry will help, we still retry; at least in the case of an unavailable worker or CAS entries not yet available due to eventual consistency, this will help. Also, we log at debug level the full response, including the repeated Any message. In this way, we can find out what useful information (if any) is sent by popular remote-execution services and implement more specific mitigations in the future. 
--- src/buildtool/execution_api/remote/TARGETS | 1 + .../remote/bazel/bazel_execution_client.cpp | 24 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/buildtool/execution_api/remote/TARGETS b/src/buildtool/execution_api/remote/TARGETS index 2c356660..a5290001 100644 --- a/src/buildtool/execution_api/remote/TARGETS +++ b/src/buildtool/execution_api/remote/TARGETS @@ -51,6 +51,7 @@ , "private-deps": [ ["@", "fmt", "", "fmt"] , ["@", "grpc", "", "grpc++"] + , ["@", "protoc", "", "libprotobuf"] , ["src/buildtool/common", "artifact_digest_factory"] , ["src/buildtool/common", "bazel_digest_factory"] , ["src/buildtool/common", "protocol_traits"] diff --git a/src/buildtool/execution_api/remote/bazel/bazel_execution_client.cpp b/src/buildtool/execution_api/remote/bazel/bazel_execution_client.cpp index 8eb26a0c..09f16b8f 100644 --- a/src/buildtool/execution_api/remote/bazel/bazel_execution_client.cpp +++ b/src/buildtool/execution_api/remote/bazel/bazel_execution_client.cpp @@ -16,6 +16,8 @@ #include // std::move +#include "fmt/core.h" +#include "google/protobuf/text_format.h" #include "grpcpp/grpcpp.h" #include "src/buildtool/common/remote/client_common.hpp" #include "src/buildtool/common/remote/retry.hpp" @@ -40,6 +42,17 @@ void LogExecutionStatus(gsl::not_null const& logger, "Execution could not be started.\n{}", s.ShortDebugString()); break; + case grpc::StatusCode::FAILED_PRECONDITION: + // quote from remote_execution.proto: + // One or more errors occurred in setting up the + // action requested, such as a missing input or command or no worker + // being available. The client may be able to fix the errors and + // retry. 
+ logger->Emit(LogLevel::Progress, + "Some precondition for the action failed.\n{}", + s.message()); + break; + default: // fallback to default status logging LogStatus(logger, LogLevel::Error, s); @@ -231,6 +244,17 @@ auto BazelExecutionClient::ExtractContents( if (status_code == grpc::StatusCode::UNAVAILABLE) { response.state = ExecutionResponse::State::Retry; } + else if (status_code == grpc::StatusCode::FAILED_PRECONDITION) { + logger_.Emit(LogLevel::Debug, [&exec_response] { + std::string text_repr; + google::protobuf::TextFormat::PrintToString(exec_response, + &text_repr); + return fmt::format( + "Full exec_response of precondition failure\n{}", + text_repr); + }); + response.state = ExecutionResponse::State::Retry; + } else { response.state = ExecutionResponse::State::Failed; } -- cgit v1.2.3