summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Blanchard <martin.blanchard@codethink.co.uk>2018-09-18 08:56:43 +0100
committerMartin Blanchard <martin.blanchard@codethink.co.uk>2018-10-23 11:54:40 +0100
commitaa0cbf5dfb8e68674c60efe841cf862d6e5cdea6 (patch)
tree4c7cf7ccc256b7a33985884593873159ab6929a9
parentecb58b423ab6896a6bfc4f634089ca73d9816782 (diff)
downloadbuildstream-mablanch/630-remote-execution-reconn.tar.gz
_sandboxremote.py: Try to reopen operation stream on failure (mablanch/630-remote-execution-reconn)
The REAPI provides a WaitExecution() call that allows a client to reconnect to an ongoing operation stream. If the server implements it, BuildStream will use it to try to recover from connection errors. See https://gitlab.com/BuildStream/buildstream/issues/630
-rw-r--r--buildstream/sandbox/_sandboxremote.py58
1 file changed, 43 insertions, 15 deletions
diff --git a/buildstream/sandbox/_sandboxremote.py b/buildstream/sandbox/_sandboxremote.py
index ab0c31bff..f522cc772 100644
--- a/buildstream/sandbox/_sandboxremote.py
+++ b/buildstream/sandbox/_sandboxremote.py
@@ -76,8 +76,7 @@ class SandboxRemote(Sandbox):
# Upload the Command message to the remote CAS server
command_digest = cascache.push_message(self._get_project(), remote_command)
if not command_digest or not cascache.verify_digest_pushed(self._get_project(), command_digest):
- # Command push failed
- return None
+ raise SandboxError("Failed pushing build command to remote CAS.")
# Create and send the action.
action = remote_execution_pb2.Action(command_digest=command_digest,
@@ -88,27 +87,57 @@ class SandboxRemote(Sandbox):
# Upload the Action message to the remote CAS server
action_digest = cascache.push_message(self._get_project(), action)
if not action_digest or not cascache.verify_digest_pushed(self._get_project(), action_digest):
- # Action push failed
- return None
+ raise SandboxError("Failed pushing build action to remote CAS.")
# Next, try to create a communication channel to the BuildGrid server.
channel = grpc.insecure_channel(self.server_url)
stub = remote_execution_pb2_grpc.ExecutionStub(channel)
request = remote_execution_pb2.ExecuteRequest(action_digest=action_digest,
skip_cache_lookup=False)
- try:
- operation_iterator = stub.Execute(request)
- except grpc.RpcError:
- return None
+
+ def __run_remote_command(stub, execute_request=None, running_operation=None):
+ try:
+ last_operation = None
+ if execute_request is not None:
+ operation_iterator = stub.Execute(execute_request)
+ else:
+ request = remote_execution_pb2.WaitExecutionRequest(name=running_operation.name)
+ operation_iterator = stub.WaitExecution(request)
+
+ for operation in operation_iterator:
+ if operation.done:
+ return operation
+ else:
+ last_operation = operation
+ except grpc.RpcError as e:
+ status_code = e.code()
+ if status_code == grpc.StatusCode.UNAVAILABLE:
+ raise SandboxError("Failed contacting remote execution server at {}."
+ .format(self.server_url))
+
+ elif status_code in (grpc.StatusCode.INVALID_ARGUMENT,
+ grpc.StatusCode.FAILED_PRECONDITION,
+ grpc.StatusCode.RESOURCE_EXHAUSTED,
+ grpc.StatusCode.INTERNAL,
+ grpc.StatusCode.DEADLINE_EXCEEDED):
+ raise SandboxError("{} ({}).".format(e.details(), status_code.name))
+
+ elif running_operation and status_code == grpc.StatusCode.UNIMPLEMENTED:
+ raise SandboxError("Failed trying to recover from connection loss: "
+ "server does not support operation status polling recovery.")
+
+ return last_operation
operation = None
with self._get_context().timed_activity("Waiting for the remote build to complete"):
- # It is advantageous to check operation_iterator.code() is grpc.StatusCode.OK here,
- # which will check the server is actually contactable. However, calling it when the
- # server is available seems to cause .code() to hang forever.
- for operation in operation_iterator:
- if operation.done:
- break
+ operation = __run_remote_command(stub, execute_request=request)
+ if operation is None:
+ return None
+ elif operation.done:
+ return operation
+
+ while operation is not None and not operation.done:
+ operation = __run_remote_command(stub, running_operation=operation)
return operation
@@ -192,7 +221,6 @@ class SandboxRemote(Sandbox):
if operation is None:
# Failure of remote execution, usually due to an error in BuildStream
- # NB This error could be raised in __run_remote_command
raise SandboxError("No response returned from server")
assert not operation.HasField('error') and operation.HasField('response')