summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jürg Billeter <j@bitron.ch> 2017-07-27 10:20:53 +0100
committer: Jürg Billeter <j@bitron.ch> 2017-07-27 11:58:44 +0100
commit: bfd822b3232f69ed9df48a20fedf31be7e0986f1 (patch)
tree: 4f5859ebecbb3e89d7c6281184ac5a7ec62c3dfb
parent: 855b63003ee15eb8cdd42f174a9193ee7b942ef5 (diff)
download: buildstream-bfd822b3232f69ed9df48a20fedf31be7e0986f1.tar.gz
Add network-retries option
Retry network tasks up to two times by default. Fixes #30
-rw-r--r-- buildstream/_frontend/main.py 5
-rw-r--r-- buildstream/_frontend/widget.py 1
-rw-r--r-- buildstream/_scheduler/job.py 29
-rw-r--r-- buildstream/_scheduler/queue.py 5
-rw-r--r-- buildstream/context.py 4
-rw-r--r-- buildstream/data/userconfig.yaml 3
6 files changed, 41 insertions, 6 deletions
diff --git a/buildstream/_frontend/main.py b/buildstream/_frontend/main.py
index 66b13c2be..d468b1f57 100644
--- a/buildstream/_frontend/main.py
+++ b/buildstream/_frontend/main.py
@@ -62,6 +62,8 @@ _, _, _, _, host_machine = os.uname()
help="Maximum simultaneous build tasks")
@click.option('--pushers', type=click.INT, default=None,
help="Maximum simultaneous upload tasks")
+@click.option('--network-retries', type=click.INT, default=None,
+ help="Maximum retries for network tasks")
@click.option('--no-interactive', is_flag=True, default=False,
help="Force non interactive mode, otherwise this is automatically decided")
@click.option('--verbose/--no-verbose', default=None,
@@ -625,7 +627,8 @@ class App():
'on_error': 'sched_error_action',
'fetchers': 'sched_fetchers',
'builders': 'sched_builders',
- 'pushers': 'sched_pushers'
+ 'pushers': 'sched_pushers',
+ 'network_retries': 'sched_network_retries'
}
for cli_option, context_attr in override_map.items():
option_value = self.main_options.get(cli_option)
diff --git a/buildstream/_frontend/widget.py b/buildstream/_frontend/widget.py
index 2177e892a..601ccdcfb 100644
--- a/buildstream/_frontend/widget.py
+++ b/buildstream/_frontend/widget.py
@@ -437,6 +437,7 @@ class LogLine(Widget):
values["Maximum Fetch Tasks"] = context.sched_fetchers
values["Maximum Build Tasks"] = context.sched_builders
values["Maximum Push Tasks"] = context.sched_pushers
+ values["Maximum Network Retries"] = context.sched_network_retries
text += self.format_values(values)
text += '\n'
diff --git a/buildstream/_scheduler/job.py b/buildstream/_scheduler/job.py
index e72332267..7c7c29bb7 100644
--- a/buildstream/_scheduler/job.py
+++ b/buildstream/_scheduler/job.py
@@ -78,16 +78,22 @@ class Job():
self.pid = None # The child's pid in the parent
self.result = None # Return value of child action in the parent
+ self.tries = 0
+
# spawn()
#
# Args:
# action (callable): The action function
# complete (callable): The function to call when complete
+ # max_retries (int): The maximum number of retries
#
- def spawn(self, action, complete):
+ def spawn(self, action, complete, max_retries=0):
self.action = action
self.complete = complete
+ self.tries += 1
+ self.max_retries = max_retries
+
self.parent_start_listening()
# Spawn the process
@@ -263,9 +269,15 @@ class Job():
except _BstError as e:
elapsed = datetime.datetime.now() - starttime
- self.message(element, MessageType.FAIL, self.action_name,
- elapsed=elapsed, detail=str(e),
- logfile=filename, sandbox=e.sandbox)
+
+ if self.tries <= self.max_retries:
+ self.message(element, MessageType.FAIL, "Try #{} failed, retrying".format(self.tries),
+ elapsed=elapsed)
+ else:
+ self.message(element, MessageType.FAIL, self.action_name,
+ elapsed=elapsed, detail=str(e),
+ logfile=filename, sandbox=e.sandbox)
+
self.child_shutdown(1)
except Exception as e:
@@ -290,6 +302,11 @@ class Job():
self.child_shutdown(0)
def child_complete(self, pid, returncode, element):
+ if returncode != 0 and self.tries <= self.max_retries:
+ self.shutdown()
+ self.spawn(self.action, self.complete, self.max_retries)
+ return
+
self.complete(self, returncode, element)
def child_shutdown(self, exit_code):
@@ -336,6 +353,10 @@ class Job():
# Log first
self.child_log(plugin, message, context)
+ if message.message_type == MessageType.FAIL and self.tries <= self.max_retries:
+ # Job will be retried, display failures as warnings in the frontend
+ message.message_type = MessageType.WARN
+
# Send to frontend if appropriate
if (context._silent_messages() and
message.message_type not in unconditional_messages):
diff --git a/buildstream/_scheduler/queue.py b/buildstream/_scheduler/queue.py
index 04e559eee..97fec98d5 100644
--- a/buildstream/_scheduler/queue.py
+++ b/buildstream/_scheduler/queue.py
@@ -55,6 +55,7 @@ class Queue():
self.wait_queue = deque()
self.done_queue = deque()
self.active_jobs = []
+ self.max_retries = 0
# For the frontend to know how many elements
# were successfully processed, failed, or skipped
@@ -139,6 +140,8 @@ class Queue():
# Attach to the scheduler
def attach(self, scheduler):
self.scheduler = scheduler
+ if self.queue_type == QueueType.FETCH or self.queue_type == QueueType.PUSH:
+ self.max_retries = scheduler.context.sched_network_retries
def enqueue(self, elts):
if not elts:
@@ -177,7 +180,7 @@ class Queue():
job = Job(scheduler, element, self.action_name)
scheduler.job_starting(job)
- job.spawn(self.process, self.job_done)
+ job.spawn(self.process, self.job_done, self.max_retries)
self.active_jobs.append(job)
# These were not ready but were in the beginning, give em
diff --git a/buildstream/context.py b/buildstream/context.py
index 97dfe86cd..64acd4fc4 100644
--- a/buildstream/context.py
+++ b/buildstream/context.py
@@ -110,6 +110,9 @@ class Context():
self.sched_pushers = 4
"""Maximum number of push tasks"""
+ self.sched_network_retries = 2
+ """Maximum number of retries for network tasks"""
+
self.sched_error_action = 'continue'
"""What to do when a build fails in non interactive mode"""
@@ -185,6 +188,7 @@ class Context():
self.sched_fetchers = _yaml.node_get(scheduler, int, 'fetchers')
self.sched_builders = _yaml.node_get(scheduler, int, 'builders')
self.sched_pushers = _yaml.node_get(scheduler, int, 'pushers')
+ self.sched_network_retries = _yaml.node_get(scheduler, int, 'network-retries')
profile_end(Topics.LOAD_CONTEXT, 'load')
diff --git a/buildstream/data/userconfig.yaml b/buildstream/data/userconfig.yaml
index e5dcb7820..1cce9fcd5 100644
--- a/buildstream/data/userconfig.yaml
+++ b/buildstream/data/userconfig.yaml
@@ -39,6 +39,9 @@ scheduler:
# Maximum number of simultaneous artifact uploading tasks.
pushers: 4
+ # Maximum number of retries for network tasks.
+ network-retries: 2
+
# What to do when an element fails, if not running in
# interactive mode:
#