diff options
-rw-r--r-- | buildstream/_frontend/main.py | 5 | ||||
-rw-r--r-- | buildstream/_frontend/widget.py | 1 | ||||
-rw-r--r-- | buildstream/_scheduler/job.py | 29 | ||||
-rw-r--r-- | buildstream/_scheduler/queue.py | 5 | ||||
-rw-r--r-- | buildstream/context.py | 4 | ||||
-rw-r--r-- | buildstream/data/userconfig.yaml | 3 |
6 files changed, 41 insertions, 6 deletions
diff --git a/buildstream/_frontend/main.py b/buildstream/_frontend/main.py
index 66b13c2be..d468b1f57 100644
--- a/buildstream/_frontend/main.py
+++ b/buildstream/_frontend/main.py
@@ -62,6 +62,8 @@ _, _, _, _, host_machine = os.uname()
               help="Maximum simultaneous build tasks")
 @click.option('--pushers', type=click.INT, default=None,
               help="Maximum simultaneous upload tasks")
+@click.option('--network-retries', type=click.INT, default=None,
+              help="Maximum retries for network tasks")
 @click.option('--no-interactive', is_flag=True, default=False,
               help="Force non interactive mode, otherwise this is automatically decided")
 @click.option('--verbose/--no-verbose', default=None,
@@ -625,7 +627,8 @@ class App():
             'on_error': 'sched_error_action',
             'fetchers': 'sched_fetchers',
             'builders': 'sched_builders',
-            'pushers': 'sched_pushers'
+            'pushers': 'sched_pushers',
+            'network_retries': 'sched_network_retries'
         }
         for cli_option, context_attr in override_map.items():
             option_value = self.main_options.get(cli_option)
diff --git a/buildstream/_frontend/widget.py b/buildstream/_frontend/widget.py
index 2177e892a..601ccdcfb 100644
--- a/buildstream/_frontend/widget.py
+++ b/buildstream/_frontend/widget.py
@@ -437,6 +437,7 @@ class LogLine(Widget):
         values["Maximum Fetch Tasks"] = context.sched_fetchers
         values["Maximum Build Tasks"] = context.sched_builders
         values["Maximum Push Tasks"] = context.sched_pushers
+        values["Maximum Network Retries"] = context.sched_network_retries
         text += self.format_values(values)
         text += '\n'
 
diff --git a/buildstream/_scheduler/job.py b/buildstream/_scheduler/job.py
index e72332267..7c7c29bb7 100644
--- a/buildstream/_scheduler/job.py
+++ b/buildstream/_scheduler/job.py
@@ -78,16 +78,22 @@ class Job():
         self.pid = None  # The child's pid in the parent
         self.result = None  # Return value of child action in the parent
 
+        self.tries = 0
+
     # spawn()
     #
     # Args:
     #    action (callable): The action function
     #    complete (callable): The function to call when complete
+    #    max_retries (int): The maximum number of retries
     #
-    def spawn(self, action, complete):
+    def spawn(self, action, complete, max_retries=0):
         self.action = action
         self.complete = complete
 
+        self.tries += 1
+        self.max_retries = max_retries
+
         self.parent_start_listening()
 
         # Spawn the process
@@ -263,9 +269,15 @@ class Job():
 
             except _BstError as e:
                 elapsed = datetime.datetime.now() - starttime
-                self.message(element, MessageType.FAIL, self.action_name,
-                             elapsed=elapsed, detail=str(e),
-                             logfile=filename, sandbox=e.sandbox)
+
+                if self.tries <= self.max_retries:
+                    self.message(element, MessageType.FAIL, "Try #{} failed, retrying".format(self.tries),
+                                 elapsed=elapsed)
+                else:
+                    self.message(element, MessageType.FAIL, self.action_name,
+                                 elapsed=elapsed, detail=str(e),
+                                 logfile=filename, sandbox=e.sandbox)
+
                 self.child_shutdown(1)
 
             except Exception as e:
@@ -290,6 +302,11 @@ class Job():
                 self.child_shutdown(0)
 
     def child_complete(self, pid, returncode, element):
+        if returncode != 0 and self.tries <= self.max_retries:
+            self.shutdown()
+            self.spawn(self.action, self.complete, self.max_retries)
+            return
+
         self.complete(self, returncode, element)
 
     def child_shutdown(self, exit_code):
@@ -336,6 +353,10 @@ class Job():
         # Log first
         self.child_log(plugin, message, context)
 
+        if message.message_type == MessageType.FAIL and self.tries <= self.max_retries:
+            # Job will be retried, display failures as warnings in the frontend
+            message.message_type = MessageType.WARN
+
         # Send to frontend if appropriate
         if (context._silent_messages() and
                 message.message_type not in unconditional_messages):
diff --git a/buildstream/_scheduler/queue.py b/buildstream/_scheduler/queue.py
index 04e559eee..97fec98d5 100644
--- a/buildstream/_scheduler/queue.py
+++ b/buildstream/_scheduler/queue.py
@@ -55,6 +55,7 @@ class Queue():
         self.wait_queue = deque()
         self.done_queue = deque()
         self.active_jobs = []
+        self.max_retries = 0
 
         # For the frontend to know how many elements
         # were successfully processed, failed, or skipped
@@ -139,6 +140,8 @@ class Queue():
     # Attach to the scheduler
     def attach(self, scheduler):
         self.scheduler = scheduler
+        if self.queue_type == QueueType.FETCH or self.queue_type == QueueType.PUSH:
+            self.max_retries = scheduler.context.sched_network_retries
 
     def enqueue(self, elts):
         if not elts:
@@ -177,7 +180,7 @@ class Queue():
             job = Job(scheduler, element, self.action_name)
             scheduler.job_starting(job)
 
-            job.spawn(self.process, self.job_done)
+            job.spawn(self.process, self.job_done, self.max_retries)
             self.active_jobs.append(job)
 
         # These were not ready but were in the beginning, give em
diff --git a/buildstream/context.py b/buildstream/context.py
index 97dfe86cd..64acd4fc4 100644
--- a/buildstream/context.py
+++ b/buildstream/context.py
@@ -110,6 +110,9 @@ class Context():
         self.sched_pushers = 4
         """Maximum number of push tasks"""
 
+        self.sched_network_retries = 2
+        """Maximum number of retries for network tasks"""
+
         self.sched_error_action = 'continue'
         """What to do when a build fails in non interactive mode"""
 
@@ -185,6 +188,7 @@ class Context():
         self.sched_fetchers = _yaml.node_get(scheduler, int, 'fetchers')
         self.sched_builders = _yaml.node_get(scheduler, int, 'builders')
         self.sched_pushers = _yaml.node_get(scheduler, int, 'pushers')
+        self.sched_network_retries = _yaml.node_get(scheduler, int, 'network-retries')
 
         profile_end(Topics.LOAD_CONTEXT, 'load')
 
diff --git a/buildstream/data/userconfig.yaml b/buildstream/data/userconfig.yaml
index e5dcb7820..1cce9fcd5 100644
--- a/buildstream/data/userconfig.yaml
+++ b/buildstream/data/userconfig.yaml
@@ -39,6 +39,9 @@ scheduler:
   # Maximum number of simultaneous artifact uploading tasks.
   pushers: 4
 
+  # Maximum number of retries for network tasks.
+  network-retries: 2
+
   # What to do when an element fails, if not running in
   # interactive mode:
   #