summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTristan Van Berkom <tristan.vanberkom@codethink.co.uk>2020-05-15 21:56:57 +0900
committerbst-marge-bot <marge-bot@buildstream.build>2020-05-19 09:28:33 +0000
commita247912c29d84ddf349dcc5d11beb5976a3b0d8a (patch)
tree9edf7d24419bdc64b534fb1129de12f2676c5ed0
parent7cdd7f4e085d6750235c2f8e8f578eae07db9655 (diff)
downloadbuildstream-a247912c29d84ddf349dcc5d11beb5976a3b0d8a.tar.gz
_scheduler: Fix order of launching jobs and sending notifications.
Sending notifications causes potentially large bodies of code to run in the abstracted frontend codebase, we are not allowed to have knowledge of the frontend from this code. Previously, we were adding the job to the active jobs, sending the notification, and then starting the job. This means that if a BuildStream frontend implementation crashes, we handle the excepting in an inconsistent state and try to kill jobs which are not running. In addition to making sure that active_jobs list adjustment and job starting does not have any code body run in the danger window in between these, this patch also adds some fault tolerance and assertions around job termination so that: o Job.terminate() and Job.kill() do not crash with None _process o Job.start() raises an assertion if started after being terminated This fixes the infinite looping aspects of frontend crashes at job_start() time described in #1312.
-rw-r--r--src/buildstream/_scheduler/jobs/job.py8
-rw-r--r--src/buildstream/_scheduler/scheduler.py9
2 files changed, 14 insertions, 3 deletions
diff --git a/src/buildstream/_scheduler/jobs/job.py b/src/buildstream/_scheduler/jobs/job.py
index 823404b59..23aa51e58 100644
--- a/src/buildstream/_scheduler/jobs/job.py
+++ b/src/buildstream/_scheduler/jobs/job.py
@@ -156,6 +156,8 @@ class Job:
#
def start(self):
+ assert not self._terminated, "Attempted to start process which was already terminated"
+
self._pipe_r, pipe_w = multiprocessing.Pipe(duplex=False)
self._tries += 1
@@ -214,7 +216,8 @@ class Job:
self._parent_stop_listening()
# Terminate the process using multiprocessing API pathway
- self._process.terminate()
+ if self._process:
+ self._process.terminate()
self._terminated = True
@@ -235,7 +238,8 @@ class Job:
def kill(self):
# Force kill
self.message(MessageType.WARN, "{} did not terminate gracefully, killing".format(self.action_name))
- utils._kill_process_tree(self._process.pid)
+ if self._process:
+ utils._kill_process_tree(self._process.pid)
# suspend()
#
diff --git a/src/buildstream/_scheduler/scheduler.py b/src/buildstream/_scheduler/scheduler.py
index 0bcbb7182..ae510c2e0 100644
--- a/src/buildstream/_scheduler/scheduler.py
+++ b/src/buildstream/_scheduler/scheduler.py
@@ -375,7 +375,15 @@ class Scheduler:
# job (Job): The job to start
#
def _start_job(self, job):
+
+ # From the scheduler perspective, the following
+ # is considered atomic; started jobs are always in the
+ # active_jobs list, and jobs in the active_jobs list
+ # are always started.
+ #
self._active_jobs.append(job)
+ job.start()
+
notification = Notification(
NotificationType.JOB_START,
full_name=job.name,
@@ -383,7 +391,6 @@ class Scheduler:
time=self._state.elapsed_time(start_time=self._starttime),
)
self._notify(notification)
- job.start()
# _sched_queue_jobs()
#