summary refs log tree commit diff
diff options
context:
space:
mode:
author Richard Ipsum <richard.ipsum@codethink.co.uk> 2014-06-23 17:03:08 +0000
committer Richard Ipsum <richard.ipsum@codethink.co.uk> 2014-06-23 17:03:08 +0000
commit ad4e0b397816a5526ab6365b7d47a0c8785a7614 (patch)
tree c9051647615b02a9279aafecda5c998c581d6a9e
parent 376c03c53bc86b88f14ef46c036795eb52186805 (diff)
parent bd214214053de9cf146e75f7ee4ad68c3d4a959c (diff)
download lorry-controller-ad4e0b397816a5526ab6365b7d47a0c8785a7614.tar.gz
Merge branch 'liw/anti-exterminate'
Reviewed by: Richard Ipsum, Sam Thursfield
-rw-r--r-- lorrycontroller/jobupdate.py 4
-rw-r--r-- lorrycontroller/statedb.py 43
-rw-r--r-- lorrycontroller/stopjob.py 6
-rw-r--r-- yarns.webapp/040-running-jobs.yarn 32
4 files changed, 63 insertions, 22 deletions
diff --git a/lorrycontroller/jobupdate.py b/lorrycontroller/jobupdate.py
index 24a3c4a..3bd0e81 100644
--- a/lorrycontroller/jobupdate.py
+++ b/lorrycontroller/jobupdate.py
@@ -57,9 +57,9 @@ class JobUpdate(lorrycontroller.LorryControllerRoute):
logging.warning(
'Job %r has been running too long, '
'marking it to be exterminated', job_id)
- statedb.set_kill_job(path, True)
+ statedb.set_kill_job(job_id, True)
- obj = statedb.get_lorry_info(path)
+ obj = statedb.get_job_info(job_id)
logging.debug('obj=%r', obj)
return obj
diff --git a/lorrycontroller/statedb.py b/lorrycontroller/statedb.py
index 8316c9a..2d223e0 100644
--- a/lorrycontroller/statedb.py
+++ b/lorrycontroller/statedb.py
@@ -61,14 +61,12 @@ class StateDB(object):
('from_trovehost', 'TEXT'),
('from_path', 'TEXT'),
('running_job', 'INT'),
- ('kill_job', 'INT'),
('last_run', 'INT'),
('interval', 'INT'),
('lorry_timeout', 'INT'),
('disk_usage', 'INT'),
]
self.lorries_booleans = [
- 'kill_job',
]
if self._conn is None:
@@ -131,6 +129,7 @@ class StateDB(object):
'pid INT, '
'started INT, '
'ended INT, '
+ 'kill INT, '
'path TEXT, '
'exit TEXT, '
'disk_usage INT, '
@@ -356,10 +355,10 @@ class StateDB(object):
c.execute(
'INSERT INTO lorries '
'(path, text, from_trovehost, from_path, last_run, interval, '
- 'lorry_timeout, running_job, kill_job) '
- 'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
+ 'lorry_timeout, running_job) '
+ 'VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
(path, text, from_trovehost, from_path, 0,
- interval, timeout, None, 0))
+ interval, timeout, None))
else:
c = self.get_cursor()
c.execute(
@@ -407,8 +406,8 @@ class StateDB(object):
'SELECT running_job FROM lorries WHERE running_job IS NOT NULL')
return [row[0] for row in c.fetchall()]
- def set_kill_job(self, path, value):
- logging.debug('StateDB.set_kill_job(%r, %r) called', path, value)
+ def set_kill_job(self, job_id, value):
+ logging.debug('StateDB.set_kill_job(%r, %r) called', job_id, value)
assert self.in_transaction
if value:
value = 1
@@ -416,8 +415,8 @@ class StateDB(object):
value = 0
c = self.get_cursor()
c.execute(
- 'UPDATE lorries SET kill_job=? WHERE path=?',
- (value, path))
+ 'UPDATE jobs SET kill=? WHERE job_id=?',
+ (value, job_id))
def set_lorry_last_run(self, path, last_run):
logging.debug(
@@ -452,6 +451,26 @@ class StateDB(object):
c.execute('SELECT job_id FROM jobs')
return [row[0] for row in c.fetchall()]
+ def get_job_info(self, job_id):
+ c = self.get_cursor()
+ c.execute(
+ 'SELECT job_id, host, pid, started, ended, kill, path, exit, '
+ 'disk_usage, output FROM jobs WHERE job_id=?',
+ (job_id,))
+ row = c.fetchone()
+ return {
+ 'job_id': row[0],
+ 'host': row[1],
+ 'pid': row[2],
+ 'started': row[3],
+ 'ended': row[4],
+ 'kill': row[5],
+ 'path': row[6],
+ 'exit': row[7],
+ 'disk_usage': row[8],
+ 'output': row[9],
+ }
+
def add_new_job(self, job_id, host, pid, path, started):
logging.debug(
'StateDB.add_new_job(%r, %r, %r, %r, %r) called',
@@ -459,9 +478,9 @@ class StateDB(object):
assert self.in_transaction
c = self.get_cursor()
c.execute(
- 'INSERT INTO jobs (job_id, host, pid, path, started) '
- 'VALUES (?, ?, ?, ?, ?)',
- (job_id, host, pid, path, started))
+ 'INSERT INTO jobs (job_id, host, pid, path, started, kill) '
+ 'VALUES (?, ?, ?, ?, ?, ?)',
+ (job_id, host, pid, path, started, 0))
def get_job_minion_host(self, job_id):
c = self.get_cursor()
diff --git a/lorrycontroller/stopjob.py b/lorrycontroller/stopjob.py
index 947f733..f2ead87 100644
--- a/lorrycontroller/stopjob.py
+++ b/lorrycontroller/stopjob.py
@@ -28,14 +28,14 @@ class StopJob(lorrycontroller.LorryControllerRoute):
def run(self, **kwargs):
logging.info('%s %s called', self.http_method, self.path)
+ job_id = bottle.request.forms.job_id
statedb = self.open_statedb()
with statedb:
- job_id = bottle.request.forms.job_id
try:
path = statedb.find_lorry_running_job(job_id)
except lorrycontroller.WrongNumberLorriesRunningJob:
logging.warning(
"Tried to kill job %s which isn't running" % job_id)
bottle.abort(409, 'Job is not currently running')
- statedb.set_kill_job(path, True)
- return statedb.get_lorry_info(path)
+ statedb.set_kill_job(job_id, True)
+ return statedb.get_job_info(job_id)
diff --git a/yarns.webapp/040-running-jobs.yarn b/yarns.webapp/040-running-jobs.yarn
index 11ec557..879d9fa 100644
--- a/yarns.webapp/040-running-jobs.yarn
+++ b/yarns.webapp/040-running-jobs.yarn
@@ -58,7 +58,7 @@ Requesting another job should now again return null.
Inform WEBAPP the job is finished.
WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=0
- THEN response has kill_job set to false
+ THEN response has kill set to false
WHEN admin makes request GET /1.0/lorry/upstream/foo
THEN response has running_job set to null
WHEN admin makes request GET /1.0/list-running-jobs
@@ -140,14 +140,13 @@ Admin will now ask WEBAPP to kill the job. This change sets a field
in the STATEDB only.
WHEN admin makes request POST /1.0/stop-job with job_id=1
- AND admin makes request GET /1.0/lorry/upstream/foo
- THEN response has kill_job set to true
+ THEN response has kill set to true
Now, when MINION updates the job, WEBAPP will tell it to kill it.
MINION will do so, and then update the job again.
WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=no
- THEN response has kill_job set to true
+ THEN response has kill set to true
WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=1
Admin will now see that the job has, indeed, been killed.
@@ -158,6 +157,16 @@ Admin will now see that the job has, indeed, been killed.
WHEN admin makes request GET /1.0/list-running-jobs
THEN response has running_jobs set to []
+Check that the job can be run successfully again. In 2014, we found a bug
+where a lorry that had ever been marked to be killed would never
+successfully run again.
+
+ WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123
+ THEN response has job_id set to 2
+ AND response has path set to "upstream/foo"
+ WHEN MINION makes request POST /1.0/job-update with job_id=2&exit=no
+ THEN response has kill set to false
+
Cleanup.
FINALLY WEBAPP terminates
@@ -209,7 +218,20 @@ Pretend to be a MINION that reports an update on the job. WEBAPP
should now be telling us to kill the job.
WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=no
- THEN response has kill_job set to true
+ THEN response has kill set to true
+
+Kill the job, as requested.
+
+ WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=1
+
+Verify we can run the job successfully after it has been killed once
+by timeout. In 2014 we had a bug where this would not happen, because
+a lorry that had ever been killed would never run successfully again.
+
+ WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123
+ THEN response has job_id set to 2
+ WHEN MINION makes request POST /1.0/job-update with job_id=2&exit=no
+ THEN response has kill set to false
Cleanup.