Merge branch 'liw/anti-exterminate'

Reviewed by: Richard Ipsum, Sam Thursfield
author: Richard Ipsum <richard.ipsum@codethink.co.uk> 2014-06-23 17:03:08 +0000
committer: Richard Ipsum <richard.ipsum@codethink.co.uk> 2014-06-23 17:03:08 +0000
commit: ad4e0b397816a5526ab6365b7d47a0c8785a7614 (patch)
tree: c9051647615b02a9279aafecda5c998c581d6a9e
parent: 376c03c53bc86b88f14ef46c036795eb52186805 (diff)
parent: bd214214053de9cf146e75f7ee4ad68c3d4a959c (diff)
download: lorry-controller-ad4e0b397816a5526ab6365b7d47a0c8785a7614.tar.gz
4 files changed, 63 insertions, 22 deletions
diff --git a/lorrycontroller/jobupdate.py b/lorrycontroller/jobupdate.py
index 24a3c4a..3bd0e81 100644
--- a/lorrycontroller/jobupdate.py
+++ b/lorrycontroller/jobupdate.py
@@ -57,9 +57,9 @@ class JobUpdate(lorrycontroller.LorryControllerRoute):
                 logging.warning(
                     'Job %r has been running too long, '
                     'marking it to be exterminated', job_id)
-                statedb.set_kill_job(path, True)
+                statedb.set_kill_job(job_id, True)
 
-            obj = statedb.get_lorry_info(path)
+            obj = statedb.get_job_info(job_id)
             logging.debug('obj=%r', obj)
             return obj
 
diff --git a/lorrycontroller/statedb.py b/lorrycontroller/statedb.py
index 8316c9a..2d223e0 100644
--- a/lorrycontroller/statedb.py
+++ b/lorrycontroller/statedb.py
@@ -61,14 +61,12 @@ class StateDB(object):
             ('from_trovehost', 'TEXT'),
             ('from_path', 'TEXT'),
             ('running_job', 'INT'),
-            ('kill_job', 'INT'),
             ('last_run', 'INT'),
             ('interval', 'INT'),
             ('lorry_timeout', 'INT'),
             ('disk_usage', 'INT'),
             ]
         self.lorries_booleans = [
-            'kill_job',
             ]
 
         if self._conn is None:
@@ -131,6 +129,7 @@ class StateDB(object):
             'pid INT, '
             'started INT, '
             'ended INT, '
+            'kill INT, '
             'path TEXT, '
             'exit TEXT, '
             'disk_usage INT, '
@@ -356,10 +355,10 @@ class StateDB(object):
             c.execute(
                 'INSERT INTO lorries '
                 '(path, text, from_trovehost, from_path, last_run, interval, '
-                'lorry_timeout, running_job, kill_job) '
-                'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
+                'lorry_timeout, running_job) '
+                'VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
                 (path, text, from_trovehost, from_path, 0,
-                 interval, timeout, None, 0))
+                 interval, timeout, None))
         else:
             c = self.get_cursor()
             c.execute(
@@ -407,8 +406,8 @@ class StateDB(object):
             'SELECT running_job FROM lorries WHERE running_job IS NOT NULL')
         return [row[0] for row in c.fetchall()]
 
-    def set_kill_job(self, path, value):
-        logging.debug('StateDB.set_kill_job(%r, %r) called', path, value)
+    def set_kill_job(self, job_id, value):
+        logging.debug('StateDB.set_kill_job(%r, %r) called', job_id, value)
         assert self.in_transaction
         if value:
             value = 1
@@ -416,8 +415,8 @@ class StateDB(object):
             value = 0
         c = self.get_cursor()
         c.execute(
-            'UPDATE lorries SET kill_job=? WHERE path=?',
-            (value, path))
+            'UPDATE jobs SET kill=? WHERE job_id=?',
+            (value, job_id))
 
     def set_lorry_last_run(self, path, last_run):
         logging.debug(
@@ -452,6 +451,26 @@ class StateDB(object):
         c.execute('SELECT job_id FROM jobs')
         return [row[0] for row in c.fetchall()]
 
+    def get_job_info(self, job_id):
+        c = self.get_cursor()
+        c.execute(
+            'SELECT job_id, host, pid, started, ended, kill, path, exit, '
+            'disk_usage, output FROM jobs WHERE job_id=?',
+            (job_id,))
+        row = c.fetchone()
+        return {
+            'job_id': row[0],
+            'host': row[1],
+            'pid': row[2],
+            'started': row[3],
+            'ended': row[4],
+            'kill': row[5],
+            'path': row[6],
+            'exit': row[7],
+            'disk_usage': row[8],
+            'output': row[9],
+            }
+
     def add_new_job(self, job_id, host, pid, path, started):
         logging.debug(
             'StateDB.add_new_job(%r, %r, %r, %r, %r) called',
@@ -459,9 +478,9 @@ class StateDB(object):
         assert self.in_transaction
         c = self.get_cursor()
         c.execute(
-            'INSERT INTO jobs (job_id, host, pid, path, started) '
-            'VALUES (?, ?, ?, ?, ?)',
-            (job_id, host, pid, path, started))
+            'INSERT INTO jobs (job_id, host, pid, path, started, kill) '
+            'VALUES (?, ?, ?, ?, ?, ?)',
+            (job_id, host, pid, path, started, 0))
 
     def get_job_minion_host(self, job_id):
         c = self.get_cursor()
diff --git a/lorrycontroller/stopjob.py b/lorrycontroller/stopjob.py
index 947f733..f2ead87 100644
--- a/lorrycontroller/stopjob.py
+++ b/lorrycontroller/stopjob.py
@@ -28,14 +28,14 @@ class StopJob(lorrycontroller.LorryControllerRoute):
 
     def run(self, **kwargs):
         logging.info('%s %s called', self.http_method, self.path)
+        job_id = bottle.request.forms.job_id
         statedb = self.open_statedb()
         with statedb:
-            job_id = bottle.request.forms.job_id
             try:
                 path = statedb.find_lorry_running_job(job_id)
             except lorrycontroller.WrongNumberLorriesRunningJob:
                 logging.warning(
                     "Tried to kill job %s which isn't running" % job_id)
                 bottle.abort(409, 'Job is not currently running')
-            statedb.set_kill_job(path, True)
-        return statedb.get_lorry_info(path)
+            statedb.set_kill_job(job_id, True)
+        return statedb.get_job_info(job_id)
diff --git a/yarns.webapp/040-running-jobs.yarn b/yarns.webapp/040-running-jobs.yarn
index 11ec557..879d9fa 100644
--- a/yarns.webapp/040-running-jobs.yarn
+++ b/yarns.webapp/040-running-jobs.yarn
@@ -58,7 +58,7 @@ Requesting another job should now again return null.
 Inform WEBAPP the job is finished.
 
     WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=0
-    THEN response has kill_job set to false
+    THEN response has kill set to false
     WHEN admin makes request GET /1.0/lorry/upstream/foo
     THEN response has running_job set to null
     WHEN admin makes request GET /1.0/list-running-jobs
@@ -140,14 +140,13 @@ Admin will now ask WEBAPP to kill the job. This changes sets a field
 in the STATEDB only.
 
     WHEN admin makes request POST /1.0/stop-job with job_id=1
-    AND admin makes request GET /1.0/lorry/upstream/foo
-    THEN response has kill_job set to true
+    THEN response has kill set to true
 
 Now, when MINION updates the job, WEBAPP will tell it to kill it.
 MINION will do so, and then update the job again.
 
     WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=no
-    THEN response has kill_job set to true
+    THEN response has kill set to true
     WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=1
 
 Admin will now see that the job has, indeed, been killed.
@@ -158,6 +157,16 @@ Admin will now see that the job has, indeed, been killed.
     WHEN admin makes request GET /1.0/list-running-jobs
     THEN response has running_jobs set to []
 
+Check that the job can be run successfully again. In 2014, we found a bug
+where a lorry that had ever been set to be killed would never again
+run successfully.
+
+    WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123
+    THEN response has job_id set to 2
+    AND response has path set to "upstream/foo"
+    WHEN MINION makes request POST /1.0/job-update with job_id=2&exit=no
+    THEN response has kill set to false
+
 Cleanup.
 
     FINALLY WEBAPP terminates
@@ -209,7 +218,20 @@ Pretend to be a MINION that reports an update on the job. WEBAPP
 should now be telling us to kill the job.
 
     WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=no
-    THEN response has kill_job set to true
+    THEN response has kill set to true
+
+Kill the job, as requested.
+
+    WHEN MINION makes request POST /1.0/job-update with job_id=1&exit=1
+
+Verify we can run the job successfully after it has been killed once
+by timeout. In 2014 we had a bug where this would not happen, because
+a lorry that had ever been killed would never run successfully again.
+
+    WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123
+    THEN response has job_id set to 2
+    WHEN MINION makes request POST /1.0/job-update with job_id=2&exit=no
+    THEN response has kill set to false
 
 Cleanup.
author	Richard Ipsum <richard.ipsum@codethink.co.uk>	2014-06-23 17:03:08 +0000
committer	Richard Ipsum <richard.ipsum@codethink.co.uk>	2014-06-23 17:03:08 +0000
commit	ad4e0b397816a5526ab6365b7d47a0c8785a7614 (patch)
tree	c9051647615b02a9279aafecda5c998c581d6a9e
parent	376c03c53bc86b88f14ef46c036795eb52186805 (diff)
parent	bd214214053de9cf146e75f7ee4ad68c3d4a959c (diff)
download	lorry-controller-ad4e0b397816a5526ab6365b7d47a0c8785a7614.tar.gz