Merge branch 'baserock/liw/de-ghost'

author: Lars Wirzenius <lars.wirzenius@codethink.co.uk> 2014-09-08 15:50:52 +0000
committer: Lars Wirzenius <lars.wirzenius@codethink.co.uk> 2014-09-08 15:50:52 +0000
commit: 4a6d0ef584a3d87c1d6ad237336660aacd161650 (patch)
tree: 71ac6cc3557d6cf340f1dbeec32ce429794ec096
parent: 0ef176c196db439e05026705450f691e678cdccd (diff)
parent: 2ce8d016add6dd279c3903fea26645a7499ec50a (diff)
download: lorry-controller-4a6d0ef584a3d87c1d6ad237336660aacd161650.tar.gz
9 files changed, 224 insertions, 11 deletions
diff --git a/ARCH b/ARCH
index d2d81ad..271b2bc 100644
--- a/ARCH
+++ b/ARCH
@@ -275,6 +275,11 @@ Requests for admins:
   of all jobs, running or finished, that it knows about. (RQ/ALLJOBS)
 * `POST /1.0/remove-job` with `job_id=jobid` in the body, removes a
   stopped job from the state database.
+* `POST /1.0/remove-ghost-jobs` looks for any running jobs in STATEDB
+  that haven't been updated (with `job-update`, see below) in a long
+  time (see `--ghost-timeout`), and marks them as terminated. This is
+  used to catch situations when a MINION fails to tell the WEBAPP that
+  a job has terminated.
 
 Requests for MINION:
 
diff --git a/lorry-controller-webapp b/lorry-controller-webapp
index 9234498..faabb2d 100755
--- a/lorry-controller-webapp
+++ b/lorry-controller-webapp
@@ -28,6 +28,9 @@ from flup.server.fcgi import WSGIServer
 import lorrycontroller
 
 
+ONE_MINUTE = 60
+
+
 class WEBAPP(cliapp.Application):
 
     def add_settings(self):
@@ -110,6 +113,22 @@ class WEBAPP(cliapp.Application):
             metavar='DIR',
             default='/usr/share/lorry-controller/static')
 
+        # The default value of ten minutes for the ghost-timeout
+        # setting was chosen arbitrarily, by Lars Wirzenius. The value
+        # needs to be long enough that there's no realistic danger of
+        # hitting it just because a host is a bit overloaded, but
+        # still short enough that ghost jobs do get removed often
+        # enough, especially right after boot, when all jobs are
+        # ghosts. Experience may show that a different value would
+        # actually be better, and if so, the code and this comment
+        # should be changed accordingly.
+        self.settings.integer(
+            ['ghost-timeout'],
+            'running jobs should get an update from their '
+            'MINION within this time or they will be considered '
+            'ghosts and be removed from STATEDB (in seconds)',
+            default=10*ONE_MINUTE)
+
     def find_routes(self):
         '''Return all classes that are API routes.
 
diff --git a/lorrycontroller/__init__.py b/lorrycontroller/__init__.py
index bc51b88..a65ff02 100644
--- a/lorrycontroller/__init__.py
+++ b/lorrycontroller/__init__.py
@@ -32,6 +32,7 @@ from movetopbottom import MoveToTop, MoveToBottom
 from stopjob import StopJob
 from listjobs import ListAllJobs, ListAllJobsHTML
 from showjob import ShowJob, ShowJobHTML, JobShower
+from removeghostjobs import RemoveGhostJobs
 from removejob import RemoveJob
 from lstroves import LsTroves, ForceLsTrove
 from pretendtime import PretendTime
diff --git a/lorrycontroller/jobupdate.py b/lorrycontroller/jobupdate.py
index 3bd0e81..efc9ce1 100644
--- a/lorrycontroller/jobupdate.py
+++ b/lorrycontroller/jobupdate.py
@@ -44,11 +44,13 @@ class JobUpdate(lorrycontroller.LorryControllerRoute):
             if stderr:
                 statedb.append_to_job_output(job_id, stderr)
 
+            now = statedb.get_current_time()
+            statedb.set_job_updated(job_id, now)
+
             path = statedb.find_lorry_running_job(job_id)
             lorry_info = statedb.get_lorry_info(path)
 
             if exit is not None and exit != 'no':
-                now = statedb.get_current_time()
                 statedb.set_lorry_last_run(path, int(now))
                 statedb.set_running_job(path, None)
                 statedb.set_job_exit(job_id, exit, int(now), disk_usage)
diff --git a/lorrycontroller/removeghostjobs.py b/lorrycontroller/removeghostjobs.py
new file mode 100644
index 0000000..2b2760c
--- /dev/null
+++ b/lorrycontroller/removeghostjobs.py
@@ -0,0 +1,65 @@
+# Copyright (C) 2014  Codethink Limited
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+
+import logging
+import time
+
+import bottle
+
+import lorrycontroller
+
+
+class RemoveGhostJobs(lorrycontroller.LorryControllerRoute):
+
+    http_method = 'POST'
+    path = '/1.0/remove-ghost-jobs'
+
+    def run(self, **kwargs):
+        logging.info('%s %s called', self.http_method, self.path)
+
+        ghost_timeout = self.app_settings['ghost-timeout']
+        ghosts = []
+        with self.open_statedb() as statedb:
+            for job_id in statedb.get_running_jobs():
+                if self.is_ghost_job(statedb, job_id, ghost_timeout):
+                    self.exorcise_ghost_job(statedb, job_id)
+                    ghosts.append(statedb.get_job_info(job_id))
+        return {
+            'killed-ghost-jobs': ghosts,
+            }
+
+    def is_ghost_job(self, statedb, job_id, ghost_timeout):
+        updated = statedb.get_job_updated(job_id)
+        return self.now(statedb) - updated >= ghost_timeout
+
+    def now(self, statedb):
+        return statedb.get_current_time()
+
+    def exorcise_ghost_job(self, statedb, job_id):
+        logging.info('Job %s is a ghost job', job_id)
+        self.mark_job_to_be_killed_in_case_minion_appears(statedb, job_id)
+        self.mark_job_as_terminated(statedb, job_id)
+
+    def mark_job_to_be_killed_in_case_minion_appears(self, statedb, job_id):
+        statedb.set_kill_job(job_id, True)
+
+    def mark_job_as_terminated(self, statedb, job_id):
+        statedb.append_to_job_output(
+            job_id, '\nTERMINATED DUE TO GHOST TIMEOUT\n')
+        statedb.set_job_exit(job_id, 127, self.now(statedb), -1)
+
+        job_info = statedb.get_job_info(job_id)
+        statedb.set_running_job(job_info['path'], None)
diff --git a/lorrycontroller/statedb.py b/lorrycontroller/statedb.py
index 2d223e0..fd7857d 100644
--- a/lorrycontroller/statedb.py
+++ b/lorrycontroller/statedb.py
@@ -129,6 +129,7 @@ class StateDB(object):
             'pid INT, '
             'started INT, '
             'ended INT, '
+            'updated INT, '
             'kill INT, '
             'path TEXT, '
             'exit TEXT, '
@@ -454,8 +455,8 @@ class StateDB(object):
     def get_job_info(self, job_id):
         c = self.get_cursor()
         c.execute(
-            'SELECT job_id, host, pid, started, ended, kill, path, exit, '
-            'disk_usage, output FROM jobs WHERE job_id=?',
+            'SELECT job_id, host, pid, started, ended, updated, kill, '
+            'path, exit, disk_usage, output FROM jobs WHERE job_id=?',
             (job_id,))
         row = c.fetchone()
         return {
@@ -464,11 +465,12 @@ class StateDB(object):
             'pid': row[2],
             'started': row[3],
             'ended': row[4],
-            'kill': row[5],
-            'path': row[6],
-            'exit': row[7],
-            'disk_usage': row[8],
-            'output': row[9],
+            'updated': row[5],
+            'kill': row[6],
+            'path': row[7],
+            'exit': row[8],
+            'disk_usage': row[9],
+            'output': row[10],
             }
 
     def add_new_job(self, job_id, host, pid, path, started):
@@ -478,9 +480,10 @@ class StateDB(object):
         assert self.in_transaction
         c = self.get_cursor()
         c.execute(
-            'INSERT INTO jobs (job_id, host, pid, path, started, kill) '
-            'VALUES (?, ?, ?, ?, ?, ?)',
-            (job_id, host, pid, path, started, 0))
+            'INSERT INTO jobs (job_id, host, pid, path, started, '
+            'updated, kill) '
+            'VALUES (?, ?, ?, ?, ?, ?, ?)',
+            (job_id, host, pid, path, started, started, 0))
 
     def get_job_minion_host(self, job_id):
         c = self.get_cursor()
@@ -514,6 +517,24 @@ class StateDB(object):
         row = c.fetchone()
         return row[0], row[1]
 
+    def get_job_updated(self, job_id):
+        c = self.get_cursor()
+        c.execute(
+            'SELECT updated FROM jobs WHERE job_id IS ?',
+            (job_id,))
+        row = c.fetchone()
+        return row[0]
+
+    def set_job_updated(self, job_id, updated):
+        logging.debug(
+            'StateDB.set_job_updated(%r, %r) called',
+            job_id, updated)
+        assert self.in_transaction
+        c = self.get_cursor()
+        c.execute(
+            'UPDATE jobs SET updated=? WHERE job_id IS ?',
+            (updated, job_id))
+
     def get_job_exit(self, job_id):
         c = self.get_cursor()
         c.execute(
diff --git a/units/lorry-controller-remove-ghost-jobs.service b/units/lorry-controller-remove-ghost-jobs.service
new file mode 100644
index 0000000..e28a494
--- /dev/null
+++ b/units/lorry-controller-remove-ghost-jobs.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=Lorry Controller remove ghost jobs
+After=lighttpd-lorry-controller-webapp.service
+
+[Install]
+WantedBy=multi-user.target
+
+[Service]
+ExecStart=/usr/bin/curl -o /dev/null -X POST -d '' http://localhost:12765/1.0/remove-ghost-jobs
+Restart=no
+User=lorry
+Group=lorry
diff --git a/units/lorry-controller-remove-ghost-jobs.timer b/units/lorry-controller-remove-ghost-jobs.timer
new file mode 100644
index 0000000..61ebaba
--- /dev/null
+++ b/units/lorry-controller-remove-ghost-jobs.timer
@@ -0,0 +1,9 @@
+[Unit]
+Description=Lorry Controller remove ghost jobs
+After=lighttpd-lorry-controller-webapp.service
+
+[Install]
+WantedBy=multi-user.target
+
+[Timer]
+OnUnitInactiveSec=60
diff --git a/yarns.webapp/040-running-jobs.yarn b/yarns.webapp/040-running-jobs.yarn
index 879d9fa..571afd6 100644
--- a/yarns.webapp/040-running-jobs.yarn
+++ b/yarns.webapp/040-running-jobs.yarn
@@ -237,6 +237,85 @@ Cleanup.
 
     FINALLY WEBAPP terminates
 
+
+Forget jobs whose MINION is gone
+--------------------------------
+
+A job's status is updated when a MINION uses the `/1.0/job-update`
+call, and when the MINION uses that to report that the job has
+finished, the STATEDB is updated accordingly. However, sometimes the
+MINION never tells WEBAPP that the job is finished. This can happen
+for a variety of reasons, such as (not limited to these):
+
+* MINION crashes.
+* WEBAPP is unavailable.
+* The host reboots, killing MINION and WEBAPP both.
+
+If this happens, STATEDB still marks the job as running, and WEBAPP
+won't start a new job for that lorry specification.
+
+To deal with these, we need to have a way to clean up "ghost jobs"
+like these. We do this with the `/1.0/remove-ghost-jobs` API call,
+which marks all jobs finished that haven't had a `job-update` called
+on them for a long time.
+
+    SCENARIO forget jobs without MINION updates in a long time
+    
+Set up a WEBAPP that uses a CONFGIT with a Lorry file, so we can start
+a job.
+
+    GIVEN a new git repository in CONFGIT
+    AND an empty lorry-controller.conf in CONFGIT
+    AND lorry-controller.conf in CONFGIT adds lorries *.lorry using prefix upstream
+    AND Lorry file CONFGIT/foo.lorry with {"foo":{"type":"git","url":"git://foo"}}
+    AND WEBAPP uses CONFGIT as its configuration directory
+    AND a running WEBAPP
+
+Pretend it is a known time (specifically, the beginning of the epoch).
+This is needed so we can trigger the ghost job timeout later.
+
+    WHEN admin makes request POST /1.0/pretend-time with now=0
+
+Tell WEBAPP to read the configuration.
+
+    WHEN admin makes request POST /1.0/read-configuration
+
+Start a new job.
+
+    WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123
+    THEN response has job_id set to 1
+
+Verify that the job is in the list of running jobs.
+
+    WHEN admin makes request GET /1.0/list-running-jobs
+    THEN response has running_jobs set to [1]
+
+Remove any ghosts. There aren't any yet, so nothing should be removed.
+
+    WHEN admin makes request POST /1.0/remove-ghost-jobs
+    AND admin makes request GET /1.0/list-running-jobs
+    THEN response has running_jobs set to [1]
+
+Now, pretend a long time has passed, and clean up the ghost job. The
+default value for the ghost timeout is reasonably short (less than a
+day), so we pretend it is about 11.5 days later (one million seconds).
+
+    WHEN admin makes request POST /1.0/pretend-time with now=1000000
+    AND admin makes request POST /1.0/remove-ghost-jobs
+    AND admin makes request GET /1.0/list-running-jobs
+    THEN response has running_jobs set to []
+
+Further, if we request for a new job now, we'll get one for the same
+lorry specification.
+
+    WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123
+    THEN response has job_id set to 2
+    AND response has path set to "upstream/foo"
+
+Finally, clean up.
+
+    FINALLY WEBAPP terminates
+
 Remove a terminated job
 -----------------------
author	Lars Wirzenius <lars.wirzenius@codethink.co.uk>	2014-09-08 15:50:52 +0000
committer	Lars Wirzenius <lars.wirzenius@codethink.co.uk>	2014-09-08 15:50:52 +0000
commit	4a6d0ef584a3d87c1d6ad237336660aacd161650 (patch)
tree	71ac6cc3557d6cf340f1dbeec32ce429794ec096
parent	0ef176c196db439e05026705450f691e678cdccd (diff)
parent	2ce8d016add6dd279c3903fea26645a7499ec50a (diff)
download	lorry-controller-4a6d0ef584a3d87c1d6ad237336660aacd161650.tar.gz