summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars Wirzenius <lars.wirzenius@codethink.co.uk>2014-09-08 15:50:52 +0000
committerLars Wirzenius <lars.wirzenius@codethink.co.uk>2014-09-08 15:50:52 +0000
commit4a6d0ef584a3d87c1d6ad237336660aacd161650 (patch)
tree71ac6cc3557d6cf340f1dbeec32ce429794ec096
parent0ef176c196db439e05026705450f691e678cdccd (diff)
parent2ce8d016add6dd279c3903fea26645a7499ec50a (diff)
downloadlorry-controller-4a6d0ef584a3d87c1d6ad237336660aacd161650.tar.gz
Merge branch 'baserock/liw/de-ghost'
-rw-r--r--ARCH5
-rwxr-xr-xlorry-controller-webapp19
-rw-r--r--lorrycontroller/__init__.py1
-rw-r--r--lorrycontroller/jobupdate.py4
-rw-r--r--lorrycontroller/removeghostjobs.py65
-rw-r--r--lorrycontroller/statedb.py41
-rw-r--r--units/lorry-controller-remove-ghost-jobs.service12
-rw-r--r--units/lorry-controller-remove-ghost-jobs.timer9
-rw-r--r--yarns.webapp/040-running-jobs.yarn79
9 files changed, 224 insertions, 11 deletions
diff --git a/ARCH b/ARCH
index d2d81ad..271b2bc 100644
--- a/ARCH
+++ b/ARCH
@@ -275,6 +275,11 @@ Requests for admins:
of all jobs, running or finished, that it knows about. (RQ/ALLJOBS)
* `POST /1.0/remove-job` with `job_id=jobid` in the body, removes a
stopped job from the state database.
+* `POST /1.0/remove-ghost-jobs` looks for any running jobs in STATEDB
+ that haven't been updated (with `job-update`, see below) in a long
+ time (see `--ghost-timeout`), and marks them as terminated. This is
+ used to catch situations when a MINION fails to tell the WEBAPP that
+ a job has terminated.
Requests for MINION:
diff --git a/lorry-controller-webapp b/lorry-controller-webapp
index 9234498..faabb2d 100755
--- a/lorry-controller-webapp
+++ b/lorry-controller-webapp
@@ -28,6 +28,9 @@ from flup.server.fcgi import WSGIServer
import lorrycontroller
+ONE_MINUTE = 60
+
+
class WEBAPP(cliapp.Application):
def add_settings(self):
@@ -110,6 +113,22 @@ class WEBAPP(cliapp.Application):
metavar='DIR',
default='/usr/share/lorry-controller/static')
+ # The default value of ten minutes for the ghost-timeout
+ # setting was chosen arbitrarily, by Lars Wirzenius. The value
+ # needs to be long enough that there's no realistic danger of
+ # hitting it just because a host is a bit overloaded, but
+ # still short enough that ghost jobs do get removed often
+ # enough, especially right after boot, when all jobs are
+ # ghosts. Experience may show that a different value would
+ # actually be better, and if so, the code and this comment
+ # should be changed accordingly.
+ self.settings.integer(
+ ['ghost-timeout'],
+ 'running jobs should get an update from their '
+ 'MINION within this time or they will be considered '
+ 'ghosts and be removed from STATEDB (in seconds)',
+ default=10*ONE_MINUTE)
+
def find_routes(self):
'''Return all classes that are API routes.
diff --git a/lorrycontroller/__init__.py b/lorrycontroller/__init__.py
index bc51b88..a65ff02 100644
--- a/lorrycontroller/__init__.py
+++ b/lorrycontroller/__init__.py
@@ -32,6 +32,7 @@ from movetopbottom import MoveToTop, MoveToBottom
from stopjob import StopJob
from listjobs import ListAllJobs, ListAllJobsHTML
from showjob import ShowJob, ShowJobHTML, JobShower
+from removeghostjobs import RemoveGhostJobs
from removejob import RemoveJob
from lstroves import LsTroves, ForceLsTrove
from pretendtime import PretendTime
diff --git a/lorrycontroller/jobupdate.py b/lorrycontroller/jobupdate.py
index 3bd0e81..efc9ce1 100644
--- a/lorrycontroller/jobupdate.py
+++ b/lorrycontroller/jobupdate.py
@@ -44,11 +44,13 @@ class JobUpdate(lorrycontroller.LorryControllerRoute):
if stderr:
statedb.append_to_job_output(job_id, stderr)
+ now = statedb.get_current_time()
+ statedb.set_job_updated(job_id, now)
+
path = statedb.find_lorry_running_job(job_id)
lorry_info = statedb.get_lorry_info(path)
if exit is not None and exit != 'no':
- now = statedb.get_current_time()
statedb.set_lorry_last_run(path, int(now))
statedb.set_running_job(path, None)
statedb.set_job_exit(job_id, exit, int(now), disk_usage)
diff --git a/lorrycontroller/removeghostjobs.py b/lorrycontroller/removeghostjobs.py
new file mode 100644
index 0000000..2b2760c
--- /dev/null
+++ b/lorrycontroller/removeghostjobs.py
@@ -0,0 +1,65 @@
+# Copyright (C) 2014 Codethink Limited
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+
+import logging
+import time
+
+import bottle
+
+import lorrycontroller
+
+
+class RemoveGhostJobs(lorrycontroller.LorryControllerRoute):
+
+ http_method = 'POST'
+ path = '/1.0/remove-ghost-jobs'
+
+ def run(self, **kwargs):
+ logging.info('%s %s called', self.http_method, self.path)
+
+ ghost_timeout = self.app_settings['ghost-timeout']
+ ghosts = []
+ with self.open_statedb() as statedb:
+ for job_id in statedb.get_running_jobs():
+ if self.is_ghost_job(statedb, job_id, ghost_timeout):
+ self.exorcise_ghost_job(statedb, job_id)
+ ghosts.append(statedb.get_job_info(job_id))
+ return {
+ 'killed-ghost-jobs': ghosts,
+ }
+
+ def is_ghost_job(self, statedb, job_id, ghost_timeout):
+ updated = statedb.get_job_updated(job_id)
+ return self.now(statedb) - updated >= ghost_timeout
+
+ def now(self, statedb):
+ return statedb.get_current_time()
+
+ def exorcise_ghost_job(self, statedb, job_id):
+ logging.info('Job %s is a ghost job', job_id)
+ self.mark_job_to_be_killed_in_case_minion_appears(statedb, job_id)
+ self.mark_job_as_terminated(statedb, job_id)
+
+ def mark_job_to_be_killed_in_case_minion_appears(self, statedb, job_id):
+ statedb.set_kill_job(job_id, True)
+
+ def mark_job_as_terminated(self, statedb, job_id):
+ statedb.append_to_job_output(
+ job_id, '\nTERMINATED DUE TO GHOST TIMEOUT\n')
+ statedb.set_job_exit(job_id, 127, self.now(statedb), -1)
+
+ job_info = statedb.get_job_info(job_id)
+ statedb.set_running_job(job_info['path'], None)
diff --git a/lorrycontroller/statedb.py b/lorrycontroller/statedb.py
index 2d223e0..fd7857d 100644
--- a/lorrycontroller/statedb.py
+++ b/lorrycontroller/statedb.py
@@ -129,6 +129,7 @@ class StateDB(object):
'pid INT, '
'started INT, '
'ended INT, '
+ 'updated INT, '
'kill INT, '
'path TEXT, '
'exit TEXT, '
@@ -454,8 +455,8 @@ class StateDB(object):
def get_job_info(self, job_id):
c = self.get_cursor()
c.execute(
- 'SELECT job_id, host, pid, started, ended, kill, path, exit, '
- 'disk_usage, output FROM jobs WHERE job_id=?',
+ 'SELECT job_id, host, pid, started, ended, updated, kill, '
+ 'path, exit, disk_usage, output FROM jobs WHERE job_id=?',
(job_id,))
row = c.fetchone()
return {
@@ -464,11 +465,12 @@ class StateDB(object):
'pid': row[2],
'started': row[3],
'ended': row[4],
- 'kill': row[5],
- 'path': row[6],
- 'exit': row[7],
- 'disk_usage': row[8],
- 'output': row[9],
+ 'updated': row[5],
+ 'kill': row[6],
+ 'path': row[7],
+ 'exit': row[8],
+ 'disk_usage': row[9],
+ 'output': row[10],
}
def add_new_job(self, job_id, host, pid, path, started):
@@ -478,9 +480,10 @@ class StateDB(object):
assert self.in_transaction
c = self.get_cursor()
c.execute(
- 'INSERT INTO jobs (job_id, host, pid, path, started, kill) '
- 'VALUES (?, ?, ?, ?, ?, ?)',
- (job_id, host, pid, path, started, 0))
+ 'INSERT INTO jobs (job_id, host, pid, path, started, '
+ 'updated, kill) '
+ 'VALUES (?, ?, ?, ?, ?, ?, ?)',
+ (job_id, host, pid, path, started, started, 0))
def get_job_minion_host(self, job_id):
c = self.get_cursor()
@@ -514,6 +517,24 @@ class StateDB(object):
row = c.fetchone()
return row[0], row[1]
+ def get_job_updated(self, job_id):
+ c = self.get_cursor()
+ c.execute(
+ 'SELECT updated FROM jobs WHERE job_id IS ?',
+ (job_id,))
+ row = c.fetchone()
+ return row[0]
+
+ def set_job_updated(self, job_id, updated):
+ logging.debug(
+ 'StateDB.set_job_updated(%r, %r) called',
+ job_id, updated)
+ assert self.in_transaction
+ c = self.get_cursor()
+ c.execute(
+ 'UPDATE jobs SET updated=? WHERE job_id IS ?',
+ (updated, job_id))
+
def get_job_exit(self, job_id):
c = self.get_cursor()
c.execute(
diff --git a/units/lorry-controller-remove-ghost-jobs.service b/units/lorry-controller-remove-ghost-jobs.service
new file mode 100644
index 0000000..e28a494
--- /dev/null
+++ b/units/lorry-controller-remove-ghost-jobs.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=Lorry Controller remove ghost jobs
+After=lighttpd-lorry-controller-webapp.service
+
+[Install]
+WantedBy=multi-user.target
+
+[Service]
+ExecStart=/usr/bin/curl -o /dev/null -X POST -d '' http://localhost:12765/1.0/remove-ghost-jobs
+Restart=no
+User=lorry
+Group=lorry
diff --git a/units/lorry-controller-remove-ghost-jobs.timer b/units/lorry-controller-remove-ghost-jobs.timer
new file mode 100644
index 0000000..61ebaba
--- /dev/null
+++ b/units/lorry-controller-remove-ghost-jobs.timer
@@ -0,0 +1,9 @@
+[Unit]
+Description=Lorry Controller remove ghost jobs
+After=lighttpd-lorry-controller-webapp.service
+
+[Install]
+WantedBy=multi-user.target
+
+[Timer]
+OnUnitInactiveSec=60
diff --git a/yarns.webapp/040-running-jobs.yarn b/yarns.webapp/040-running-jobs.yarn
index 879d9fa..571afd6 100644
--- a/yarns.webapp/040-running-jobs.yarn
+++ b/yarns.webapp/040-running-jobs.yarn
@@ -237,6 +237,85 @@ Cleanup.
FINALLY WEBAPP terminates
+
+Forget jobs whose MINION is gone
+--------------------------------
+
+A job's status is updated when a MINION uses the `/1.0/job-update`
+call, and when the MINION uses that to report that the job has
+finished, the STATEDB is updated accordingly. However, sometimes the
+MINION never tells WEBAPP that the job is finished. This can happen
+for a variety of reasons, such as (not limited to these):
+
+* MINION crashes.
+* WEBAPP is unavailable.
+* The host reboots, killing MINION and WEBAPP both.
+
+If this happens, STATEDB still marks the job as running, and WEBAPP
+won't start a new job for that lorry specification.
+
+To deal with these, we need to have a way to clean up "ghost jobs"
+like these. We do this with the `/1.0/remove-ghost-jobs` API call,
+which marks all jobs finished that haven't had a `job-update` called
+on them for a long time.
+
+ SCENARIO forget jobs without MINION updates in a long time
+
+Set up a WEBAPP that uses a CONFGIT with a Lorry file, so we can start
+a job.
+
+ GIVEN a new git repository in CONFGIT
+ AND an empty lorry-controller.conf in CONFGIT
+ AND lorry-controller.conf in CONFGIT adds lorries *.lorry using prefix upstream
+ AND Lorry file CONFGIT/foo.lorry with {"foo":{"type":"git","url":"git://foo"}}
+ AND WEBAPP uses CONFGIT as its configuration directory
+ AND a running WEBAPP
+
+Pretend it is a known time (specifically, the beginning of the epoch).
+This is needed so we can trigger the ghost job timeout later.
+
+ WHEN admin makes request POST /1.0/pretend-time with now=0
+
+Tell WEBAPP to read the configuration.
+
+ WHEN admin makes request POST /1.0/read-configuration
+
+Start a new job.
+
+ WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123
+ THEN response has job_id set to 1
+
+Verify that the job is in the list of running jobs.
+
+ WHEN admin makes request GET /1.0/list-running-jobs
+ THEN response has running_jobs set to [1]
+
+Remove any ghosts. There aren't any yet, so nothing should be removed.
+
+ WHEN admin makes request POST /1.0/remove-ghost-jobs
+ AND admin makes request GET /1.0/list-running-jobs
+ THEN response has running_jobs set to [1]
+
+Now, pretend a long time has passed, and clean up the ghost job. The
+default value for the ghost timeout is reasonably short (less than a
+day), so we pretend it is about 11.5 days later (one million seconds).
+
+ WHEN admin makes request POST /1.0/pretend-time with now=1000000
+ AND admin makes request POST /1.0/remove-ghost-jobs
+ AND admin makes request GET /1.0/list-running-jobs
+ THEN response has running_jobs set to []
+
+Further, if we request for a new job now, we'll get one for the same
+lorry specification.
+
+ WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123
+ THEN response has job_id set to 2
+ AND response has path set to "upstream/foo"
+
+Finally, clean up.
+
+ FINALLY WEBAPP terminates
+
Remove a terminated job
-----------------------