summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSam Thursfield <sam.thursfield@codethink.co.uk>2015-06-11 16:35:11 +0100
committerSam Thursfield <sam.thursfield@codethink.co.uk>2015-06-11 16:35:37 +0100
commit26d7899ee16830b0ae5058fcd664fc69df514ff1 (patch)
tree9ad60dfbb21d3f01423c9a648ddeed9b65c1a68d
parent88d9e29c29f770e664ee785fdbf3ff0668fe1c35 (diff)
downloadmorph-cache-server-26d7899ee16830b0ae5058fcd664fc69df514ff1.tar.gz
WIP: make morph-cache-server useful for reproducible builds testing
-rw-r--r--README.rst31
-rwxr-xr-xmorph-cache-server214
-rw-r--r--morphcacheserver/artifact_database.py130
-rw-r--r--morphcacheserver/migrations/0001.init.py44
4 files changed, 355 insertions, 64 deletions
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..a96277a
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,31 @@
+------------------
+morph-cache-server
+------------------
+
+Dependencies
+------------
+
+- Bottle
+- cliapp
+- flup
+- yoyo-migrations
+
+Similar things
+--------------
+
+Debian's "Reproducible builds" project
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Tests reproducibility using Jenkins jobs:
+https://jenkins.debian.net/view/reproducible/
+
+The source code for these is here:
+https://anonscm.debian.org/cgit/qa/jenkins.debian.net.git/tree/job-cfg/reproducible.yaml
+
+And the nice reports are here:
+https://reproducible.debian.net/reproducible.html
+
+Gitian
+~~~~~~
+
+Build tool, but doesn't seem to provide a server tool.
diff --git a/morph-cache-server b/morph-cache-server
index 007cfbe..61c1640 100755
--- a/morph-cache-server
+++ b/morph-cache-server
@@ -1,31 +1,34 @@
#!/usr/bin/env python
#
# Copyright (C) 2013, 2014-2015 Codethink Limited
-#
+#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
-#
+#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
-import base64
+from bottle import Bottle, request, response, run, static_file
import cliapp
+from flup.server.fcgi import WSGIServer
+
+import base64
+import hashlib
import json
import logging
+import shutil
import os
import urllib
import urllib2
-import shutil
-from bottle import Bottle, request, response, run, static_file
-from flup.server.fcgi import WSGIServer
+from morphcacheserver.artifact_database import ArtifactDatabase
from morphcacheserver.repocache import RepoCache
@@ -33,10 +36,81 @@ defaults = {
'repo-dir': '/var/cache/morph-cache-server/gits',
'bundle-dir': '/var/cache/morph-cache-server/bundles',
'artifact-dir': '/var/cache/morph-cache-server/artifacts',
+ 'database-file': '/var/cache/morph-cache-server/database.sqlite',
'port': 8080,
}
+def checksum(filename, hasher):
+ with open(filename,'rb') as f:
+ for chunk in iter(lambda: f.read(128 * hasher.block_size), b''):
+ hasher.update(chunk)
+ return hasher.digest()
+
+
+def _fetch_artifact(self, url, filename):
+ '''Fetch a single artifact for /fetch.'''
+ in_fh = None
+ try:
+ in_fh = urllib2.urlopen(url)
+ with open(filename, "w") as localtmp:
+ shutil.copyfileobj(in_fh, localtmp)
+ in_fh.close()
+ except Exception, e:
+ if in_fh is not None:
+ in_fh.close()
+ raise
+ else:
+ if in_fh is not None:
+ in_fh.close()
+
+ # FIXME: we could hash the artifact while we download it, instead, for
+ # a bit of a speed increase.
+ hash_sha1 = checksum(filename, hashlib.sha1())
+
+ return os.stat(filename), hash_sha1
+
+def _fetch_artifacts(self, db, server, cacheid, artifacts,
+ builder_name=None, build_datetime=None):
+ '''Implements the /fetch method, to pull artifacts into the cache.
+
+ This allows any server to overwrite the files in the cache. Be aware!!
+
+ '''
+ ret = {}
+ try:
+ for artifact in artifacts:
+ artifact_name = "%s.%s" % (cacheid, artifact)
+
+ tmpname = os.path.join(self.settings['artifact-dir'],
+ ".dl.%s" % artifact_name)
+ url = "http://%s/1.0/artifacts?filename=%s" % (
+ server, urllib.quote(artifact_name))
+ stinfo, hash_sha1 = self._fetch_artifact(url, tmpname)
+
+ db.record_build(artifact_name, builder_name, build_datetime,
+ hash_sha1)
+
+ ret[artifact_name] = {
+ "size": stinfo.st_size,
+ "used": stinfo.st_blocks * 512,
+ }
+ except Exception, e:
+ for artifact in ret.iterkeys():
+ os.unlink(os.path.join(self.settings['artifact-dir'],
+ ".dl.%s" % artifact))
+ raise
+
+ for artifact in ret.iterkeys():
+ tmpname = os.path.join(self.settings['artifact-dir'],
+ ".dl.%s" % artifact)
+ artifilename = os.path.join(self.settings['artifact-dir'],
+ artifact)
+ os.rename(tmpname, artifilename)
+
+ return ret
+
+
class MorphCacheServer(cliapp.Application):
def add_settings(self):
@@ -60,6 +134,10 @@ class MorphCacheServer(cliapp.Application):
'path to the artifact cache directory',
metavar='PATH',
default=defaults['artifact-dir'])
+ self.settings.string(['database-file'],
+ 'artifact database file',
+ metavar='FILE',
+ default=defaults['database-file'])
self.settings.boolean(['direct-mode'],
'cache directories are directly managed')
self.settings.boolean(['enable-writes'],
@@ -68,61 +146,10 @@ class MorphCacheServer(cliapp.Application):
'runs a fcgi-server',
default=True)
-
- def _fetch_artifact(self, url, filename):
- in_fh = None
- try:
- in_fh = urllib2.urlopen(url)
- with open(filename, "w") as localtmp:
- shutil.copyfileobj(in_fh, localtmp)
- in_fh.close()
- except Exception, e:
- if in_fh is not None:
- in_fh.close()
- raise
- else:
- if in_fh is not None:
- in_fh.close()
- return os.stat(filename)
-
- def _fetch_artifacts(self, server, cacheid, artifacts):
- ret = {}
- try:
- for artifact in artifacts:
- artifact_name = "%s.%s" % (cacheid, artifact)
- tmpname = os.path.join(self.settings['artifact-dir'],
- ".dl.%s" % artifact_name)
- url = "http://%s/1.0/artifacts?filename=%s" % (
- server, urllib.quote(artifact_name))
- stinfo = self._fetch_artifact(url, tmpname)
- ret[artifact_name] = {
- "size": stinfo.st_size,
- "used": stinfo.st_blocks * 512,
- }
- except Exception, e:
- for artifact in ret.iterkeys():
- os.unlink(os.path.join(self.settings['artifact-dir'],
- ".dl.%s" % artifact))
- raise
-
- for artifact in ret.iterkeys():
- tmpname = os.path.join(self.settings['artifact-dir'],
- ".dl.%s" % artifact)
- artifilename = os.path.join(self.settings['artifact-dir'],
- artifact)
- os.rename(tmpname, artifilename)
-
- return ret
-
-
- def process_args(self, args):
+ def api_1_0(self, repo_cache, db):
+ '''The /1.0 set of HTTP methods.'''
app = Bottle()
- repo_cache = RepoCache(self,
- self.settings['repo-dir'],
- self.settings['bundle-dir'],
- self.settings['direct-mode'])
-
def writable(prefix):
"""Selectively enable bottle prefixes.
@@ -168,10 +195,20 @@ class MorphCacheServer(cliapp.Application):
host = self._unescape_parameter(request.query.host)
cacheid = self._unescape_parameter(request.query.cacheid)
artifacts = self._unescape_parameter(request.query.artifacts)
+
+ # Optional parameters added for bit-for-bit reproducibility
+ # checking.
+ builder_name = self._unescape_parameter(
+ request.query.get('builder_name'))
+ build_datetime = self._unescape_parameter(
+ request.query.get('build_datetime'))
+
try:
response.set_header('Cache-Control', 'no-cache')
artifacts = artifacts.split(",")
- return self._fetch_artifacts(host, cacheid, artifacts)
+ return self._fetch_artifacts(db, host, cacheid, artifacts,
+ builder_name=builder_name,
+ build_datetime=build_datetime)
except Exception, e:
response.status = 500
@@ -231,7 +268,7 @@ class MorphCacheServer(cliapp.Application):
response.set_header('Cache-Control', 'no-cache')
response.set_header('Content-Type', 'application/json')
return json.dumps(result)
-
+
@app.get('/files')
def file():
repo = self._unescape_parameter(request.query.repo)
@@ -336,9 +373,58 @@ class MorphCacheServer(cliapp.Application):
return results
- root = Bottle()
- root.mount(app, '/1.0')
+ def api_2_0(self, db):
+    '''The /2.0 set of HTTP methods.'''
+ app = Bottle()
+
+ @app.put('/builds')
+ def put_build():
+ '''Record a build.
+
+ Expected parameters:
+ - cache_name: artifact cache key plus name
+ - builder_name: URL identifying the build worker
+ - build_datetime: time artifact build was started
+ - hash_sha1: SHA1 of built artifact
+
+ '''
+ cache_name = self._unescape_parameter(request.query.cache_name)
+ builder_name = self._unescape_parameter(request.query.builder_name)
+ build_datetime = self._unescape_parameter(
+ request.query.build_datetime)
+ hash_sha1 = self._unescape_parameter(request.query.hash_sha1)
+
+ db.record_build(cache_name, builder_name, build_datetime, hash_sha1)
+
+ @app.get('/builds')
+ def get_builds():
+ '''Return info on all known builds of a given artifact.'''
+
+ cache_name = self._unescape_parameter(request.query.cache_name)
+ results = sorted(db.iter_builds_for_artifact_file(cache_name))
+
+ if len(results) == 0:
+ response.status = 404
+ else:
+ return {cache_name: results}
+
+ return app
+
+ def process_args(self, args):
+ repo_cache = RepoCache(self,
+ self.settings['repo-dir'],
+ self.settings['bundle-dir'],
+ self.settings['direct-mode'])
+
+ db = ArtifactDatabase(self.settings['database-file'])
+
+ api_1_0 = self.api_1_0(repo_cache, db)
+ api_2_0 = self.api_2_0(db)
+
+ root = Bottle()
+ root.mount(api_1_0, '/1.0')
+ root.mount(api_2_0, '/2.0')
if self.settings['fcgi-server']:
WSGIServer(root).run()
diff --git a/morphcacheserver/artifact_database.py b/morphcacheserver/artifact_database.py
new file mode 100644
index 0000000..0da46c4
--- /dev/null
+++ b/morphcacheserver/artifact_database.py
@@ -0,0 +1,130 @@
+# Copyright (C) 2015 Codethink Limited
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import yoyo
+import yoyo.connections
+
+import logging
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+class ArtifactDatabase(object):
+ '''A database to track artifact info.
+
+ This is so that we can have multiple builders submitting different builds
+ of the same artifact, then compare whether they are identical or not.
+
+    It uses the 'yoyo-migrations' Python package to set up the database schema,
+    so that hopefully the schema can be changed in future without enormous
+    hassle.
+
+ '''
+ def __init__(self, database_file):
+ self.db, self.paramstyle = yoyo.connections.connect(
+ 'sqlite:///' + database_file)
+ self.apply_migrations(self.migrations_dir())
+
+ def migrations_dir(self):
+ return os.path.join(os.path.dirname(__file__), 'migrations')
+
+ def apply_migrations(self, migrations_path):
+ migrations = yoyo.read_migrations(
+ self.db, self.paramstyle, migrations_path,
+ migration_table='migrations')
+ to_apply = migrations.to_apply()
+ log.info('Found %i migrations, applying %i', len(migrations),
+ len(to_apply))
+ to_apply.apply()
+ self.db.commit()
+
+ def intern_artifact_file(self, cache_name):
+ '''Record that a Baserock artifact file is now in the cache directory.
+
+ The 'cache_name' variable is the SHA256 hash of the 'cache key', plus
+ a string.
+
+ The 'cache key' is a dictionary of values that should describe exactly
+ how the artifact was built. There is no standard for what a 'cache
+ key' should contain at present, different Baserock build tools can put
+ whatever info they want. This is not ideal, but hopefully use of SHA256
+ makes collisions unlikely in any case.
+
+ Artifacts can be split into multiple files, because one single build
+ operation can produce a bunch of different things. For example, running
+ './configure; make; make install' in glibc.git produces not only the
+ GNU C library binaries, but also documentation, helper utilities and
+    other miscellaneous things. Thus, the string that follows the SHA256
+ hash identifies which of these things we are talking about. There is
+ no standard for these strings, at present. It is up to the build tools
+ to make sense of them.
+
+ '''
+ cursor = self.db.cursor()
+ find_artifact_sql = 'SELECT internal_id FROM artifact_files WHERE ' \
+ 'cache_name=?'
+ row = cursor.execute(find_artifact_sql, [cache_name]).fetchone()
+ if row is None:
+ log.debug('Recording new artifact file %s', cache_name)
+ cursor.execute(
+ 'INSERT INTO artifact_files(cache_name) VALUES(?)',
+ [cache_name])
+ self.db.commit()
+ internal_id = cursor.lastrowid
+ else:
+ # If the artifact file was already known, no problem.
+ internal_id = row[0]
+ return internal_id
+
+ def record_build(self, cache_name, builder_name, build_datetime,
+ hash_sha1):
+ '''Record a build of a Baserock artifact.
+
+ The artifact file is identified by the 'cache name', which is a hash of
+ some information that describes how it is built.
+
+ It is up to the build tool and the build instructions to ensure that a
+ given 'cache key' produces a set of identical artifact files each time.
+ We record every build that we receive in order to detect and highlight
+ cases where the build output for a given 'cache key' is not
+ deterministic.
+
+ '''
+ self.intern_artifact_file(cache_name)
+
+ cursor = self.db.cursor()
+ log.debug('Recording new build of %s, %s, %s', cache_name,
+ builder_name, build_datetime)
+ cursor.execute(
+ 'INSERT INTO builds(cache_name, builder_name, build_datetime, '
+ ' hash_sha1) VALUES(?, ?, ?, ?)',
+ [cache_name, builder_name, build_datetime, hash_sha1])
+ self.db.commit()
+
+ def iter_builds_for_artifact_file(self, cache_name):
+ '''Yield info on each recorded build of a given artifact.'''
+ cursor = self.db.cursor()
+ cursor.execute(
+ 'SELECT builder_name, build_datetime, hash_sha1 FROM builds WHERE '
+ ' cache_name=?', [cache_name])
+ for item in cursor:
+ builder_name, build_datetime, hash_sha1 = item
+ yield {
+ 'builder_name': builder_name,
+ 'build_datetime': build_datetime,
+ 'hash_sha1': hash_sha1
+ }
diff --git a/morphcacheserver/migrations/0001.init.py b/morphcacheserver/migrations/0001.init.py
new file mode 100644
index 0000000..cbe8916
--- /dev/null
+++ b/morphcacheserver/migrations/0001.init.py
@@ -0,0 +1,44 @@
+# Copyright (C) 2015 Codethink Limited
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+from yoyo import step, transaction
+
+
+# This is a 'yoyo' database migration. Each step has an 'apply' SQL statement
+# followed by a 'revert' SQL statement.
+
+
+transaction(
+ step(
+        # This table isn't much use at present, in future it would be better
+        # to store the actual inputs to the cache key, and not just the hash.
+ 'CREATE TABLE artifact_files ('
+ ' internal_id INTEGER UNIQUE PRIMARY KEY,'
+ ' cache_name VARCHAR UNIQUE NOT NULL'
+ ')',
+ 'DROP TABLE artifact_files'
+ ),
+ step(
+ 'CREATE TABLE builds ('
+ ' internal_id INTEGER UNIQUE PRIMARY KEY,'
+ ' cache_name VARCHAR NOT NULL,'
+ ' builder_name VARCHAR NOT NULL,'
+ ' build_datetime DATETIME NOT NULL,'
+ ' hash_sha1 VARCHAR,'
+ ' FOREIGN KEY (cache_name) REFERENCES artifact_files(cache_name)'
+ ')',
+ 'DROP TABLE builds'
+ )
+)