diff options
author | Sam Thursfield <sam.thursfield@codethink.co.uk> | 2015-06-11 16:35:11 +0100 |
---|---|---|
committer | Sam Thursfield <sam.thursfield@codethink.co.uk> | 2015-06-11 16:35:37 +0100 |
commit | 26d7899ee16830b0ae5058fcd664fc69df514ff1 (patch) | |
tree | 9ad60dfbb21d3f01423c9a648ddeed9b65c1a68d | |
parent | 88d9e29c29f770e664ee785fdbf3ff0668fe1c35 (diff) | |
download | morph-cache-server-26d7899ee16830b0ae5058fcd664fc69df514ff1.tar.gz |
WIP: make morph-cache-server useful for reproducible builds testing
-rw-r--r-- | README.rst | 31 | ||||
-rwxr-xr-x | morph-cache-server | 214 | ||||
-rw-r--r-- | morphcacheserver/artifact_database.py | 130 | ||||
-rw-r--r-- | morphcacheserver/migrations/0001.init.py | 44 |
4 files changed, 355 insertions, 64 deletions
diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..a96277a --- /dev/null +++ b/README.rst @@ -0,0 +1,31 @@ +------------------ +morph-cache-server +------------------ + +Dependencies +------------ + +- Bottle +- cliapp +- flup +- yoyo-migrations + +Similar things +-------------- + +Debian's "Reproducible builds" project +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Tests reproducibility using Jenkins jobs: +https://jenkins.debian.net/view/reproducible/ + +The source code for these is here: +https://anonscm.debian.org/cgit/qa/jenkins.debian.net.git/tree/job-cfg/reproducible.yaml + +And the nice reports are here: +https://reproducible.debian.net/reproducible.html + +Gitian +~~~~~~ + +Build tool, but doesn't seem to provide a server tool. diff --git a/morph-cache-server b/morph-cache-server index 007cfbe..61c1640 100755 --- a/morph-cache-server +++ b/morph-cache-server @@ -1,31 +1,34 @@ #!/usr/bin/env python # # Copyright (C) 2013, 2014-2015 Codethink Limited -# +# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; version 2 of the License. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License along # with this program. If not, see <http://www.gnu.org/licenses/>. 
-import base64 +from bottle import Bottle, request, response, run, static_file import cliapp +from flup.server.fcgi import WSGIServer + +import base64 +import hashlib import json import logging +import shutil import os import urllib import urllib2 -import shutil -from bottle import Bottle, request, response, run, static_file -from flup.server.fcgi import WSGIServer +from morphcacheserver.artifact_database import ArtifactDatabase from morphcacheserver.repocache import RepoCache @@ -33,10 +36,81 @@ defaults = { 'repo-dir': '/var/cache/morph-cache-server/gits', 'bundle-dir': '/var/cache/morph-cache-server/bundles', 'artifact-dir': '/var/cache/morph-cache-server/artifacts', + 'database-file': '/var/cache/morph-cache-server/database.sqlite', 'port': 8080, } +def checksum(filename, hasher): + with open(filename,'rb') as f: + for chunk in iter(lambda: f.read(128 * hasher.block_size), b''): + hasher.update(chunk) + return hasher.digest() + + +def _fetch_artifact(self, url, filename): + '''Fetch a single artifact for /fetch.''' + in_fh = None + try: + in_fh = urllib2.urlopen(url) + with open(filename, "w") as localtmp: + shutil.copyfileobj(in_fh, localtmp) + in_fh.close() + except Exception, e: + if in_fh is not None: + in_fh.close() + raise + else: + if in_fh is not None: + in_fh.close() + + # FIXME: we could hash the artifact while we download it, instead, for + # a bit of a speed increase. + hash_sha1 = checksum(filename, hashlib.sha1()) + + return os.stat(filename), hash_sha1 + +def _fetch_artifacts(self, db, server, cacheid, artifacts, + builder_name=None, build_datetime=None): + '''Implements the /fetch method, to pull artifacts into the cache. + + This allows any server to overwrite the files in the cache. Be aware!! 
+ + ''' + ret = {} + try: + for artifact in artifacts: + artifact_name = "%s.%s" % (cacheid, artifact) + + tmpname = os.path.join(self.settings['artifact-dir'], + ".dl.%s" % artifact_name) + url = "http://%s/1.0/artifacts?filename=%s" % ( + server, urllib.quote(artifact_name)) + stinfo, hash_sha1 = self._fetch_artifact(url, tmpname) + + db.record_build(artifact_name, builder_name, build_datetime, + hash_sha1) + + ret[artifact_name] = { + "size": stinfo.st_size, + "used": stinfo.st_blocks * 512, + } + except Exception, e: + for artifact in ret.iterkeys(): + os.unlink(os.path.join(self.settings['artifact-dir'], + ".dl.%s" % artifact)) + raise + + for artifact in ret.iterkeys(): + tmpname = os.path.join(self.settings['artifact-dir'], + ".dl.%s" % artifact) + artifilename = os.path.join(self.settings['artifact-dir'], + artifact) + os.rename(tmpname, artifilename) + + return ret + + class MorphCacheServer(cliapp.Application): def add_settings(self): @@ -60,6 +134,10 @@ class MorphCacheServer(cliapp.Application): 'path to the artifact cache directory', metavar='PATH', default=defaults['artifact-dir']) + self.settings.string(['database-file'], + 'artifact database file', + metavar='FILE', + default=defaults['database-file']) self.settings.boolean(['direct-mode'], 'cache directories are directly managed') self.settings.boolean(['enable-writes'], @@ -68,61 +146,10 @@ class MorphCacheServer(cliapp.Application): 'runs a fcgi-server', default=True) - - def _fetch_artifact(self, url, filename): - in_fh = None - try: - in_fh = urllib2.urlopen(url) - with open(filename, "w") as localtmp: - shutil.copyfileobj(in_fh, localtmp) - in_fh.close() - except Exception, e: - if in_fh is not None: - in_fh.close() - raise - else: - if in_fh is not None: - in_fh.close() - return os.stat(filename) - - def _fetch_artifacts(self, server, cacheid, artifacts): - ret = {} - try: - for artifact in artifacts: - artifact_name = "%s.%s" % (cacheid, artifact) - tmpname = 
os.path.join(self.settings['artifact-dir'], - ".dl.%s" % artifact_name) - url = "http://%s/1.0/artifacts?filename=%s" % ( - server, urllib.quote(artifact_name)) - stinfo = self._fetch_artifact(url, tmpname) - ret[artifact_name] = { - "size": stinfo.st_size, - "used": stinfo.st_blocks * 512, - } - except Exception, e: - for artifact in ret.iterkeys(): - os.unlink(os.path.join(self.settings['artifact-dir'], - ".dl.%s" % artifact)) - raise - - for artifact in ret.iterkeys(): - tmpname = os.path.join(self.settings['artifact-dir'], - ".dl.%s" % artifact) - artifilename = os.path.join(self.settings['artifact-dir'], - artifact) - os.rename(tmpname, artifilename) - - return ret - - - def process_args(self, args): + def api_1_0(self, repo_cache, db): + '''The /1.0 set of HTTP methods.''' app = Bottle() - repo_cache = RepoCache(self, - self.settings['repo-dir'], - self.settings['bundle-dir'], - self.settings['direct-mode']) - def writable(prefix): """Selectively enable bottle prefixes. @@ -168,10 +195,20 @@ class MorphCacheServer(cliapp.Application): host = self._unescape_parameter(request.query.host) cacheid = self._unescape_parameter(request.query.cacheid) artifacts = self._unescape_parameter(request.query.artifacts) + + # Optional parameters added for bit-for-bit reproducibility + # checking. 
+ builder_name = self._unescape_parameter( + request.query.get('builder_name')) + build_datetime = self._unescape_parameter( + request.query.get('build_datetime')) + try: response.set_header('Cache-Control', 'no-cache') artifacts = artifacts.split(",") - return self._fetch_artifacts(host, cacheid, artifacts) + return self._fetch_artifacts(db, host, cacheid, artifacts, + builder_name=builder_name, + build_datetime=build_datetime) except Exception, e: response.status = 500 @@ -231,7 +268,7 @@ class MorphCacheServer(cliapp.Application): response.set_header('Cache-Control', 'no-cache') response.set_header('Content-Type', 'application/json') return json.dumps(result) - + @app.get('/files') def file(): repo = self._unescape_parameter(request.query.repo) @@ -336,9 +373,58 @@ class MorphCacheServer(cliapp.Application): return results - root = Bottle() - root.mount(app, '/1.0') + def api_2_0(self, db): + '''The 2.0/ set of HTTP methods.''' + app = Bottle() + + @app.put('/builds') + def put_build(): + '''Record a build. 
+ + Expected parameters: + - cache_name: artifact cache key plus name + - builder_name: URL identifying the build worker + - build_datetime: time artifact build was started + - hash_sha1: SHA1 of built artifact + + ''' + cache_name = self._unescape_parameter(request.query.cache_name) + builder_name = self._unescape_parameter(request.query.builder_name) + build_datetime = self._unescape_parameter( + request.query.build_datetime) + hash_sha1 = self._unescape_parameter(request.query.hash_sha1) + + db.record_build(cache_name, builder_name, build_datetime, hash_sha1) + + @app.get('/builds') + def get_builds(): + '''Return info on all known builds of a given artifact.''' + + cache_name = self._unescape_parameter(request.query.cache_name) + results = sorted(db.iter_builds_for_artifact_file(cache_name)) + + if len(results) == 0: + response.status = 404 + else: + return {cache_name: results} + + return app + + def process_args(self, args): + repo_cache = RepoCache(self, + self.settings['repo-dir'], + self.settings['bundle-dir'], + self.settings['direct-mode']) + + db = ArtifactDatabase(self.settings['database-file']) + + api_1_0 = self.api_1_0(repo_cache, db) + api_2_0 = self.api_2_0(db) + + root = Bottle() + root.mount(api_1_0, '/1.0') + root.mount(api_2_0, '/2.0') if self.settings['fcgi-server']: WSGIServer(root).run() diff --git a/morphcacheserver/artifact_database.py b/morphcacheserver/artifact_database.py new file mode 100644 index 0000000..0da46c4 --- /dev/null +++ b/morphcacheserver/artifact_database.py @@ -0,0 +1,130 @@ +# Copyright (C) 2015 Codethink Limited +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. + + +import yoyo +import yoyo.connections + +import logging +import os + + +log = logging.getLogger(__name__) + + +class ArtifactDatabase(object): + '''A database to track artifact info. + + This is so that we can have multiple builders submitting different builds + of the same artifact, then compare whether they are identical or not. + + It uses the 'yoyo-migrations' Python package to set up the database schema, + so that hopefully the schema can be changed in future without enormous + hassle. + + ''' + def __init__(self, database_file): + self.db, self.paramstyle = yoyo.connections.connect( + 'sqlite:///' + database_file) + self.apply_migrations(self.migrations_dir()) + + def migrations_dir(self): + return os.path.join(os.path.dirname(__file__), 'migrations') + + def apply_migrations(self, migrations_path): + migrations = yoyo.read_migrations( + self.db, self.paramstyle, migrations_path, + migration_table='migrations') + to_apply = migrations.to_apply() + log.info('Found %i migrations, applying %i', len(migrations), + len(to_apply)) + to_apply.apply() + self.db.commit() + + def intern_artifact_file(self, cache_name): + '''Record that a Baserock artifact file is now in the cache directory. + + The 'cache_name' variable is the SHA256 hash of the 'cache key', plus + a string. + + The 'cache key' is a dictionary of values that should describe exactly + how the artifact was built. There is no standard for what a 'cache + key' should contain at present, different Baserock build tools can put + whatever info they want. This is not ideal, but hopefully use of SHA256 + makes collisions unlikely in any case. + + Artifacts can be split into multiple files, because one single build + operation can produce a bunch of different things.
For example, running + './configure; make; make install' in glibc.git produces not only the + GNU C library binaries, but also documentation, helper utilities and + other miscellaneous things. Thus, the string that follows the SHA256 + hash identifies which of these things we are talking about. There is + no standard for these strings, at present. It is up to the build tools + to make sense of them. + + ''' + cursor = self.db.cursor() + find_artifact_sql = 'SELECT internal_id FROM artifact_files WHERE ' \ + 'cache_name=?' + row = cursor.execute(find_artifact_sql, [cache_name]).fetchone() + if row is None: + log.debug('Recording new artifact file %s', cache_name) + cursor.execute( + 'INSERT INTO artifact_files(cache_name) VALUES(?)', + [cache_name]) + self.db.commit() + internal_id = cursor.lastrowid + else: + # If the artifact file was already known, no problem. + internal_id = row[0] + return internal_id + + def record_build(self, cache_name, builder_name, build_datetime, + hash_sha1): + '''Record a build of a Baserock artifact. + + The artifact file is identified by the 'cache name', which is a hash of + some information that describes how it is built. + + It is up to the build tool and the build instructions to ensure that a + given 'cache key' produces a set of identical artifact files each time. + We record every build that we receive in order to detect and highlight + cases where the build output for a given 'cache key' is not + deterministic.
+ + ''' + self.intern_artifact_file(cache_name) + + cursor = self.db.cursor() + log.debug('Recording new build of %s, %s, %s', cache_name, + builder_name, build_datetime) + cursor.execute( + 'INSERT INTO builds(cache_name, builder_name, build_datetime, ' + ' hash_sha1) VALUES(?, ?, ?, ?)', + [cache_name, builder_name, build_datetime, hash_sha1]) + self.db.commit() + + def iter_builds_for_artifact_file(self, cache_name): + '''Yield info on each recorded build of a given artifact.''' + cursor = self.db.cursor() + cursor.execute( + 'SELECT builder_name, build_datetime, hash_sha1 FROM builds WHERE ' + ' cache_name=?', [cache_name]) + for item in cursor: + builder_name, build_datetime, hash_sha1 = item + yield { + 'builder_name': builder_name, + 'build_datetime': build_datetime, + 'hash_sha1': hash_sha1 + } diff --git a/morphcacheserver/migrations/0001.init.py b/morphcacheserver/migrations/0001.init.py new file mode 100644 index 0000000..cbe8916 --- /dev/null +++ b/morphcacheserver/migrations/0001.init.py @@ -0,0 +1,44 @@ +# Copyright (C) 2015 Codethink Limited +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. + + +from yoyo import step, transaction + + +# This is a 'yoyo' database migration. Each step has an 'apply' SQL statement +# followed by a 'revert' SQL statement. 
+ + +transaction( + step( + # This table isn't much use at present, in future it would be better + # to store the actual inputs to the cache key, and not just the hash. + 'CREATE TABLE artifact_files (' + ' internal_id INTEGER UNIQUE PRIMARY KEY,' + ' cache_name VARCHAR UNIQUE NOT NULL' + ')', + 'DROP TABLE artifact_files' + ), + step( + 'CREATE TABLE builds (' + ' internal_id INTEGER UNIQUE PRIMARY KEY,' + ' cache_name VARCHAR NOT NULL,' + ' builder_name VARCHAR NOT NULL,' + ' build_datetime DATETIME NOT NULL,' + ' hash_sha1 VARCHAR,' + ' FOREIGN KEY (cache_name) REFERENCES artifact_files(cache_name)' + ')', + 'DROP TABLE builds' + ) +) |