summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSam Thursfield <sam.thursfield@codethink.co.uk>2014-05-08 16:32:40 +0000
committerSam Thursfield <sam.thursfield@codethink.co.uk>2014-05-08 16:32:40 +0000
commitf50267b8056076623b694a1dfde00611e6eeab2a (patch)
treeb74b0cfc41c68e7b5b39be78056f3a41306eb88f
parentccd99277c0c7ab0d272e540a47380cea9f03d3b6 (diff)
downloadmorph-sam/validation.tar.gz
Add command to validate the local artifact cache (branch: sam/validation)
This tests the checksums of artifacts and also tests any unpacked chunks. The unpacked chunks are tested by turning them back into tarfiles and calculating a checksum, which we can rely on being identical to the checksum of the chunk artifact in the artifact cache if there is no corruption. That is pretty cool! This code doesn't work yet, because the zlib.adler32() code can't be used as a rolling checksum. I'm going to switch to using hashlib.md5(), which should still be fast enough. Alternatively, we could go for sha256 right away, so that when we /do/ have bit-for-bit reproducibility from source, we can say that everything is totally secure.
-rw-r--r--morphlib/bins.py61
-rw-r--r--morphlib/localartifactcache.py119
-rw-r--r--morphlib/plugins/cache_check_plugin.py9
3 files changed, 179 insertions, 10 deletions
diff --git a/morphlib/bins.py b/morphlib/bins.py
index 28542962..7dca8172 100644
--- a/morphlib/bins.py
+++ b/morphlib/bins.py
@@ -29,6 +29,7 @@ import errno
import stat
import shutil
import tarfile
+import functools
import zlib
import morphlib
@@ -117,22 +118,64 @@ def create_chunk(rootdir, f, include, dump_memory_profile=None):
return stream.checksum
def make_tarinfo_path_relative_to(root, info):
    '''Rewrite a TarInfo's paths so they are relative to `root`.

    `info.name` is always rewritten; `info.linkname` is rewritten only
    when `info.islnk()` is true. The same `info` object is modified in
    place and returned, so this can be used as a `filter` callback for
    `TarFile.add()`.
    '''

    # The names stored in a TarInfo carry no leading '/', so the root
    # must be stripped the same way before the two can be compared.
    relative_root = root.lstrip('/')

    def rebase(path):
        return os.path.relpath(path, relative_root)

    info.name = rebase(info.name)
    if info.islnk():
        info.linkname = rebase(info.linkname)
    return info
+
+
def create_chunk_2(rootdir, f, name, include):
    '''Create a chunk artifact without removing the source files.

    Writes an uncompressed tar stream to `f` containing every path in
    `include`, with member names made relative to `rootdir` and every
    mtime set to a fixed value. Returns the checksum accumulated by the
    ChecksummingOutputStream wrapped around `f`.

    `rootdir` itself is skipped if it appears in `include`. `name` is
    accepted for signature parity with create_system() but is not used.

    NOTE(review): the output is intended to be identical to that of
    create_chunk(); that has not been confirmed. Unlike create_chunk(),
    this function does not delete the files it archives.
    '''

    # This timestamp is used to normalize the mtime for every file in
    # chunk artifact. This is useful to avoid problems from smallish
    # clock skew. It needs to be recent enough, however, that GNU tar
    # does not complain about an implausibly old timestamp.
    normalized_timestamp = 683074800

    stream = ChecksummingOutputStream(f)
    with tarfile.open(fileobj=stream, mode='w|') as tar:
        # Sort so that the member order, and therefore the checksum, does
        # not depend on the order in which the caller produced the paths.
        for filepath in sorted(include):
            if filepath == rootdir:
                # The root directory itself must not become a '.' member.
                continue

            tarinfo = tar.gettarinfo(filepath)
            tarinfo = make_tarinfo_path_relative_to(rootdir, tarinfo)
            # Only mtime is normalized: TarInfo has no ctime field, so
            # assigning one would never reach the archive.
            tarinfo.mtime = normalized_timestamp

            if tarinfo.isreg():
                # addfile() only copies file contents when handed an open
                # file object; use a distinct name so the output file
                # parameter `f` is not shadowed.
                with open(filepath, 'rb') as content:
                    tar.addfile(tarinfo, fileobj=content)
            else:
                tar.addfile(tarinfo)

    return stream.checksum
+
def create_system(rootdir, f, name):
'''Create a system artifact from the contents of a directory.
'''
- unslashy_root = rootdir[1:]
- def uproot_info(info):
- '''Strip rootdir from a file's path before adding to a tarfile.'''
- info.name = os.path.relpath(info.name, unslashy_root)
- if info.islnk():
- info.linkname = os.path.relpath(info.linkname, unslashy_root)
- return info
-
stream = ChecksummingOutputStream(f)
+
+ path_filter = functools.partial(make_tarinfo_path_relative_to, rootdir)
with tarfile.open(fileobj=stream, mode="w|", name=name) as tar:
- tar.add(rootdir, recursive=True, filter=uproot_info)
+ tar.add(rootdir, recursive=True, filter=path_filter)
return stream.checksum
diff --git a/morphlib/localartifactcache.py b/morphlib/localartifactcache.py
index 10ddd638..faef27f4 100644
--- a/morphlib/localartifactcache.py
+++ b/morphlib/localartifactcache.py
@@ -15,8 +15,12 @@
import collections
+import json
+import logging
import os
+import tempfile
import time
+import zlib
import morphlib
@@ -138,3 +142,118 @@ class LocalArtifactCache(object):
for filename in (x for x in self.cachefs.walkfiles()
if x.startswith(cachekey)):
self.cachefs.remove(filename)
+
+ def _calculate_checksum(self, artifact_filename):
+ # FIXME: pick a block size
+ block_size = 10 * 1024 * 1024 # 10MB
+ checksum = 0
+ with open(artifact_filename, 'rb') as f:
+ block = f.read(block_size)
+ checksum = (checksum + zlib.adler32(block)) & 0xFFFFFFFF
+ return checksum
+
def _calculate_unpacked_chunk_checksum(self, chunk_dir):
    '''Re-pack an unpacked chunk directory and return its checksum.

    Walks `chunk_dir`, hands every directory, file and symlinked
    subdirectory to morphlib.bins.create_chunk_2(), and returns the
    checksum that call reports. The tar data itself is written to a
    temporary file that is removed again before returning.

    NOTE(review): the result is expected to equal the checksum of the
    original chunk artifact; this relies on create_chunk_2() producing
    the same byte stream as the code path used at build time
    (builder2.ChunkBuilder / bins.create_chunk), which is unconfirmed.
    '''

    def filepaths(destdir):
        for dirname, subdirs, basenames in os.walk(destdir):
            yield dirname
            # os.walk() does not descend into symlinked directories, so
            # they are reported here as entries in their own right. The
            # link test needs the full path, not the bare name.
            for subdir in subdirs:
                subdir_path = os.path.join(dirname, subdir)
                if os.path.islink(subdir_path):
                    yield subdir_path
            for basename in basenames:
                yield os.path.join(dirname, basename)

    # Only the checksum is wanted, so let the temporary tar file be
    # deleted automatically when the block exits.
    with tempfile.NamedTemporaryFile() as f:
        logging.debug(
            'Re-packing unpacked chunk %s into %s', chunk_dir, f.name)
        checksum = morphlib.bins.create_chunk_2(
            chunk_dir, f, name=None, include=filepaths(chunk_dir))

    return checksum
+
def validate(self, unpacked_chunk_cache_dir):
    '''Check for corruption in all cached binary artifacts.

    For every cache key reported by self.list_contents(), the 'meta'
    file is loaded and the recorded checksum of each binary artifact is
    compared against a freshly calculated one. If an unpacked copy of
    the artifact exists as '<cache_key>.<artifact>.d' inside
    `unpacked_chunk_cache_dir`, that copy is checksummed and compared
    as well.

    Cache keys shorter than 64 characters, keys with no binary
    artifacts, strata, and builds whose metadata has no 'checksums'
    entry are skipped without being treated as errors.

    Every problem is logged with logging.error(). Returns a dict that
    maps each affected cache key to its accumulated error text; the
    dict is empty when nothing was found.
    '''
    cache_key = None
    errors = {}

    n_artifacts = 0
    n_checksummed_artifacts = 0

    def error(msg):
        # Reads `cache_key` from the enclosing scope, so it always
        # refers to the build the loop below is currently looking at.
        errors[cache_key] = errors.get(cache_key, '') + '\n' + msg
        logging.error(
            'Error in locally cached build %s. %s' % (cache_key, msg))

    for cache_key, artifacts, last_used in self.list_contents():
        if len(cache_key) < 64:
            # Morph itself leaves junk temporary files around in the
            # artifact cache directory, as does the user. Ignore it.
            logging.info('Ignoring %s' % cache_key)
            continue

        # Sorted so the artifact used to work out `kind`, and the order
        # of the log output, do not depend on set iteration order.
        binary_artifacts = sorted(artifacts - {'build-log', 'meta'})
        if not binary_artifacts:
            logging.warning('No binary artifacts for build %s' % cache_key)
            continue

        kind = binary_artifacts[0].split('.', 1)[0]

        if kind == 'stratum':
            continue

        logging.info(
            msg='Checking artifacts for %s %s' % (kind, cache_key))

        n_artifacts += len(binary_artifacts)

        filename = self._source_metadata_filename(None, cache_key, 'meta')
        try:
            with open(filename) as f:
                build_info = json.load(f)
        except (IOError, OSError, ValueError) as e:
            error('Unable to read source metadata: %s' % e)
            continue

        if 'checksums' not in build_info:
            # This is the case for artifacts created by old versions of
            # Morph. We don't raise an error, for compatibility.
            logging.warning(
                'No checksums for build %s %s.' % (kind, cache_key))
            continue

        for artifact in binary_artifacts:
            if '.' not in artifact:
                logging.warning('Invalid artifact name %s' % artifact)
                continue

            # Checksums are keyed by the part after the '<kind>.' prefix.
            _, artifact_name = artifact.split('.', 1)
            expected_checksum = build_info['checksums'].get(artifact_name)

            if expected_checksum is None:
                error('Checksum missing for artifact %s!' % artifact_name)
                continue

            artifact_filename = self.cachefs.getsyspath(
                '%s.%s' % (cache_key, artifact))
            checksum = self._calculate_checksum(artifact_filename)

            if checksum != expected_checksum:
                error('Artifact %s has checksum 0x%x, expected 0x%x' %
                      (artifact, checksum, expected_checksum))

            n_checksummed_artifacts += 1

            # Check for an unpacked version now.
            cached_name = '%s.%s.d' % (cache_key, artifact)
            cached_path = os.path.join(unpacked_chunk_cache_dir,
                                       cached_name)
            if os.path.exists(cached_path):
                checksum = self._calculate_unpacked_chunk_checksum(
                    cached_path)

                if checksum != expected_checksum:
                    error('Unpacked chunk artifact %s has checksum 0x%x, '
                          'expected 0x%x' %
                          (artifact, checksum, expected_checksum))

    logging.info(
        'Checksummed %d of %d binary artifacts; %d builds had errors' %
        (n_checksummed_artifacts, n_artifacts, len(errors)))

    return errors
diff --git a/morphlib/plugins/cache_check_plugin.py b/morphlib/plugins/cache_check_plugin.py
index 621d9d8a..4315c265 100644
--- a/morphlib/plugins/cache_check_plugin.py
+++ b/morphlib/plugins/cache_check_plugin.py
@@ -16,6 +16,7 @@
import cliapp
import contextlib
+import os
import uuid
import morphlib
@@ -71,5 +72,11 @@ class CacheCheckPlugin(cliapp.Plugin):
artifacts which are used at build time.
'''
+ self.app.status(
+ msg='Checking all locally cached build artifacts for corruption')
+
lac, rac = morphlib.util.new_artifact_caches(self.app.settings)
- lac.validate()
+ unpacked_chunk_cache_dir = os.path.join(self.app.settings['tempdir'], 'chunks')
+ lac.validate(unpacked_chunk_cache_dir)
+
+ # FIXME: ccache is not validated! don't use ccache, perhaps!