From 0f5f4821d801bb906d81259a58cf79e7c9aeb312 Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Tue, 25 Nov 2014 20:29:44 +0000 Subject: sourceresolver: Add comments and factor out common function --- morphlib/sourceresolver.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'morphlib/sourceresolver.py') diff --git a/morphlib/sourceresolver.py b/morphlib/sourceresolver.py index 3a328eb7..4bc55bf2 100644 --- a/morphlib/sourceresolver.py +++ b/morphlib/sourceresolver.py @@ -1,4 +1,4 @@ -# Copyright (C) 2014 Codethink Limited +# Copyright (C) 2014-2015 Codethink Limited # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -117,6 +117,10 @@ class SourceResolver(object): if definitions_original_ref: definitions_ref = definitions_original_ref + # First, process the system and its stratum morphologies. These will + # all live in the same Git repository, and will point to various chunk + # morphologies. + while definitions_queue: filename = definitions_queue.popleft() @@ -127,6 +131,7 @@ class SourceResolver(object): visit(definitions_repo, definitions_ref, filename, definitions_absref, definitions_tree, morphology) + if morphology['kind'] == 'cluster': raise cliapp.AppException( "Cannot build a morphology of type 'cluster'.") @@ -149,7 +154,11 @@ class SourceResolver(object): chunk_in_definitions_repo_queue.append( (c['repo'], c['ref'], c['morph'])) - for repo, ref, filename in chunk_in_definitions_repo_queue: + # Now process all the chunks involved in the build. First those with + # morphologies in definitions.git, and then (for compatibility reasons + # only) those with the morphology in the chunk's source repository. 
+ + def process_chunk(repo, ref, filename): if (repo, ref) not in resolved_trees: commit_sha1, tree_sha1 = self.resolve_ref(repo, ref) resolved_commits[repo, ref] = commit_sha1 @@ -162,18 +171,11 @@ class SourceResolver(object): morphology = resolved_morphologies[key] visit(repo, ref, filename, absref, tree, morphology) + for repo, ref, filename in chunk_in_definitions_repo_queue: + process_chunk_repo(repo, ref, filename) + for repo, ref, filename in chunk_in_source_repo_queue: - if (repo, ref) not in resolved_trees: - commit_sha1, tree_sha1 = self.resolve_ref(repo, ref) - resolved_commits[repo, ref] = commit_sha1 - resolved_trees[repo, commit_sha1] = tree_sha1 - absref = resolved_commits[repo, ref] - tree = resolved_trees[repo, absref] - key = (repo, absref, filename) - if key not in resolved_morphologies: - resolved_morphologies[key] = morph_factory.get_morphology(*key) - morphology = resolved_morphologies[key] - visit(repo, ref, filename, absref, tree, morphology) + process_chunk_repo(repo, ref, filename) def create_source_pool(lrc, rrc, repo, ref, filename, -- cgit v1.2.1 From bf14db9c66c80688b3ab462538c86d81d685882f Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Tue, 25 Nov 2014 21:40:06 +0000 Subject: Move MorphologyFactory into SourceResolver There's no need for this stuff to be in a separate class. This allows integrating it with the caching in the SourceResolver class. 
--- morphlib/sourceresolver.py | 78 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 9 deletions(-) (limited to 'morphlib/sourceresolver.py') diff --git a/morphlib/sourceresolver.py b/morphlib/sourceresolver.py index 4bc55bf2..d9ffd049 100644 --- a/morphlib/sourceresolver.py +++ b/morphlib/sourceresolver.py @@ -22,6 +22,23 @@ import logging import morphlib +class SourceResolverError(cliapp.AppException): + pass + + +class MorphologyNotFoundError(SourceResolverError): + def __init__(self, filename): + SourceResolverError.__init__( + self, "Couldn't find morphology: %s" % filename) + + +class NotcachedError(SourceResolverError): + def __init__(self, repo_name): + SourceResolverError.__init__( + self, "Repository %s is not cached locally and there is no " + "remote cache specified" % repo_name) + + class SourceResolver(object): '''Provides a way of resolving the set of sources for a given system. @@ -55,6 +72,8 @@ class SourceResolver(object): self.status = status_cb + self._resolved_morphologies = {} + def resolve_ref(self, reponame, ref): '''Resolves commit and tree sha1s of the ref in a repo and returns it. 
@@ -96,12 +115,57 @@ class SourceResolver(object): tree = repo.resolve_ref_to_tree(absref) return absref, tree + def _get_morphology(self, reponame, sha1, filename): + '''Read the morphology at the specified location.''' + key = (reponame, sha1, filename) + if key in self._resolved_morphologies: + return self._resolved_morphologies[key] + + morph_name = os.path.splitext(os.path.basename(filename))[0] + loader = morphlib.morphloader.MorphologyLoader() + if self._lrc.has_repo(reponame): + self.status(msg="Looking for %s in local repo cache" % filename, + chatty=True) + try: + repo = self._lrc.get_repo(reponame) + text = repo.read_file(filename, sha1) + morph = loader.load_from_string(text) + except IOError: + morph = None + file_list = repo.list_files(ref=sha1, recurse=False) + elif self._rrc is not None: + self.status(msg="Retrieving %(reponame)s %(sha1)s %(filename)s" + " from the remote git cache.", + reponame=reponame, sha1=sha1, filename=filename, + chatty=True) + try: + text = self._rrc.cat_file(reponame, sha1, filename) + morph = loader.load_from_string(text) + except morphlib.remoterepocache.CatFileError: + morph = None + file_list = self._rrc.ls_tree(reponame, sha1) + else: + raise NotcachedError(reponame) + + if morph is None: + self.status(msg="File %s doesn't exist: attempting to infer " + "chunk morph from repo's build system" + % filename, chatty=True) + bs = morphlib.buildsystem.detect_build_system(file_list) + if bs is None: + raise MorphologyNotFoundError(filename) + morph = bs.get_morphology(morph_name) + loader.validate(morph) + loader.set_commands(morph) + loader.set_defaults(morph) + + self._resolved_morphologies[morph] = morph + return morph + def traverse_morphs(self, definitions_repo, definitions_ref, system_filenames, visit=lambda rn, rf, fn, arf, m: None, definitions_original_ref=None): - morph_factory = morphlib.morphologyfactory.MorphologyFactory( - self.lrc, self.rrc, self.status) definitions_queue = collections.deque(system_filenames) 
chunk_in_definitions_repo_queue = [] chunk_in_source_repo_queue = [] @@ -124,10 +188,8 @@ class SourceResolver(object): while definitions_queue: filename = definitions_queue.popleft() - key = (definitions_repo, definitions_absref, filename) - if not key in resolved_morphologies: - resolved_morphologies[key] = morph_factory.get_morphology(*key) - morphology = resolved_morphologies[key] + morphology = self._get_morphology( + definitions_repo, definitions_absref, filename) visit(definitions_repo, definitions_ref, filename, definitions_absref, definitions_tree, morphology) @@ -166,9 +228,7 @@ class SourceResolver(object): absref = resolved_commits[repo, ref] tree = resolved_trees[repo, absref] key = (definitions_repo, definitions_absref, filename) - if not key in resolved_morphologies: - resolved_morphologies[key] = morph_factory.get_morphology(*key) - morphology = resolved_morphologies[key] + morphology = self._get_morphology(*key) visit(repo, ref, filename, absref, tree, morphology) for repo, ref, filename in chunk_in_definitions_repo_queue: -- cgit v1.2.1 From eac9226720eee7cdd223d789550e35b103870cc0 Mon Sep 17 00:00:00 2001 From: Adam Coldrick Date: Thu, 22 Jan 2015 10:58:53 +0000 Subject: Add a cache manager utilising pylru and pickle --- morphlib/sourceresolver.py | 54 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) (limited to 'morphlib/sourceresolver.py') diff --git a/morphlib/sourceresolver.py b/morphlib/sourceresolver.py index d9ffd049..e47fad8d 100644 --- a/morphlib/sourceresolver.py +++ b/morphlib/sourceresolver.py @@ -14,14 +14,64 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -import cliapp - import collections +import cPickle import logging +import pylru + +import cliapp import morphlib +class PickleCacheManager(object): + '''Cache manager for PyLRU that reads and writes to Pickle files. + + The 'pickle' format is less than ideal in many ways and is actually + slower than JSON in Python. 
However, the data we need to cache is keyed + by tuples and in JSON a dict can only be keyed with strings. For now, + using 'pickle' seems to be the least worst option. + + ''' + + def __init__(self, filename, size): + self.filename = filename + self.size = size + + def _populate_cache_from_file(self, filename, cache): + try: + with open(filename, 'r') as f: + data = cPickle.load(f) + for key, value in data.iteritems(): + cache[key] = value + except (EOFError, IOError, cPickle.PickleError) as e: + logging.warning('Failed to load cache %s: %s', self.filename, e) + + def load_cache(self): + '''Create a pylru.lrucache object prepopulated with saved data.''' + cache = pylru.lrucache(self.size) + # There should be a more efficient way to do this, by hooking into + # the json module directly. + self._populate_cache_from_file(self.filename, cache) + return cache + + def save_cache(self, cache): + '''Save the data from a pylru.lrucache object to disk. + + Any changes that have been made by other instances or processes since + load_cache() was called will be overwritten. + + ''' + data = {} + for key, value in cache.items(): + data[key] = value + try: + with morphlib.savefile.SaveFile(self.filename, 'w') as f: + cPickle.dump(data, f) + except (IOError, cPickle.PickleError) as e: + logging.warning('Failed to save cache to %s: %s', self.filename, e) + + class SourceResolverError(cliapp.AppException): pass -- cgit v1.2.1 From cc91b8cec9f75ec5e3220878d496b612d97b4681 Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Thu, 6 Nov 2014 15:55:19 +0000 Subject: Add an LRU cache for resolved tree refs This uses the PyLRU module (the `pylru` package on PyPI). Python 3.2 and newer provide a built-in LRU cache (`functools.lru_cache`), but this is specifically for in-memory use. Git commits are immutable, so caching information about their contents is fairly easy and trouble-free. There's no danger of the cache becoming stale. 
--- morphlib/sourceresolver.py | 101 ++++++++++++++++++++++++++++++++------------- 1 file changed, 72 insertions(+), 29 deletions(-) (limited to 'morphlib/sourceresolver.py') diff --git a/morphlib/sourceresolver.py b/morphlib/sourceresolver.py index e47fad8d..c82a7702 100644 --- a/morphlib/sourceresolver.py +++ b/morphlib/sourceresolver.py @@ -17,12 +17,16 @@ import collections import cPickle import logging +import os import pylru import cliapp import morphlib +tree_cache_size = 10000 +tree_cache_filename = 'trees.cache.pickle' + class PickleCacheManager(object): '''Cache manager for PyLRU that reads and writes to Pickle files. @@ -92,9 +96,10 @@ class NotcachedError(SourceResolverError): class SourceResolver(object): '''Provides a way of resolving the set of sources for a given system. - There are two levels of caching involved in resolving the sources to build. + There are three levels of caching involved in resolving the sources to + build. - The canonical source for each source is specified in the build-command + The canonical repo for each source is specified in the build-command (for strata and systems) or in the stratum morphology (for chunks). It will be either a normal URL, or a keyed URL using a repo-alias like 'baserock:baserock/definitions'. @@ -111,27 +116,55 @@ class SourceResolver(object): entire repositories in $cachedir/gits. If a repo is not in the remote repo cache then it must be present in the local repo cache. + The third layer of caching is a simple commit SHA1 -> tree SHA mapping. It + turns out that even if all repos are available locally, running + 'git rev-parse' on hundreds of repos requires a lot of IO and can take + several minutes. Likewise, on a slow network connection it is time + consuming to keep querying the remote repo cache. This third layer of + caching works around both of those issues. 
+ + The need for 3 levels of caching highlights design inconsistencies in + Baserock, but for now it is worth the effort to maintain this code to save + users from waiting 7 minutes each time that they want to build. The level 3 + cache is fairly simple because commits are immutable, so there is no danger + of this cache being stale as long as it is indexed by commit SHA1. Due to + the policy in Baserock of always using a commit SHA1 (rather than a named + ref) in the system definitions, it makes repeated builds of a system very + fast as no resolution needs to be done at all. + ''' - def __init__(self, local_repo_cache, remote_repo_cache, update_repos, - status_cb=None): + def __init__(self, local_repo_cache, remote_repo_cache, + tree_cache_manager, update_repos, status_cb=None): self.lrc = local_repo_cache self.rrc = remote_repo_cache + self.tree_cache_manager = tree_cache_manager self.update = update_repos - self.status = status_cb + self._resolved_trees = {} self._resolved_morphologies = {} - def resolve_ref(self, reponame, ref): + def _resolve_ref(self, reponame, ref): '''Resolves commit and tree sha1s of the ref in a repo and returns it. - If update is True then this has the side-effect of updating - or cloning the repository into the local repo cache. + If update is True then this has the side-effect of updating or cloning + the repository into the local repo cache. + + This function is complex due to the 3 layers of caching described in + the SourceResolver docstring. + ''' - absref = None + # The Baserock reference definitions use absolute refs so, and, if the + # absref is cached, we can short-circuit all this code. 
+ if (reponame, ref) in self._resolved_trees: + logging.debug('Returning tree (%s, %s) from tree cache', + reponame, ref) + return ref, self._resolved_trees[(reponame, ref)] + + absref = None if self.lrc.has_repo(reponame): repo = self.lrc.get_repo(reponame) if self.update and repo.requires_update_for_ref(ref): @@ -160,9 +193,16 @@ class SourceResolver(object): repo = self.lrc.cache_repo(reponame) repo.update() else: + # This is likely to raise an exception, because if the local + # repo cache had the repo we'd have already resolved the ref. repo = self.lrc.get_repo(reponame) absref = repo.resolve_ref_to_commit(ref) tree = repo.resolve_ref_to_tree(absref) + + logging.debug('Writing tree to cache with ref (%s, %s)', + reponame, absref) + self._resolved_trees[(reponame, absref)] = tree + return absref, tree def _get_morphology(self, reponame, sha1, filename): @@ -173,23 +213,23 @@ class SourceResolver(object): morph_name = os.path.splitext(os.path.basename(filename))[0] loader = morphlib.morphloader.MorphologyLoader() - if self._lrc.has_repo(reponame): - self.status(msg="Looking for %s in local repo cache" % filename, - chatty=True) + if self.lrc.has_repo(reponame): + self.status(msg="Looking for %(reponame)s:%(filename)s in the " + "local repo cache.", + reponame=reponame, filename=filename, chatty=True) try: - repo = self._lrc.get_repo(reponame) + repo = self.lrc.get_repo(reponame) text = repo.read_file(filename, sha1) morph = loader.load_from_string(text) except IOError: morph = None file_list = repo.list_files(ref=sha1, recurse=False) - elif self._rrc is not None: - self.status(msg="Retrieving %(reponame)s %(sha1)s %(filename)s" - " from the remote git cache.", - reponame=reponame, sha1=sha1, filename=filename, - chatty=True) + elif self.rrc is not None: + self.status(msg="Looking for %(reponame)s:%(filename)s in the " + "remote repo cache.", + reponame=reponame, filename=filename, chatty=True) try: - text = self._rrc.cat_file(reponame, sha1, filename) + text = 
self.rrc.cat_file(reponame, sha1, filename) morph = loader.load_from_string(text) except morphlib.remoterepocache.CatFileError: morph = None @@ -221,11 +261,13 @@ class SourceResolver(object): chunk_in_source_repo_queue = [] resolved_commits = {} - resolved_trees = {} + + self._resolved_trees = self.tree_cache_manager.load_cache() + resolved_morphologies = {} # Resolve the (repo, ref) pair for the definitions repo, cache result. - definitions_absref, definitions_tree = self.resolve_ref( + definitions_absref, definitions_tree = self._resolve_ref( definitions_repo, definitions_ref) if definitions_original_ref: @@ -271,12 +313,7 @@ class SourceResolver(object): # only) those with the morphology in the chunk's source repository. def process_chunk(repo, ref, filename): - if (repo, ref) not in resolved_trees: - commit_sha1, tree_sha1 = self.resolve_ref(repo, ref) - resolved_commits[repo, ref] = commit_sha1 - resolved_trees[repo, commit_sha1] = tree_sha1 - absref = resolved_commits[repo, ref] - tree = resolved_trees[repo, absref] + absref, tree = self._resolve_ref(repo, ref) key = (definitions_repo, definitions_absref, filename) morphology = self._get_morphology(*key) visit(repo, ref, filename, absref, tree, morphology) @@ -287,8 +324,11 @@ class SourceResolver(object): for repo, ref, filename in chunk_in_source_repo_queue: process_chunk_repo(repo, ref, filename) + logging.debug('Saving contents of resolved tree cache') + self.tree_cache_manager.save_cache(self._resolved_trees) -def create_source_pool(lrc, rrc, repo, ref, filename, + +def create_source_pool(lrc, rrc, repo, ref, filename, cachedir, original_ref=None, update_repos=True, status_cb=None): '''Find all the sources involved in building a given system. 
@@ -314,7 +354,10 @@ def create_source_pool(lrc, rrc, repo, ref, filename, for source in sources: pool.add(source) - resolver = SourceResolver(lrc, rrc, update_repos, status_cb) + tree_cache_manager = PickleCacheManager( + os.path.join(cachedir, tree_cache_filename), tree_cache_size) + + resolver = SourceResolver(lrc, rrc, tree_cache_manager, update_repos, status_cb) resolver.traverse_morphs(repo, ref, [filename], visit=add_to_pool, definitions_original_ref=original_ref) -- cgit v1.2.1 From a1295cc039f846164eaaef40586f0ae97a49ed06 Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Tue, 25 Nov 2014 22:37:29 +0000 Subject: Add an LRU cache for detected build-systems This will speed up builds of chunks which don't have a chunk morph. It won't have much (if any) effect on the speed of the first build, but subsequent builds will be much faster as we won't have to query the git cache. --- morphlib/sourceresolver.py | 120 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 101 insertions(+), 19 deletions(-) (limited to 'morphlib/sourceresolver.py') diff --git a/morphlib/sourceresolver.py b/morphlib/sourceresolver.py index c82a7702..989bd9f9 100644 --- a/morphlib/sourceresolver.py +++ b/morphlib/sourceresolver.py @@ -26,6 +26,8 @@ import morphlib tree_cache_size = 10000 tree_cache_filename = 'trees.cache.pickle' +buildsystem_cache_size = 10000 +buildsystem_cache_filename = 'detected-chunk-buildsystems.cache.pickle' class PickleCacheManager(object): @@ -135,16 +137,19 @@ class SourceResolver(object): ''' def __init__(self, local_repo_cache, remote_repo_cache, - tree_cache_manager, update_repos, status_cb=None): + tree_cache_manager, buildsystem_cache_manager, update_repos, + status_cb=None): self.lrc = local_repo_cache self.rrc = remote_repo_cache self.tree_cache_manager = tree_cache_manager + self.buildsystem_cache_manager = buildsystem_cache_manager self.update = update_repos self.status = status_cb self._resolved_trees = {} self._resolved_morphologies = {} + 
self._resolved_buildsystems = {} def _resolve_ref(self, reponame, ref): '''Resolves commit and tree sha1s of the ref in a repo and returns it. @@ -206,12 +211,15 @@ class SourceResolver(object): return absref, tree def _get_morphology(self, reponame, sha1, filename): - '''Read the morphology at the specified location.''' + '''Read the morphology at the specified location. + + Returns None if the file does not exist in the specified commit. + + ''' key = (reponame, sha1, filename) if key in self._resolved_morphologies: return self._resolved_morphologies[key] - morph_name = os.path.splitext(os.path.basename(filename))[0] loader = morphlib.morphloader.MorphologyLoader() if self.lrc.has_repo(reponame): self.status(msg="Looking for %(reponame)s:%(filename)s in the " @@ -233,23 +241,62 @@ class SourceResolver(object): morph = loader.load_from_string(text) except morphlib.remoterepocache.CatFileError: morph = None - file_list = self._rrc.ls_tree(reponame, sha1) else: + # We assume that _resolve_ref() must have already been called and + # so the repo in question would have been made available already + # if it had been possible. raise NotcachedError(reponame) if morph is None: - self.status(msg="File %s doesn't exist: attempting to infer " - "chunk morph from repo's build system" - % filename, chatty=True) - bs = morphlib.buildsystem.detect_build_system(file_list) - if bs is None: - raise MorphologyNotFoundError(filename) - morph = bs.get_morphology(morph_name) + return None + else: loader.validate(morph) loader.set_commands(morph) loader.set_defaults(morph) + self._resolved_morphologies[key] = morph + return morph + + def _detect_build_system(self, reponame, sha1, expected_filename): + '''Attempt to detect buildsystem of the given commit. + + Returns None if no known build system was detected. 
+ + ''' + self.status(msg="File %s doesn't exist: attempting to infer " + "chunk morph from repo's build system" % + expected_filename, chatty=True) + + if self.lrc.has_repo(reponame): + repo = self.lrc.get_repo(reponame) + file_list = repo.list_files(ref=sha1, recurse=False) + elif self.rrc is not None: + file_list = self.rrc.ls_tree(reponame, sha1) + else: + # We assume that _resolve_ref() must have already been called and + # so the repo in question would have been made available already + # if it had been possible. + raise NotcachedError(reponame) - self._resolved_morphologies[morph] = morph + buildsystem = morphlib.buildsystem.detect_build_system(file_list) + + if buildsystem is None: + # It might surprise you to discover that if we can't autodetect a + # build system, we raise MorphologyNotFoundError. Users are + # required to provide a morphology for any chunk where Morph can't + # infer the build instructions automatically, so this is the right + # error. + raise MorphologyNotFoundError(expected_filename) + + return buildsystem.name + + def _create_morphology_for_build_system(self, buildsystem_name, + morph_name): + bs = morphlib.buildsystem.lookup_build_system(buildsystem_name) + loader = morphlib.morphloader.MorphologyLoader() + morph = bs.get_morphology(morph_name) + loader.validate(morph) + loader.set_commands(morph) + loader.set_defaults(morph) return morph def traverse_morphs(self, definitions_repo, definitions_ref, @@ -260,9 +307,9 @@ class SourceResolver(object): chunk_in_definitions_repo_queue = [] chunk_in_source_repo_queue = [] - resolved_commits = {} - self._resolved_trees = self.tree_cache_manager.load_cache() + self._resolved_buildsystems = \ + self.buildsystem_cache_manager.load_cache() resolved_morphologies = {} @@ -314,19 +361,48 @@ class SourceResolver(object): def process_chunk(repo, ref, filename): absref, tree = self._resolve_ref(repo, ref) - key = (definitions_repo, definitions_absref, filename) - morphology = 
self._get_morphology(*key) + + key = (repo, ref, filename) + morph_name = os.path.splitext(os.path.basename(filename))[0] + + morphology = None + buildsystem = None + + if key in self._resolved_buildsystems: + buildsystem = self._resolved_buildsystems[key] + + if buildsystem is None: + # The morphologies aren't locally cached, so a morphology + # for a chunk kept in the chunk repo will be read every time. + # So, always keep your chunk morphs in your definitions repo. + morphology = self._get_morphology(*key) + + if morphology is None: + if buildsystem is None: + buildsystem = self._detect_build_system(*key) + if buildsystem is None: + raise MorphologyNotFoundError(filename) + else: + morphology = self._create_morphology_for_build_system( + buildsystem, morph_name) + self._resolved_morphologies[key] = morphology + visit(repo, ref, filename, absref, tree, morphology) for repo, ref, filename in chunk_in_definitions_repo_queue: - process_chunk_repo(repo, ref, filename) + process_chunk(repo, ref, filename) for repo, ref, filename in chunk_in_source_repo_queue: - process_chunk_repo(repo, ref, filename) + process_chunk(repo, ref, filename) logging.debug('Saving contents of resolved tree cache') self.tree_cache_manager.save_cache(self._resolved_trees) + logging.debug('Saving contents of build systems cache') + self.buildsystem_cache_manager.save_cache( + self._resolved_buildsystems) + + def create_source_pool(lrc, rrc, repo, ref, filename, cachedir, original_ref=None, update_repos=True, @@ -357,7 +433,13 @@ def create_source_pool(lrc, rrc, repo, ref, filename, cachedir, tree_cache_manager = PickleCacheManager( os.path.join(cachedir, tree_cache_filename), tree_cache_size) - resolver = SourceResolver(lrc, rrc, tree_cache_manager, update_repos, status_cb) + buildsystem_cache_manager = PickleCacheManager( + os.path.join(cachedir, buildsystem_cache_filename), + buildsystem_cache_size) + + resolver = SourceResolver(lrc, rrc, tree_cache_manager, + buildsystem_cache_manager, 
update_repos, + status_cb) resolver.traverse_morphs(repo, ref, [filename], visit=add_to_pool, definitions_original_ref=original_ref) -- cgit v1.2.1 From a9dd06285bcd3ed40cb333d8fc7f65e6f273ad88 Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Tue, 25 Nov 2014 22:43:50 +0000 Subject: Split up traverse_morphs to improve readability --- morphlib/sourceresolver.py | 140 +++++++++++++++++++++++++-------------------- 1 file changed, 79 insertions(+), 61 deletions(-) (limited to 'morphlib/sourceresolver.py') diff --git a/morphlib/sourceresolver.py b/morphlib/sourceresolver.py index 989bd9f9..0e520b67 100644 --- a/morphlib/sourceresolver.py +++ b/morphlib/sourceresolver.py @@ -299,30 +299,15 @@ class SourceResolver(object): loader.set_defaults(morph) return morph - def traverse_morphs(self, definitions_repo, definitions_ref, - system_filenames, - visit=lambda rn, rf, fn, arf, m: None, - definitions_original_ref=None): + def _process_definitions_with_children(self, system_filenames, + definitions_repo, + definitions_ref, + definitions_absref, + definitions_tree, + visit): definitions_queue = collections.deque(system_filenames) - chunk_in_definitions_repo_queue = [] - chunk_in_source_repo_queue = [] - - self._resolved_trees = self.tree_cache_manager.load_cache() - self._resolved_buildsystems = \ - self.buildsystem_cache_manager.load_cache() - - resolved_morphologies = {} - - # Resolve the (repo, ref) pair for the definitions repo, cache result. - definitions_absref, definitions_tree = self._resolve_ref( - definitions_repo, definitions_ref) - - if definitions_original_ref: - definitions_ref = definitions_original_ref - - # First, process the system and its stratum morphologies. These will - # all live in the same Git repository, and will point to various chunk - # morphologies. 
+ chunk_in_definitions_repo_queue = set() + chunk_in_source_repo_queue = set() while definitions_queue: filename = definitions_queue.popleft() @@ -330,6 +315,9 @@ class SourceResolver(object): morphology = self._get_morphology( definitions_repo, definitions_absref, filename) + if morphology is None: + raise MorphologyNotFoundError(filename) + visit(definitions_repo, definitions_ref, filename, definitions_absref, definitions_tree, morphology) @@ -349,59 +337,89 @@ class SourceResolver(object): if 'morph' not in c: path = morphlib.util.sanitise_morphology_path( c.get('morph', c['name'])) - chunk_in_source_repo_queue.append( + chunk_in_source_repo_queue.add( (c['repo'], c['ref'], path)) continue - chunk_in_definitions_repo_queue.append( + chunk_in_definitions_repo_queue.add( (c['repo'], c['ref'], c['morph'])) - # Now process all the chunks involved in the build. First those with - # morphologies in definitions.git, and then (for compatibility reasons - # only) those with the morphology in the chunk's source repository. + return chunk_in_definitions_repo_queue, chunk_in_source_repo_queue - def process_chunk(repo, ref, filename): - absref, tree = self._resolve_ref(repo, ref) + def process_chunk(self, definition_repo, definition_ref, chunk_repo, + chunk_ref, filename, visit): + definition_key = (definition_repo, definition_ref, filename) + chunk_key = (chunk_repo, chunk_ref, filename) - key = (repo, ref, filename) - morph_name = os.path.splitext(os.path.basename(filename))[0] + morph_name = os.path.splitext(os.path.basename(filename))[0] - morphology = None - buildsystem = None + morphology = None + buildsystem = None - if key in self._resolved_buildsystems: - buildsystem = self._resolved_buildsystems[key] + if chunk_key in self._resolved_buildsystems: + buildsystem = self._resolved_buildsystems[chunk_key] - if buildsystem is None: - # The morphologies aren't locally cached, so a morphology - # for a chunk kept in the chunk repo will be read every time. 
- # So, always keep your chunk morphs in your definitions repo. - morphology = self._get_morphology(*key) - - if morphology is None: - if buildsystem is None: - buildsystem = self._detect_build_system(*key) - if buildsystem is None: - raise MorphologyNotFoundError(filename) - else: - morphology = self._create_morphology_for_build_system( - buildsystem, morph_name) - self._resolved_morphologies[key] = morphology + if buildsystem is None: + # The morphologies aren't locally cached, so a morphology + # for a chunk kept in the chunk repo will be read every time. + # So, always keep your chunk morphs in your definitions repo, + # not in the chunk repo! + morphology = self._get_morphology(*definition_key) - visit(repo, ref, filename, absref, tree, morphology) + if morphology is None: + if buildsystem is None: + buildsystem = self._detect_build_system(*chunk_key) + if buildsystem is None: + raise MorphologyNotFoundError(filename) + else: + self._resolved_buildsystems[chunk_key] = buildsystem + morphology = self._create_morphology_for_build_system( + buildsystem, morph_name) + self._resolved_morphologies[definition_key] = morphology - for repo, ref, filename in chunk_in_definitions_repo_queue: - process_chunk(repo, ref, filename) + absref, tree = self._resolve_ref(chunk_repo, chunk_ref) + visit(chunk_repo, chunk_ref, filename, absref, tree, morphology) - for repo, ref, filename in chunk_in_source_repo_queue: - process_chunk(repo, ref, filename) + def traverse_morphs(self, definitions_repo, definitions_ref, + system_filenames, + visit=lambda rn, rf, fn, arf, m: None, + definitions_original_ref=None): + self._resolved_trees = self.tree_cache_manager.load_cache() + self._resolved_buildsystems = \ + self.buildsystem_cache_manager.load_cache() - logging.debug('Saving contents of resolved tree cache') - self.tree_cache_manager.save_cache(self._resolved_trees) + # Resolve the (repo, ref) pair for the definitions repo, cache result. 
+ definitions_absref, definitions_tree = self._resolve_ref( + definitions_repo, definitions_ref) - logging.debug('Saving contents of build systems cache') - self.buildsystem_cache_manager.save_cache( - self._resolved_buildsystems) + if definitions_original_ref: + definitions_ref = definitions_original_ref + try: + # First, process the system and its stratum morphologies. These + # will all live in the same Git repository, and will point to + # various chunk morphologies. + chunk_in_definitions_repo_queue, chunk_in_source_repo_queue = \ + self._process_definitions_with_children( + system_filenames, definitions_repo, definitions_ref, + definitions_absref, definitions_tree, visit) + + # Now process all the chunks involved in the build. First those + # with morphologies in definitions.git, and then (for compatibility + # reasons only) those with the morphology in the chunk's source + # repository. + for repo, ref, filename in chunk_in_definitions_repo_queue: + self.process_chunk(definitions_repo, definitions_absref, repo, + ref, filename, visit) + + for repo, ref, filename in chunk_in_source_repo_queue: + self.process_chunk(repo, ref, repo, ref, filename, visit) + finally: + logging.debug('Saving contents of resolved tree cache') + self.tree_cache_manager.save_cache(self._resolved_trees) + + logging.debug('Saving contents of build systems cache') + self.buildsystem_cache_manager.save_cache( + self._resolved_buildsystems) def create_source_pool(lrc, rrc, repo, ref, filename, cachedir, -- cgit v1.2.1 From 2ff375e68935f17a63eebedb55b15f8e3aec3b40 Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Tue, 6 Jan 2015 18:13:28 +0000 Subject: Read files from a local clone of definitions where possible Most morphologies involved in a build are in the definitions repo these days. Currently we read each of them using `git cat-file`, which is slow. It's quicker to check out all the files in one go to a temporary directory and then read them from there. 
With the current workflow users often have definitions.git checked out on disk. It seems strange to not just read the files from there. There are two reasons why I don't want to do that yet: - there are commands which don't run inside a system branch, which would be broken if we expected to always be in a system branch - there may be local changes in the checked-out repo, and it takes around 5 seconds on each build to check if there aren't any local changes. It actually seems faster to just check out a known clean version from the cache. --- morphlib/sourceresolver.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'morphlib/sourceresolver.py') diff --git a/morphlib/sourceresolver.py b/morphlib/sourceresolver.py index 0e520b67..b8af8aee 100644 --- a/morphlib/sourceresolver.py +++ b/morphlib/sourceresolver.py @@ -19,6 +19,8 @@ import cPickle import logging import os import pylru +import shutil +import tempfile import cliapp @@ -151,6 +153,8 @@ class SourceResolver(object): self._resolved_morphologies = {} self._resolved_buildsystems = {} + self._definitions_checkout_dir = None + def _resolve_ref(self, reponame, ref): '''Resolves commit and tree sha1s of the ref in a repo and returns it. 
@@ -220,8 +224,18 @@ class SourceResolver(object): if key in self._resolved_morphologies: return self._resolved_morphologies[key] + if reponame == self._definitions_repo and \ + sha1 == self._definitions_absref: + defs_filename = os.path.join(self._definitions_checkout_dir, + filename) + else: + defs_filename = None + + loader = morphlib.morphloader.MorphologyLoader() - if self.lrc.has_repo(reponame): + if defs_filename and os.path.exists(defs_filename): + morph = loader.load_from_file(defs_filename) + elif self.lrc.has_repo(reponame): self.status(msg="Looking for %(reponame)s:%(filename)s in the " "local repo cache.", reponame=reponame, filename=filename, chatty=True) @@ -394,7 +408,19 @@ class SourceResolver(object): if definitions_original_ref: definitions_ref = definitions_original_ref + self._definitions_checkout_dir = tempfile.mkdtemp() + try: + # FIXME: not an ideal way of passing this info across + self._definitions_repo = definitions_repo + self._definitions_absref = definitions_absref + try: + definitions_cached_repo = self.lrc.get_repo(definitions_repo) + except morphlib.localrepocache.NotCached: + definitions_cached_repo = self.lrc.cache_repo(definitions_repo) + definitions_cached_repo.extract_commit( + definitions_absref, self._definitions_checkout_dir) + # First, process the system and its stratum morphologies. These # will all live in the same Git repository, and will point to # various chunk morphologies. 
@@ -414,6 +440,9 @@ class SourceResolver(object): for repo, ref, filename in chunk_in_source_repo_queue: self.process_chunk(repo, ref, repo, ref, filename, visit) finally: + shutil.rmtree(self._definitions_checkout_dir) + self._definitions_checkout_dir = None + logging.debug('Saving contents of resolved tree cache') self.tree_cache_manager.save_cache(self._resolved_trees) -- cgit v1.2.1 From ba574f4d1ae390d4f94dc62817f68fe390b38778 Mon Sep 17 00:00:00 2001 From: Adam Coldrick Date: Wed, 21 Jan 2015 16:00:32 +0000 Subject: Add tests for sourceresolver This only adds tests for the bits which were moved from morphologyfactory into sourceresolver, namely detection of build systems and the '_get_morphology()' function. These are just the morphologyfactory tests reworked slightly to work properly with the modified API. --- morphlib/sourceresolver.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'morphlib/sourceresolver.py') diff --git a/morphlib/sourceresolver.py b/morphlib/sourceresolver.py index b8af8aee..29069d7d 100644 --- a/morphlib/sourceresolver.py +++ b/morphlib/sourceresolver.py @@ -32,7 +32,7 @@ buildsystem_cache_size = 10000 buildsystem_cache_filename = 'detected-chunk-buildsystems.cache.pickle' -class PickleCacheManager(object): +class PickleCacheManager(object): # pragma: no cover '''Cache manager for PyLRU that reads and writes to Pickle files. 
The 'pickle' format is less than ideal in many ways and is actually @@ -84,7 +84,7 @@ class SourceResolverError(cliapp.AppException): pass -class MorphologyNotFoundError(SourceResolverError): +class MorphologyNotFoundError(SourceResolverError): # pragma: no cover def __init__(self, filename): SourceResolverError.__init__( self, "Couldn't find morphology: %s" % filename) @@ -155,7 +155,7 @@ class SourceResolver(object): self._definitions_checkout_dir = None - def _resolve_ref(self, reponame, ref): + def _resolve_ref(self, reponame, ref): # pragma: no cover '''Resolves commit and tree sha1s of the ref in a repo and returns it. If update is True then this has the side-effect of updating or cloning @@ -225,7 +225,7 @@ class SourceResolver(object): return self._resolved_morphologies[key] if reponame == self._definitions_repo and \ - sha1 == self._definitions_absref: + sha1 == self._definitions_absref: # pragma: no cover defs_filename = os.path.join(self._definitions_checkout_dir, filename) else: @@ -233,7 +233,7 @@ class SourceResolver(object): loader = morphlib.morphloader.MorphologyLoader() - if defs_filename and os.path.exists(defs_filename): + if defs_filename and os.path.exists(defs_filename): # pragma: no cover morph = loader.load_from_file(defs_filename) elif self.lrc.has_repo(reponame): self.status(msg="Looking for %(reponame)s:%(filename)s in the " @@ -304,7 +304,7 @@ class SourceResolver(object): return buildsystem.name def _create_morphology_for_build_system(self, buildsystem_name, - morph_name): + morph_name): # pragma: no cover bs = morphlib.buildsystem.lookup_build_system(buildsystem_name) loader = morphlib.morphloader.MorphologyLoader() morph = bs.get_morphology(morph_name) @@ -318,7 +318,7 @@ class SourceResolver(object): definitions_ref, definitions_absref, definitions_tree, - visit): + visit): # pragma: no cover definitions_queue = collections.deque(system_filenames) chunk_in_definitions_repo_queue = set() chunk_in_source_repo_queue = set() @@ -360,7 
+360,7 @@ class SourceResolver(object): return chunk_in_definitions_repo_queue, chunk_in_source_repo_queue def process_chunk(self, definition_repo, definition_ref, chunk_repo, - chunk_ref, filename, visit): + chunk_ref, filename, visit): # pragma: no cover definition_key = (definition_repo, definition_ref, filename) chunk_key = (chunk_repo, chunk_ref, filename) @@ -396,7 +396,7 @@ class SourceResolver(object): def traverse_morphs(self, definitions_repo, definitions_ref, system_filenames, visit=lambda rn, rf, fn, arf, m: None, - definitions_original_ref=None): + definitions_original_ref=None): # pragma: no cover self._resolved_trees = self.tree_cache_manager.load_cache() self._resolved_buildsystems = \ self.buildsystem_cache_manager.load_cache() @@ -453,7 +453,7 @@ class SourceResolver(object): def create_source_pool(lrc, rrc, repo, ref, filename, cachedir, original_ref=None, update_repos=True, - status_cb=None): + status_cb=None): # pragma: no cover '''Find all the sources involved in building a given system. Given a system morphology, this function will traverse the tree of stratum -- cgit v1.2.1