From 88f0a35ba71044b84161e3d8d9e6bbb1314118cd Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Thu, 6 Nov 2014 15:55:19 +0000 Subject: Add an LRU cache for resolved tree refs This uses the PyLRU module, from: . Python 3.2 and newer provide a built-in LRU cache, but this is specifically for in-memory use. See . Git commits are immutable, so caching information about their contents is fairly easy and trouble-free. There's no danger of the cache becoming stale. --- morphlib/buildcommand.py | 1 + morphlib/plugins/list_artifacts_plugin.py | 3 +- morphlib/sourceresolver.py | 97 ++++++++++++++++++++++--------- 3 files changed, 73 insertions(+), 28 deletions(-) diff --git a/morphlib/buildcommand.py b/morphlib/buildcommand.py index a22e689b..d121b895 100644 --- a/morphlib/buildcommand.py +++ b/morphlib/buildcommand.py @@ -96,6 +96,7 @@ class BuildCommand(object): self.app.status(msg='Creating source pool', chatty=True) srcpool = morphlib.sourceresolver.create_source_pool( self.lrc, self.rrc, repo_name, ref, filename, + cachedir=self.app.settings['cachedir'], original_ref=original_ref, update_repos=not self.app.settings['no-git-update'], status_cb=self.app.status) diff --git a/morphlib/plugins/list_artifacts_plugin.py b/morphlib/plugins/list_artifacts_plugin.py index 6944cff4..53056bad 100644 --- a/morphlib/plugins/list_artifacts_plugin.py +++ b/morphlib/plugins/list_artifacts_plugin.py @@ -1,4 +1,4 @@ -# Copyright (C) 2014 Codethink Limited +# Copyright (C) 2014-2015 Codethink Limited # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -86,6 +86,7 @@ class ListArtifactsPlugin(cliapp.Plugin): msg='Creating source pool for %s' % system_filename, chatty=True) source_pool = morphlib.sourceresolver.create_source_pool( self.lrc, self.rrc, repo, ref, system_filename, + cachedir=self.app.settings['cachedir'], update_repos = not self.app.settings['no-git-update'], 
status_cb=self.app.status) diff --git a/morphlib/sourceresolver.py b/morphlib/sourceresolver.py index 35f65729..1ac0cecb 100644 --- a/morphlib/sourceresolver.py +++ b/morphlib/sourceresolver.py @@ -19,6 +19,7 @@ import cliapp import collections import cPickle import logging +import os import pylru import morphlib @@ -93,9 +94,10 @@ class NotcachedError(SourceResolverError): class SourceResolver(object): '''Provides a way of resolving the set of sources for a given system. - There are two levels of caching involved in resolving the sources to build. + There are three levels of caching involved in resolving the sources to + build. - The canonical source for each source is specified in the build-command + The canonical repo for each source is specified in the build-command (for strata and systems) or in the stratum morphology (for chunks). It will be either a normal URL, or a keyed URL using a repo-alias like 'baserock:baserock/definitions'. @@ -112,27 +114,55 @@ class SourceResolver(object): entire repositories in $cachedir/gits. If a repo is not in the remote repo cache then it must be present in the local repo cache. + The third layer of caching is a simple commit SHA1 -> tree SHA mapping. It + turns out that even if all repos are available locally, running + 'git rev-parse' on hundreds of repos requires a lot of IO and can take + several minutes. Likewise, on a slow network connection it is time + consuming to keep querying the remote repo cache. This third layer of + caching works around both of those issues. + + The need for 3 levels of caching highlights design inconsistencies in + Baserock, but for now it is worth the effort to maintain this code to save + users from waiting 7 minutes each time that they want to build. The level 3 + cache is fairly simple because commits are immutable, so there is no danger + of this cache being stale as long as it is indexed by commit SHA1. 
Due to + the policy in Baserock of always using a commit SHA1 (rather than a named + ref) in the system definitions, it makes repeated builds of a system very + fast as no resolution needs to be done at all. + ''' - def __init__(self, local_repo_cache, remote_repo_cache, update_repos, - status_cb=None): + def __init__(self, local_repo_cache, remote_repo_cache, + tree_cache_manager, update_repos, status_cb=None): self.lrc = local_repo_cache self.rrc = remote_repo_cache + self.tree_cache_manager = tree_cache_manager self.update = update_repos - self.status = status_cb + self._resolved_trees = {} self._resolved_morphologies = {} - def resolve_ref(self, reponame, ref): + def _resolve_ref(self, reponame, ref): '''Resolves commit and tree sha1s of the ref in a repo and returns it. - If update is True then this has the side-effect of updating - or cloning the repository into the local repo cache. + If update is True then this has the side-effect of updating or cloning + the repository into the local repo cache. + + This function is complex due to the 3 layers of caching described in + the SourceResolver docstring. + ''' - absref = None + # The Baserock reference definitions use absolute refs, so if the + # absref is cached, we can short-circuit all this code. + if (reponame, ref) in self._resolved_trees: + logging.debug('Returning tree (%s, %s) from tree cache', + reponame, ref) + return ref, self._resolved_trees[(reponame, ref)] + + absref = None if self.lrc.has_repo(reponame): repo = self.lrc.get_repo(reponame) if self.update and repo.requires_update_for_ref(ref): @@ -161,9 +191,15 @@ class SourceResolver(object): repo = self.lrc.cache_repo(reponame) repo.update() else: + # This is likely to raise an exception, because if the local + # repo cache had the repo we'd have already resolved the ref.
repo = self.lrc.get_repo(reponame) absref = repo.resolve_ref_to_commit(ref) tree = repo.resolve_ref_to_tree(absref) + + logging.debug('Writing tree to cache with ref (%s, %s)', reponame, ref) + self._resolved_trees[(reponame, absref)] = tree + return absref, tree def _get_morphology(self, reponame, sha1, filename): @@ -174,23 +210,24 @@ class SourceResolver(object): morph_name = os.path.splitext(os.path.basename(filename))[0] loader = morphlib.morphloader.MorphologyLoader() - if self._lrc.has_repo(reponame): - self.status(msg="Looking for %s in local repo cache" % filename, - chatty=True) + if self.lrc.has_repo(reponame): + self.status(msg="Looking for %(reponame)s:%(filename)s in local " + "repo cache", + reponame=reponame, filename=filename, chatty=True) try: - repo = self._lrc.get_repo(reponame) + repo = self.lrc.get_repo(reponame) text = repo.read_file(filename, sha1) morph = loader.load_from_string(text) except IOError: morph = None file_list = repo.list_files(ref=sha1, recurse=False) - elif self._rrc is not None: - self.status(msg="Retrieving %(reponame)s %(sha1)s %(filename)s" - " from the remote git cache.", + elif self.rrc is not None: + self.status(msg="Looking for %(reponame)s %(sha1)s %(filename)s" + " in the remote git cache.", reponame=reponame, sha1=sha1, filename=filename, chatty=True) try: - text = self._rrc.cat_file(reponame, sha1, filename) + text = self.rrc.cat_file(reponame, sha1, filename) morph = loader.load_from_string(text) except morphlib.remoterepocache.CatFileError: morph = None @@ -222,11 +259,13 @@ class SourceResolver(object): chunk_in_source_repo_queue = [] resolved_commits = {} - resolved_trees = {} + + self._resolved_trees = self.tree_cache_manager.load_cache() + resolved_morphologies = {} # Resolve the (repo, ref) pair for the definitions repo, cache result. 
- definitions_absref, definitions_tree = self.resolve_ref( + definitions_absref, definitions_tree = self._resolve_ref( definitions_repo, definitions_ref) if definitions_original_ref: @@ -272,12 +311,7 @@ class SourceResolver(object): # only) those with the morphology in the chunk's source repository. def process_chunk(repo, ref, filename): - if (repo, ref) not in resolved_trees: - commit_sha1, tree_sha1 = self.resolve_ref(repo, ref) - resolved_commits[repo, ref] = commit_sha1 - resolved_trees[repo, commit_sha1] = tree_sha1 - absref = resolved_commits[repo, ref] - tree = resolved_trees[repo, absref] + absref, tree = self._resolve_ref(repo, ref) key = (definitions_repo, definitions_absref, filename) morphology = self._get_morphology(*key) visit(repo, ref, filename, absref, tree, morphology) @@ -288,8 +322,11 @@ class SourceResolver(object): for repo, ref, filename in chunk_in_source_repo_queue: process_chunk_repo(repo, ref, filename) + logging.debug('Saving contents of resolved tree cache') + self.tree_cache_manager.save_cache(self._resolved_trees) + -def create_source_pool(lrc, rrc, repo, ref, filename, +def create_source_pool(lrc, rrc, repo, ref, filename, cachedir, original_ref=None, update_repos=True, status_cb=None): '''Find all the sources involved in building a given system. @@ -315,7 +352,13 @@ def create_source_pool(lrc, rrc, repo, ref, filename, for source in sources: pool.add(source) - resolver = SourceResolver(lrc, rrc, update_repos, status_cb) + update_repos = not app.settings['no-git-update'] + + tree_cache_size = 10000 + tree_cache_manager = PickleCacheManager( + os.path.join(cachedir, 'trees.cache.pickle'), tree_cache_size) + + resolver = SourceResolver(lrc, rrc, tree_cache_manager, update_repos, status_cb) resolver.traverse_morphs(repo, ref, [filename], visit=add_to_pool, definitions_original_ref=original_ref) -- cgit v1.2.1