summaryrefslogtreecommitdiff
path: root/morphlib/repocache.py
diff options
context:
space:
mode:
authorSam Thursfield <sam.thursfield@codethink.co.uk>2016-03-02 17:11:34 +0000
committerSam Thursfield <sam.thursfield@codethink.co.uk>2016-03-16 19:11:21 +0000
commitd58d8e8f7a4ec03ff14021a4515c8283dad52573 (patch)
tree5ece2d0524e4423bb953e6140831c9fde93b7219 /morphlib/repocache.py
parent014a029ade9a045a839ca86c35690b218098ea33 (diff)
downloadmorph-d58d8e8f7a4ec03ff14021a4515c8283dad52573.tar.gz
Unify local and remote repo cache modules
There's not really any reason you'd want to use the RemoteRepoCache class except as a workaround for the slow speed of some LocalRepoCache operations, so I can't see this ruining anyone's day. The main reason for doing this is so we can simplify the sourceresolver code. One reason that the sourceresolver class is so hopelessly complicated is that right now it has to use two incompatible interfaces for Git repo caches. I've taken the opportunity to detangle the RepoCache class from the App class. Now all of the configuration for the RepoCache class is passed into the constructor explicitly. This makes the class usable from outside Morph: resolver = morphlib.repoaliasresolver.RepoAliasResolver(aliases=[]) repo_cache = morphlib.repocache.RepoCache('/src/cache/gits', resolver) Change-Id: I596c81d7645b67504c88e555172a8c238f4f8a66
Diffstat (limited to 'morphlib/repocache.py')
-rw-r--r--morphlib/repocache.py565
1 files changed, 565 insertions, 0 deletions
diff --git a/morphlib/repocache.py b/morphlib/repocache.py
new file mode 100644
index 00000000..f6978ec4
--- /dev/null
+++ b/morphlib/repocache.py
@@ -0,0 +1,565 @@
+# Copyright (C) 2012-2016 Codethink Limited
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import cliapp
+import fs.osfs
+
+import json
+import logging
+import os
+import string
+import sys
+import tempfile
+import urllib2
+import urlparse
+import urllib
+
+import morphlib
+from morphlib.util import word_join_list as _word_join_list
+
+
+# urlparse.urljoin needs to know details of the URL scheme being used.
+# It does not know about git:// by default, so we teach it here.
+gitscheme = ['git']
+urlparse.uses_relative.extend(gitscheme)
+urlparse.uses_netloc.extend(gitscheme)
+urlparse.uses_params.extend(gitscheme)
+urlparse.uses_query.extend(gitscheme)
+urlparse.uses_fragment.extend(gitscheme)
+
+
def quote_url(url):
    '''Convert a URI to a string containing only digits, letters, % and _.

    Every character of 'url' that is not an ASCII digit, an ASCII letter,
    '%' or '_' is replaced by '_'. The result has the same length as the
    input.

    string.ascii_letters is used rather than string.letters: the latter is
    locale-dependent in Python 2, which would make the escaped name depend
    on the locale of the running process. In the default locale both give
    the same result.

    NOTE: When changing the code of this function, make sure to also apply
    the same to the quote_url() function of lorry. Otherwise the git tarballs
    generated by lorry may no longer be found by morph.

    '''
    valid_chars = string.digits + string.ascii_letters + '%_'
    return ''.join(c if c in valid_chars else '_' for c in url)
+
+
class NoRemote(morphlib.Error):
    '''Error for a repository that could not be obtained from a remote.

    'reponame' is the name of the repository and 'errors' is a list of
    message strings, one for each attempt that failed.

    '''

    def __init__(self, reponame, errors):
        self.reponame = reponame
        self.errors = errors

    def __str__(self):
        # The headline comes first, followed by one tab-indented line for
        # each recorded error.
        headline = 'Cannot find remote git repository: %s' % self.reponame
        lines = [headline] + self.errors
        return '\n\t'.join(lines)
+
+
class NotCached(morphlib.Error):
    '''Error for a repository that has no copy in the local cache.'''

    def __init__(self, reponame):
        self.reponame = reponame

    def __str__(self):  # pragma: no cover
        template = 'Repository %s is not cached yet'
        return template % self.reponame
+
+
class UpdateError(cliapp.AppException):  # pragma: no cover
    '''Error for a cached repository that could not be updated.

    'repo' is interpolated into the message with %s, so anything with a
    useful string form can be passed.

    '''

    def __init__(self, repo):
        message = 'Failed to update cached version of repo %s' % repo
        cliapp.AppException.__init__(self, message)
+
+
class CachedRepo(morphlib.gitdir.GitDirectory):
    '''A locally cached Git repository with an origin remote set up.

    One instance of this class represents a locally cached version of a
    remote Git repository. This remote repository is set up as the
    'origin' remote.

    Cached repositories are bare mirrors of the upstream. Locally created
    branches will be lost the next time the repository updates.

    Attributes set by the constructor:

    * original_name: the repository name as the caller supplied it
    * url: the URL the repository is fetched from
    * is_mirror: False when 'url' is a file:// URL, True otherwise
    * already_updated: starts out False; set to True by code that has
      just fetched or updated the repository

    '''

    def __init__(self, path, original_name, url):
        is_local_url = url.startswith('file://')

        self.url = url
        self.original_name = original_name
        self.already_updated = False
        self.is_mirror = not is_local_url

        super(CachedRepo, self).__init__(path)

    def __str__(self):  # pragma: no cover
        return self.url
+
+
class RepoCache(object):
    '''Manage a collection of Git repositories.

    When we build stuff, we need a local copy of the git repository.
    To avoid having to clone the repositories for every build, we
    maintain a local cache of the repositories: we first clone the
    remote repository to the cache, and then make a local clone from
    the cache to the build environment. This class manages the local
    cached repositories.

    Repositories may be specified either using a full URL, in a form
    understood by git(1), or as a repository name to which a base url
    is prepended. The base urls are given to the class when it is
    created.

    Instead of cloning via a normal 'git clone' directly from the
    git server, we first try to download a tarball from a url, and
    if that works, we unpack the tarball.

    Certain questions about a repo can be resolved without cloning the whole
    thing, if an instance of 'morph-cache-server' is available on the remote
    Git server. This makes calculating the build graph for the first time
    a whole lot faster, as we avoid cloning every repo locally. The
    git_resolve_cache_url parameter enables this feature. Baserock 'Trove'
    systems run 'morph-cache-server' by default.

    The 'custom_fs' parameter takes a PyFilesystem instance, which you can use
    to override where 'cachedir' is stored. This should probably only be used
    for testing.

    Constructor parameters:

    * cachedir: directory in which the cached repositories live; it is
      created if it does not exist yet
    * resolver: object whose pull_url(reponame) method returns the URL to
      fetch a repository from
    * tarball_base_url: optional base URL for the tarballs used to seed new
      cache entries; a trailing '/' is added if missing
    * git_resolve_cache_url: optional URL of a 'morph-cache-server'
    * update_gits: when False, nothing is cloned or updated; corresponds to
      the app 'no-git-update' setting
    * runcmd_cb: callable used to run external commands
    * status_cb: callable accepting keyword arguments, used for progress
      messages
    * verbose: show output of tarball downloads and repository updates
    * debug: show the stderr output of 'git clone'

    '''

    def __init__(self, cachedir, resolver, tarball_base_url=None,
                 git_resolve_cache_url=None,
                 update_gits=True,
                 runcmd_cb=cliapp.runcmd, status_cb=lambda **kwargs: None,
                 verbose=False, debug=False,
                 custom_fs=None):
        self.fs = custom_fs or fs.osfs.OSFS('/')

        self.fs.makedir(cachedir, recursive=True, allow_recreate=True)

        self.cachedir = cachedir
        self._resolver = resolver
        # urljoin() replaces the last path component of a base URL that
        # does not end in '/', so normalise the base URL up front.
        if tarball_base_url and not tarball_base_url.endswith('/'):
            tarball_base_url += '/'
        self._tarball_base_url = tarball_base_url
        self._cached_repo_objects = {}

        # Corresponds to the app 'no-git-update' setting
        self.update_gits = update_gits

        self.runcmd_cb = runcmd_cb
        self.status_cb = status_cb
        self.verbose = verbose
        self.debug = debug

        if git_resolve_cache_url:  # pragma: no cover
            self.remote_cache = RemoteRepoCache(git_resolve_cache_url,
                                                resolver)
        else:
            self.remote_cache = None

    def _git(self, args, **kwargs):  # pragma: no cover
        '''Execute git command.

        This is a method of its own so that unit tests can easily override
        all use of the external git command.

        '''

        morphlib.git.gitcmd(self.runcmd_cb, *args, **kwargs)

    def _fetch(self, url, path):  # pragma: no cover
        '''Download the tarball at 'url' and unpack it into 'path'.

        The download is piped straight into tar, so the tarball itself is
        never stored on disk.

        This method is meant to be overridden by unit tests.

        '''
        self.status_cb(msg="Trying to fetch %(tarball)s to seed the cache",
                       tarball=url, chatty=True)

        if self.verbose:
            verbosity_flags = []
            kwargs = dict(stderr=sys.stderr)
        else:
            verbosity_flags = ['--quiet']
            kwargs = dict()

        wget_argv = ['wget'] + verbosity_flags + ['-O-', url]
        tar_argv = ['tar', '--no-same-owner', '-xf', '-']

        self.runcmd_cb(wget_argv, tar_argv, cwd=path, **kwargs)

    def _mkdtemp(self, dirname):  # pragma: no cover
        '''Creates a temporary directory.

        This method is meant to be overridden by unit tests.

        '''
        return tempfile.mkdtemp(dir=self.fs.getsyspath(dirname))

    def _escape(self, url):
        '''Escape a URL so it can be used as a basename in a file.'''

        # FIXME: urllib.quote(url, safe='') would be a nicer way to do
        # this. However, for compatibility, we need to use the same
        # escaping as the tarball server (set up by Lorry) uses.

        return quote_url(url)

    def _cache_name(self, url):
        '''Return the path at which the repository for 'url' is kept.

        file:// URLs are used in place, so their own path is returned.
        Every other URL maps to an escaped name inside the cache directory.

        '''
        parts = urlparse.urlsplit(url)
        if parts.scheme == 'file':
            return parts.path
        return os.path.join(self.cachedir, self._escape(url))

    def has_repo(self, reponame):
        '''Have we already got a cache of a given repo?'''
        url = self._resolver.pull_url(reponame)
        path = self._cache_name(url)
        return self.fs.exists(path)

    def _clone_with_tarball(self, repourl, path):
        '''Try to seed the cache entry at 'path' from a tarball.

        Returns (True, None) on success and (False, error_message) on
        failure. On failure the partially populated directory is removed.

        '''
        tarball_url = urlparse.urljoin(self._tarball_base_url,
                                       self._escape(repourl)) + '.tar'
        try:
            self.fs.makedir(path)
            self._fetch(tarball_url, path)
            self._git(['config', 'remote.origin.url', repourl], cwd=path)
            self._git(['config', 'remote.origin.mirror', 'true'], cwd=path)
            self._git(['config', 'remote.origin.fetch', '+refs/*:refs/*'],
                      cwd=path)
        except BaseException as e:
            # Always clean up, whatever went wrong, so that a broken
            # directory is never mistaken for a cached repository.
            if self.fs.exists(path):
                self.fs.removedir(path, force=True)
            # KeyboardInterrupt, SystemExit and friends must not be turned
            # into "tarball failed, fall back to git clone".
            if not isinstance(e, Exception):
                raise
            return False, 'Unable to extract tarball %s: %s' % (
                tarball_url, e)

        return True, None

    def _new_cached_repo_instance(self, path, reponame, repourl):
        '''Create the object representing a cached repository.'''
        return CachedRepo(path, reponame, repourl)

    def _cache_repo(self, reponame):
        '''Clone the given repo into the cache.

        A tarball is tried first when a tarball base URL was configured;
        a repository seeded that way is updated straight away. Otherwise,
        or if the tarball could not be used, 'git clone --mirror' is run
        into a temporary directory which is then renamed into place.

        This does not check whether the repo is cached already; that is
        left to the caller.

        Raises NoRemote if the repository could not be cloned.

        '''
        errors = []

        repourl = self._resolver.pull_url(reponame)
        path = self._cache_name(repourl)
        if self._tarball_base_url:
            ok, error = self._clone_with_tarball(repourl, path)
            if ok:
                repo = self._get_repo(reponame)
                self._update_repo(repo)
                return repo
            else:
                errors.append(error)
                self.status_cb(msg='Using git clone.')

        target = self._mkdtemp(self.cachedir)

        try:
            self._git(['clone', '--mirror', '-n', repourl, target],
                      echo_stderr=self.debug)
        except cliapp.AppException as e:
            errors.append('Unable to clone from %s to %s: %s' %
                          (repourl, target, e))
            if self.fs.exists(target):
                self.fs.removedir(target, force=True)
            raise NoRemote(reponame, errors)

        self.fs.rename(target, path)

        repo = self._new_cached_repo_instance(path, reponame, repourl)
        # A fresh clone needs no further update.
        repo.already_updated = True
        return repo

    def _get_repo(self, reponame):
        '''Return an object representing a cached repository.

        If the repository is not cached yet it is cloned, unless updates
        are disabled, in which case NotCached is raised.

        '''
        if reponame in self._cached_repo_objects:
            return self._cached_repo_objects[reponame]

        repourl = self._resolver.pull_url(reponame)
        path = self._cache_name(repourl)
        if self.fs.exists(path):
            repo = self._new_cached_repo_instance(path, reponame, repourl)
            self._cached_repo_objects[reponame] = repo
            return repo

        if self.update_gits:
            return self._cache_repo(reponame)

        raise NotCached(reponame)

    def _update_repo(self, cachedrepo):  # pragma: no cover
        '''Update the remotes of 'cachedrepo'.

        Raises UpdateError, naming the repository, if the update fails.

        '''
        try:
            cachedrepo.update_remotes(
                echo_stderr=self.verbose)
            cachedrepo.already_updated = True
        except cliapp.AppException:
            # Report the repository that failed, not this cache object.
            raise UpdateError(cachedrepo)

    def get_updated_repo(self, repo_name,
                         ref=None, refs=None):
        '''Return object representing cached repository.

        If all the specified refs in 'ref' or 'refs' point to SHA1s that are
        already in the repository, or --no-git-update is set, then the
        repository won't be updated.

        'ref' is a single ref and 'refs' an iterable of refs; when 'refs'
        is given it is what gets checked. If neither is given, an existing
        repository is always updated.

        '''

        if not self.update_gits:
            self.status_cb(msg='Not updating existing git repository '
                               '%(repo_name)s '
                               'because of no-git-update being set',
                           chatty=True,
                           repo_name=repo_name)
            return self._get_repo(repo_name)

        if refs is not None:
            refs = list(refs)
        elif ref is not None:
            refs = [ref]
        else:
            # Neither was given: there is nothing to check, so fall
            # through to an unconditional update.
            refs = []

        if self.has_repo(repo_name):
            repo = self._get_repo(repo_name)
            if refs:
                required_refs = set(refs)
                missing_refs = set()
                for required_ref in required_refs:  # pragma: no cover
                    # Only a full SHA1 can be known to be present and
                    # final; named refs may have moved upstream, so they
                    # always count as missing.
                    if morphlib.git.is_valid_sha1(required_ref):
                        try:
                            repo.resolve_ref_to_commit(required_ref)
                            continue
                        except morphlib.gitdir.InvalidRefError:
                            pass
                    missing_refs.add(required_ref)

                if not missing_refs:  # pragma: no cover
                    self.status_cb(
                        msg='Not updating git repository %(repo_name)s '
                            'because it already contains %(sha1s)s',
                        chatty=True, repo_name=repo_name,
                        sha1s=_word_join_list(tuple(required_refs)))
                    return repo

            if ref:
                ref_str = 'ref %s' % ref
            else:
                ref_str = '%i refs' % len(refs)
            self.status_cb(msg='Updating %(repo_name)s for %(ref_str)s',
                           repo_name=repo_name, ref_str=ref_str)
            self._update_repo(repo)
            return repo
        else:
            self.status_cb(msg='Cloning %(repo_name)s', repo_name=repo_name)
            return self._get_repo(repo_name)

    def ensure_submodules(self, toplevel_repo,
                          toplevel_ref):  # pragma: no cover
        '''Ensure any submodules of a given repo are cached and up to date.

        Submodules of submodules are followed as well.

        '''

        def submodules_for_repo(repo_path, ref):
            try:
                submodules = morphlib.git.Submodules(repo_path, ref,
                                                     runcmd_cb=self.runcmd_cb)
                submodules.load()
                return [(submod.url, submod.commit) for submod in submodules]
            except morphlib.git.NoModulesFileError:
                return []

        done = set()
        subs_to_process = submodules_for_repo(toplevel_repo.dirname,
                                              toplevel_ref)
        while subs_to_process:
            url, ref = subs_to_process.pop()
            if (url, ref) in done:
                # The same submodule can be queued more than once before
                # it is first processed; handle it only once.
                continue
            done.add((url, ref))

            cached_repo = self.get_updated_repo(url, ref=ref)

            for submod in submodules_for_repo(cached_repo.dirname, ref):
                if submod not in done:
                    subs_to_process.append(submod)

    def resolve_ref_to_commit_and_tree(self, repo_name,
                                       ref):  # pragma: no cover
        '''Given the name of a ref, returns the commit and tree SHA1.

        If a remote cache server is available, this function can query the
        remote cache server to avoid needing to clone the entire repo.

        This might break if the ref points to a tag, not a commit.

        '''
        absref = None
        tree = None

        if self.has_repo(repo_name):
            repo = self.get_updated_repo(repo_name, ref)
            # If the user passed --no-git-update, and the ref is a SHA1 not
            # available locally, this call will raise an exception.
            absref = repo.resolve_ref_to_commit(ref)
            tree = repo.resolve_ref_to_tree(absref)
        elif self.remote_cache is not None:
            try:
                absref, tree = self.remote_cache.resolve_ref(repo_name, ref)
                if absref is not None:
                    self.status_cb(
                        msg='Resolved %(repo_name)s %(ref)s via remote repo '
                            'cache', repo_name=repo_name, ref=ref, chatty=True)
            except Exception as e:
                # The remote cache is only an optimisation, so failures
                # are ignored and the repo is cloned below instead.
                # KeyboardInterrupt and SystemExit still propagate.
                logging.warning('Caught (and ignored) exception: %s', e)

        if absref is None:
            # As a last resort, clone the repo to resolve the ref.
            repo = self.get_updated_repo(repo_name, ref)
            absref = repo.resolve_ref_to_commit(ref)
            tree = repo.resolve_ref_to_tree(absref)

        return absref, tree

    def ls_tree(self, repo_name, ref):  # pragma: no cover
        '''Lists the files contained in a commit.

        If a remote cache server is available, this function can query the
        remote cache server to avoid needing to clone the entire repo.

        The list is non-recursive, so you can only see files in the top
        directory of the repo. To do a recursive operation, use a GitDir
        instance returned by get_updated_repo().

        Errors raised by the remote cache server query are not caught
        here.

        '''
        files = []

        if self.has_repo(repo_name):
            repo = self.get_updated_repo(repo_name, ref)
            files = repo.list_files(ref=ref, recurse=False)
        elif self.remote_cache is not None:
            files = self.remote_cache.ls_tree(repo_name, ref)

        if not files:
            # As a last resort, clone the repo to get the file list.
            repo = self.get_updated_repo(repo_name, ref)
            files = repo.list_files(ref=ref, recurse=False)

        return files

    def cat_file(self, repo_name, ref, filename):  # pragma: no cover
        '''Returns a single file from a repo.

        If a remote cache server is available, this function can query the
        remote cache server to avoid needing to clone the entire repo.

        Errors raised by the remote cache server query are not caught
        here.

        '''
        contents = None

        if self.has_repo(repo_name):
            repo = self.get_updated_repo(repo_name, ref)
            contents = repo.get_file_from_ref(ref, filename)
        elif self.remote_cache is not None:
            contents = self.remote_cache.cat_file(repo_name, ref, filename)

        if not contents:
            # As a last resort, clone the repo to get the file contents.
            repo = self.get_updated_repo(repo_name, ref)
            contents = repo.get_file_from_ref(ref, filename)

        return contents
+
+
class RemoteResolveRefError(cliapp.AppException):
    '''Error for a ref that the remote cache could not resolve.'''

    def __init__(self, repo_name, ref):
        template = 'Failed to resolve ref %s for repo %s from remote cache'
        cliapp.AppException.__init__(self, template % (ref, repo_name))
+
+
class RemoteCatFileError(cliapp.AppException):
    '''Error for a file that could not be read from the remote cache.'''

    def __init__(self, repo_name, ref, filename):
        template = ('Failed to cat file %s in ref %s of repo %s, from remote '
                    'cache')
        message = template % (filename, ref, repo_name)
        cliapp.AppException.__init__(self, message)
+
+
class RemoteLsTreeError(cliapp.AppException):
    '''Error for a tree that could not be listed via the remote cache.'''

    def __init__(self, repo_name, ref):
        # The two literals are concatenated by the parser, so the first one
        # must end in a space to keep 'remote' and 'cache' apart.
        cliapp.AppException.__init__(
            self, 'Failed to list tree in ref %s of repo %s, from remote '
                  'cache' % (ref, repo_name))
+
+
class RemoteRepoCache(object):
    '''Answer questions about Git repositories by asking a remote server.

    Requests are sent over HTTP to paths below '/1.0/' on 'server_url'.
    'resolver' is an object whose pull_url(repo_name) method returns the
    URL of a repository; that URL is what identifies the repository to
    the server.

    '''

    def __init__(self, server_url, resolver):
        self.server_url = server_url
        self._resolver = resolver

    def resolve_ref(self, repo_name, ref):
        '''Return the (commit SHA1, tree SHA1) pair that 'ref' points at.

        Raises RemoteResolveRefError if the request or the decoding of its
        response fails.

        '''
        repo_url = self._resolver.pull_url(repo_name)
        try:
            return self._resolve_ref_for_repo_url(repo_url, ref)
        except Exception as e:
            # Exception rather than BaseException, so that Ctrl-C and
            # SystemExit are not converted into a lookup failure.
            logging.error('Caught exception: %s' % str(e))
            raise RemoteResolveRefError(repo_name, ref)

    def cat_file(self, repo_name, ref, filename):
        '''Return the contents of 'filename' at 'ref'.

        Raises RemoteCatFileError if the server answers with HTTP 404.
        Any other HTTP error is logged and re-raised unchanged.

        '''
        repo_url = self._resolver.pull_url(repo_name)
        try:
            return self._cat_file_for_repo_url(repo_url, ref, filename)
        except urllib2.HTTPError as e:
            logging.error('Caught exception: %s' % str(e))
            if e.code == 404:
                raise RemoteCatFileError(repo_name, ref, filename)
            raise  # pragma: no cover

    def ls_tree(self, repo_name, ref):
        '''Return the names of the entries in the tree of 'ref'.

        The names are the keys of the 'tree' object in the server's JSON
        response.

        Raises RemoteLsTreeError if the request or the decoding of its
        response fails.

        '''
        repo_url = self._resolver.pull_url(repo_name)
        try:
            info = json.loads(self._ls_tree_for_repo_url(repo_url, ref))
            return list(info['tree'])
        except Exception as e:
            # Exception rather than BaseException, so that Ctrl-C and
            # SystemExit are not converted into a lookup failure.
            logging.error('Caught exception: %s' % str(e))
            raise RemoteLsTreeError(repo_name, ref)

    def _resolve_ref_for_repo_url(self, repo_url, ref):  # pragma: no cover
        '''Ask the server for the commit and tree SHA1s of 'ref'.'''
        data = self._make_request(
            'sha1s?repo=%s&ref=%s' % self._quote_strings(repo_url, ref))
        info = json.loads(data)
        return info['sha1'], info['tree']

    def _cat_file_for_repo_url(self, repo_url, ref,
                               filename):  # pragma: no cover
        '''Ask the server for the raw contents of 'filename' at 'ref'.'''
        return self._make_request(
            'files?repo=%s&ref=%s&filename=%s'
            % self._quote_strings(repo_url, ref, filename))

    def _ls_tree_for_repo_url(self, repo_url, ref):  # pragma: no cover
        '''Ask the server for the tree listing of 'ref', as a JSON string.'''
        return self._make_request(
            'trees?repo=%s&ref=%s' % self._quote_strings(repo_url, ref))

    def _quote_strings(self, *args):  # pragma: no cover
        '''Return a tuple of the arguments, each URL-quoted.'''
        return tuple(urllib.quote(arg) for arg in args)

    def _make_request(self, path):  # pragma: no cover
        '''Fetch 'path' from the server and return the response body.'''
        server_url = self.server_url
        if not server_url.endswith('/'):
            server_url += '/'
        # NOTE: '/1.0/...' is an absolute path, so urljoin() discards any
        # path component that server_url itself may have.
        url = urlparse.urljoin(server_url, '/1.0/%s' % path)
        handle = urllib2.urlopen(url)
        try:
            return handle.read()
        finally:
            # Release the connection even if reading fails.
            handle.close()