diff options
Diffstat (limited to 'morphlib/repocache.py')
-rw-r--r-- | morphlib/repocache.py | 565 |
1 files changed, 565 insertions, 0 deletions
# Copyright (C) 2012-2016 Codethink Limited
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.


import cliapp
import fs.osfs

import json
import logging
import os
import string
import sys
import tempfile
import urllib2
import urlparse
import urllib

import morphlib
from morphlib.util import word_join_list as _word_join_list


# urlparse.urljoin needs to know details of the URL scheme being used.
# It does not know about git:// by default, so we teach it here.
gitscheme = ['git']
urlparse.uses_relative.extend(gitscheme)
urlparse.uses_netloc.extend(gitscheme)
urlparse.uses_params.extend(gitscheme)
urlparse.uses_query.extend(gitscheme)
urlparse.uses_fragment.extend(gitscheme)


def quote_url(url):
    ''' Convert URIs to strings that only contain digits, letters, % and _.

    Every character of 'url' that is not a digit, a letter, '%' or '_'
    is replaced with '_'.

    NOTE: When changing the code of this function, make sure to also apply
    the same to the quote_url() function of lorry. Otherwise the git tarballs
    generated by lorry may no longer be found by morph.

    '''
    valid_chars = string.digits + string.letters + '%_'
    return ''.join(c if c in valid_chars else '_' for c in url)


class NoRemote(morphlib.Error):
    '''Raised when a repository could not be cloned from any source.

    'errors' is the list of error messages collected while trying the
    individual sources; they are appended to the string form.

    '''

    def __init__(self, reponame, errors):
        self.reponame = reponame
        self.errors = errors

    def __str__(self):
        return '\n\t'.join(['Cannot find remote git repository: %s' %
                            self.reponame] + self.errors)


class NotCached(morphlib.Error):
    '''Raised when a repository is not in the cache and may not be cloned.'''

    def __init__(self, reponame):
        self.reponame = reponame

    def __str__(self):  # pragma: no cover
        return 'Repository %s is not cached yet' % self.reponame


class UpdateError(cliapp.AppException):  # pragma: no cover
    '''Raised when updating a cached repository from its remote fails.'''

    def __init__(self, repo):
        cliapp.AppException.__init__(
            self, 'Failed to update cached version of repo %s' % repo)


class CachedRepo(morphlib.gitdir.GitDirectory):
    '''A locally cached Git repository with an origin remote set up.

    One instance of this class represents a locally cached version of a
    remote Git repository. This remote repository is set up as the
    'origin' remote.

    Cached repositories are bare mirrors of the upstream. Locally created
    branches will be lost the next time the repository updates.

    '''
    def __init__(self, path, original_name, url):
        self.original_name = original_name
        self.url = url
        # Anything that is not a file:// URL is treated as a mirror.
        self.is_mirror = not url.startswith('file://')
        self.already_updated = False

        super(CachedRepo, self).__init__(path)

    def __str__(self):  # pragma: no cover
        return self.url


class RepoCache(object):
    '''Manage a collection of Git repositories.

    When we build stuff, we need a local copy of the git repository.
    To avoid having to clone the repositories for every build, we
    maintain a local cache of the repositories: we first clone the
    remote repository to the cache, and then make a local clone from
    the cache to the build environment. This class manages the local
    cached repositories.

    Repositories may be specified either using a full URL, in a form
    understood by git(1), or as a repository name to which a base url
    is prepended. The base urls are given to the class when it is
    created.

    Instead of cloning via a normal 'git clone' directly from the
    git server, we first try to download a tarball from a url, and
    if that works, we unpack the tarball.

    Certain questions about a repo can be resolved without cloning the whole
    thing, if an instance of 'morph-cache-server' is available on the remote
    Git server. This makes calculating the build graph for the first time
    a whole lot faster, as we avoid cloning every repo locally. The
    git_resolve_cache_url parameter enables this feature. Baserock 'Trove'
    systems run 'morph-cache-server' by default.

    The 'custom_fs' parameter takes a PyFilesystem instance, which you can use
    to override where 'cachedir' is stored. This should probably only be used
    for testing.

    '''
    def __init__(self, cachedir, resolver, tarball_base_url=None,
                 git_resolve_cache_url=None,
                 update_gits=True,
                 runcmd_cb=cliapp.runcmd, status_cb=lambda **kwargs: None,
                 verbose=False, debug=False,
                 custom_fs=None):
        self.fs = custom_fs or fs.osfs.OSFS('/')

        self.fs.makedir(cachedir, recursive=True, allow_recreate=True)

        self.cachedir = cachedir
        self._resolver = resolver
        # Make sure urljoin() treats the base URL as a directory.
        if tarball_base_url and not tarball_base_url.endswith('/'):
            tarball_base_url += '/'
        self._tarball_base_url = tarball_base_url
        self._cached_repo_objects = {}

        # Corresponds to the app 'no-git-update' setting
        self.update_gits = update_gits

        self.runcmd_cb = runcmd_cb
        self.status_cb = status_cb
        self.verbose = verbose
        self.debug = debug

        if git_resolve_cache_url:  # pragma: no cover
            self.remote_cache = RemoteRepoCache(git_resolve_cache_url,
                                                resolver)
        else:
            self.remote_cache = None

    def _git(self, args, **kwargs):  # pragma: no cover
        '''Execute git command.

        This is a method of its own so that unit tests can easily override
        all use of the external git command.

        '''

        morphlib.git.gitcmd(self.runcmd_cb, *args, **kwargs)

    def _fetch(self, url, path):  # pragma: no cover
        '''Fetch contents of url into a file.

        The download is piped from wget straight into tar, which unpacks
        it with 'path' as the working directory.

        This method is meant to be overridden by unit tests.

        '''
        self.status_cb(msg="Trying to fetch %(tarball)s to seed the cache",
                       tarball=url, chatty=True)

        if self.verbose:
            verbosity_flags = []
            kwargs = dict(stderr=sys.stderr)
        else:
            verbosity_flags = ['--quiet']
            kwargs = dict()

        wget_command = ['wget'] + verbosity_flags + ['-O-', url]

        self.runcmd_cb(wget_command,
                       ['tar', '--no-same-owner', '-xf', '-'],
                       cwd=path, **kwargs)

    def _mkdtemp(self, dirname):  # pragma: no cover
        '''Creates a temporary directory.

        This method is meant to be overridden by unit tests.

        '''
        return tempfile.mkdtemp(dir=self.fs.getsyspath(dirname))

    def _escape(self, url):
        '''Escape a URL so it can be used as a basename in a file.'''

        # FIXME: The following is a nicer way than to do this.
        # However, for compatibility, we need to use the same as the
        # tarball server (set up by Lorry) uses.
        # return urllib.quote(url, safe='')

        return quote_url(url)

    def _cache_name(self, url):
        '''Return the path at which the repository for 'url' is kept.

        file:// URLs map to the path component of the URL itself; every
        other URL maps to an escaped name inside the cache directory.

        '''
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
        if scheme != 'file':
            path = os.path.join(self.cachedir, self._escape(url))
        return path

    def has_repo(self, reponame):
        '''Have we already got a cache of a given repo?'''
        url = self._resolver.pull_url(reponame)
        path = self._cache_name(url)
        return self.fs.exists(path)

    def _clone_with_tarball(self, repourl, path):
        '''Seed the cache entry at 'path' from a tarball of 'repourl'.

        Returns an (ok, error) tuple: (True, None) on success, or
        (False, message) if anything went wrong, in which case 'path'
        has been removed again.

        '''
        tarball_url = urlparse.urljoin(self._tarball_base_url,
                                       self._escape(repourl)) + '.tar'
        try:
            self.fs.makedir(path)
            self._fetch(tarball_url, path)
            self._git(['config', 'remote.origin.url', repourl], cwd=path)
            self._git(['config', 'remote.origin.mirror', 'true'], cwd=path)
            self._git(['config', 'remote.origin.fetch', '+refs/*:refs/*'],
                      cwd=path)
        except BaseException as e:
            # NOTE(review): BaseException also covers KeyboardInterrupt and
            # SystemExit; kept as-is so existing fallback behaviour does
            # not change.
            if self.fs.exists(path):
                self.fs.removedir(path, force=True)
            return False, 'Unable to extract tarball %s: %s' % (
                tarball_url, e)

        return True, None

    def _new_cached_repo_instance(self, path, reponame, repourl):
        '''Construct the CachedRepo object for a cache entry.'''
        return CachedRepo(path, reponame, repourl)

    def _cache_repo(self, reponame):
        '''Clone the given repo into the cache.

        A tarball is tried first if a tarball base URL was configured,
        falling back to 'git clone --mirror'. Raises NoRemote, carrying
        all collected error messages, if the git clone fails.

        '''
        errors = []

        repourl = self._resolver.pull_url(reponame)
        path = self._cache_name(repourl)
        if self._tarball_base_url:
            ok, error = self._clone_with_tarball(repourl, path)
            if ok:
                # The tarball may be stale, so bring it up to date.
                repo = self._get_repo(reponame)
                self._update_repo(repo)
                return repo
            else:
                errors.append(error)
                self.status_cb(msg='Using git clone.')

        # Clone into a temporary directory and rename it into place only
        # once the clone has succeeded.
        target = self._mkdtemp(self.cachedir)

        try:
            self._git(['clone', '--mirror', '-n', repourl, target],
                      echo_stderr=self.debug)
        except cliapp.AppException as e:
            errors.append('Unable to clone from %s to %s: %s' %
                          (repourl, target, e))
            if self.fs.exists(target):
                self.fs.removedir(target, force=True)
            raise NoRemote(reponame, errors)

        self.fs.rename(target, path)

        repo = self._new_cached_repo_instance(path, reponame, repourl)
        repo.already_updated = True
        return repo

    def _get_repo(self, reponame):
        '''Return an object representing a cached repository.

        If the repository is not in the cache yet it is cloned, unless
        updating is disabled, in which case NotCached is raised.

        '''

        if reponame in self._cached_repo_objects:
            return self._cached_repo_objects[reponame]
        else:
            repourl = self._resolver.pull_url(reponame)
            path = self._cache_name(repourl)
            if self.fs.exists(path):
                repo = self._new_cached_repo_instance(path, reponame, repourl)
                self._cached_repo_objects[reponame] = repo
                return repo
            elif self.update_gits:
                return self._cache_repo(reponame)
            else:
                raise NotCached(reponame)

    def _update_repo(self, cachedrepo):  # pragma: no cover
        '''Update 'cachedrepo' from its remotes.

        Raises UpdateError, naming the repository, if the update fails.

        '''
        try:
            cachedrepo.update_remotes(
                echo_stderr=self.verbose)
            cachedrepo.already_updated = True
        except cliapp.AppException:
            # Report the repository that failed, not this cache object.
            raise UpdateError(cachedrepo)

    def get_updated_repo(self, repo_name,
                         ref=None, refs=None):
        '''Return object representing cached repository.

        If all the specified refs in 'ref' or 'refs' point to SHA1s that are
        already in the repository, or --no-git-update is set, then the
        repository won't be updated.

        If neither 'ref' nor 'refs' is given, a repository that is already
        cached is always updated.

        '''

        if not self.update_gits:
            self.status_cb(msg='Not updating existing git repository '
                               '%(repo_name)s '
                               'because of no-git-update being set',
                           chatty=True,
                           repo_name=repo_name)
            return self._get_repo(repo_name)

        if refs is None:
            # Both arguments default to None, so cope with neither given.
            refs = [] if ref is None else [ref]
        else:
            refs = list(refs)

        if self.has_repo(repo_name):
            repo = self._get_repo(repo_name)
            if refs:
                required_refs = set(refs)
                missing_refs = set()
                for required_ref in required_refs:  # pragma: no cover
                    # Only a full SHA1 that already resolves locally can
                    # be trusted without asking the remote.
                    if morphlib.git.is_valid_sha1(required_ref):
                        try:
                            repo.resolve_ref_to_commit(required_ref)
                            continue
                        except morphlib.gitdir.InvalidRefError:
                            pass
                    missing_refs.add(required_ref)

                if not missing_refs:  # pragma: no cover
                    self.status_cb(
                        msg='Not updating git repository %(repo_name)s '
                            'because it already contains %(sha1s)s',
                        chatty=True, repo_name=repo_name,
                        sha1s=_word_join_list(tuple(required_refs)))
                    return repo

            if ref:
                ref_str = 'ref %s' % ref
            else:
                ref_str = '%i refs' % len(refs)
            self.status_cb(msg='Updating %(repo_name)s for %(ref_str)s',
                           repo_name=repo_name, ref_str=ref_str)
            self._update_repo(repo)
            return repo
        else:
            self.status_cb(msg='Cloning %(repo_name)s', repo_name=repo_name)
            return self._get_repo(repo_name)

    def ensure_submodules(self, toplevel_repo,
                          toplevel_ref):  # pragma: no cover
        '''Ensure any submodules of a given repo are cached and up to date.'''

        def submodules_for_repo(repo_path, ref):
            try:
                submodules = morphlib.git.Submodules(repo_path, ref,
                                                     runcmd_cb=self.runcmd_cb)
                submodules.load()
                return [(submod.url, submod.commit) for submod in submodules]
            except morphlib.git.NoModulesFileError:
                return []

        # Walk the submodule graph, remembering what has been handled so
        # that shared or circular submodules are not processed forever.
        done = set()
        subs_to_process = submodules_for_repo(toplevel_repo.dirname,
                                              toplevel_ref)
        while subs_to_process:
            url, ref = subs_to_process.pop()
            done.add((url, ref))

            cached_repo = self.get_updated_repo(url, ref=ref)

            for submod in submodules_for_repo(cached_repo.dirname, ref):
                if submod not in done:
                    subs_to_process.append(submod)

    def resolve_ref_to_commit_and_tree(self, repo_name,
                                       ref):  # pragma: no cover
        '''Given the name of a ref, returns the commit and tree SHA1.

        If a remote cache server is available, this function can query the
        remote cache server to avoid needing to clone the entire repo.

        This might break if the ref points to a tag, not a commit.

        '''
        absref = None
        tree = None

        if self.has_repo(repo_name):
            repo = self.get_updated_repo(repo_name, ref)
            # If the user passed --no-git-update, and the ref is a SHA1 not
            # available locally, this call will raise an exception.
            absref = repo.resolve_ref_to_commit(ref)
            tree = repo.resolve_ref_to_tree(absref)
        elif self.remote_cache is not None:
            try:
                absref, tree = self.remote_cache.resolve_ref(repo_name, ref)
                if absref is not None:
                    self.status_cb(
                        msg='Resolved %(repo_name)s %(ref)s via remote repo '
                            'cache', repo_name=repo_name, ref=ref, chatty=True)
            except BaseException as e:
                # NOTE(review): BaseException also covers KeyboardInterrupt;
                # kept as-is to preserve the existing fallback behaviour.
                logging.warning('Caught (and ignored) exception: %s' % str(e))

        if absref is None:
            # As a last resort, clone the repo to resolve the ref.
            repo = self.get_updated_repo(repo_name, ref)
            absref = repo.resolve_ref_to_commit(ref)
            tree = repo.resolve_ref_to_tree(absref)

        return absref, tree

    def ls_tree(self, repo_name, ref):  # pragma: no cover
        '''Lists the files contained in a commit.

        If a remote cache server is available, this function can query the
        remote cache server to avoid needing to clone the entire repo.

        The list is non-recursive, so you can only see files in the top
        directory of the repo. To do a recursive operation, use a GitDir
        instance returned by get_updated_repo().

        '''
        files = []

        if self.has_repo(repo_name):
            repo = self.get_updated_repo(repo_name, ref)
            files = repo.list_files(ref=ref, recurse=False)
        elif self.remote_cache is not None:
            files = self.remote_cache.ls_tree(repo_name, ref)

        if not files:
            # As a last resort, clone the repo to get the file list.
            repo = self.get_updated_repo(repo_name, ref)
            files = repo.list_files(ref=ref, recurse=False)

        return files

    def cat_file(self, repo_name, ref, filename):  # pragma: no cover
        '''Returns a single file from a repo.

        If a remote cache server is available, this function can query the
        remote cache server to avoid needing to clone the entire repo.

        '''
        contents = None

        if self.has_repo(repo_name):
            repo = self.get_updated_repo(repo_name, ref)
            contents = repo.get_file_from_ref(ref, filename)
        elif self.remote_cache is not None:
            contents = self.remote_cache.cat_file(repo_name, ref, filename)

        if not contents:
            # As a last resort, clone the repo to get the file contents.
            repo = self.get_updated_repo(repo_name, ref)
            contents = repo.get_file_from_ref(ref, filename)

        return contents


class RemoteResolveRefError(cliapp.AppException):
    '''Raised when the remote cache server cannot resolve a ref.'''

    def __init__(self, repo_name, ref):
        cliapp.AppException.__init__(
            self, 'Failed to resolve ref %s for repo %s from remote cache' %
            (ref, repo_name))


class RemoteCatFileError(cliapp.AppException):
    '''Raised when the remote cache server cannot return a file.'''

    def __init__(self, repo_name, ref, filename):
        cliapp.AppException.__init__(
            self, 'Failed to cat file %s in ref %s of repo %s, from remote '
            'cache' % (filename, ref, repo_name))


class RemoteLsTreeError(cliapp.AppException):
    '''Raised when the remote cache server cannot list a tree.'''

    def __init__(self, repo_name, ref):
        cliapp.AppException.__init__(
            self, 'Failed to list tree in ref %s of repo %s, from remote '
            'cache' % (ref, repo_name))


class RemoteRepoCache(object):
    '''Client for the HTTP API of a remote cache server.

    Requests are sent to '/1.0/<endpoint>' on 'server_url'; repository
    names are turned into URLs with 'resolver' before being sent.

    '''

    def __init__(self, server_url, resolver):
        self.server_url = server_url
        self._resolver = resolver

    def resolve_ref(self, repo_name, ref):
        '''Return the (commit SHA1, tree SHA1) pair for 'ref'.

        Raises RemoteResolveRefError if the request fails for any reason.

        '''
        repo_url = self._resolver.pull_url(repo_name)
        try:
            return self._resolve_ref_for_repo_url(repo_url, ref)
        except BaseException as e:
            # NOTE(review): BaseException also covers KeyboardInterrupt;
            # kept as-is to preserve existing behaviour.
            logging.error('Caught exception: %s' % str(e))
            raise RemoteResolveRefError(repo_name, ref)

    def cat_file(self, repo_name, ref, filename):
        '''Return the contents of 'filename' at 'ref'.

        Raises RemoteCatFileError if the server answers 404; any other
        HTTP error is re-raised unchanged.

        '''
        repo_url = self._resolver.pull_url(repo_name)
        try:
            return self._cat_file_for_repo_url(repo_url, ref, filename)
        except urllib2.HTTPError as e:
            logging.error('Caught exception: %s' % str(e))
            if e.code == 404:
                raise RemoteCatFileError(repo_name, ref, filename)
            raise  # pragma: no cover

    def ls_tree(self, repo_name, ref):
        '''Return the names listed in the tree at 'ref'.

        Raises RemoteLsTreeError if the request or its decoding fails.

        '''
        repo_url = self._resolver.pull_url(repo_name)
        try:
            info = json.loads(self._ls_tree_for_repo_url(repo_url, ref))
            return info['tree'].keys()
        except BaseException as e:
            # NOTE(review): BaseException also covers KeyboardInterrupt;
            # kept as-is to preserve existing behaviour.
            logging.error('Caught exception: %s' % str(e))
            raise RemoteLsTreeError(repo_name, ref)

    def _resolve_ref_for_repo_url(self, repo_url, ref):  # pragma: no cover
        data = self._make_request(
            'sha1s?repo=%s&ref=%s' % self._quote_strings(repo_url, ref))
        info = json.loads(data)
        return info['sha1'], info['tree']

    def _cat_file_for_repo_url(self, repo_url, ref,
                               filename):  # pragma: no cover
        return self._make_request(
            'files?repo=%s&ref=%s&filename=%s'
            % self._quote_strings(repo_url, ref, filename))

    def _ls_tree_for_repo_url(self, repo_url, ref):  # pragma: no cover
        return self._make_request(
            'trees?repo=%s&ref=%s' % self._quote_strings(repo_url, ref))

    def _quote_strings(self, *args):  # pragma: no cover
        '''Return the arguments URL-quoted, as a tuple for %-formatting.'''
        return tuple(urllib.quote(arg) for arg in args)

    def _make_request(self, path):  # pragma: no cover
        '''GET '/1.0/<path>' from the server and return the response body.'''
        server_url = self.server_url
        if not server_url.endswith('/'):
            server_url += '/'
        url = urlparse.urljoin(server_url, '/1.0/%s' % path)
        handle = urllib2.urlopen(url)
        try:
            return handle.read()
        finally:
            # Do not leave the HTTP connection open until garbage collection.
            handle.close()