path: root/morphlib/repocache.py
Diffstat (limited to 'morphlib/repocache.py')
-rw-r--r--  morphlib/repocache.py  565
1 file changed, 565 insertions, 0 deletions
diff --git a/morphlib/repocache.py b/morphlib/repocache.py
new file mode 100644
index 00000000..f6978ec4
--- /dev/null
+++ b/morphlib/repocache.py
@@ -0,0 +1,565 @@
+# Copyright (C) 2012-2016 Codethink Limited
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import cliapp
+import fs.osfs
+
+import json
+import logging
+import os
+import string
+import sys
+import tempfile
+import urllib2
+import urlparse
+import urllib
+
+import morphlib
+from morphlib.util import word_join_list as _word_join_list
+
+
+# urlparse.urljoin needs to know details of the URL scheme being used.
+# It does not know about git:// by default, so we teach it here.
+gitscheme = ['git']
+urlparse.uses_relative.extend(gitscheme)
+urlparse.uses_netloc.extend(gitscheme)
+urlparse.uses_params.extend(gitscheme)
+urlparse.uses_query.extend(gitscheme)
+urlparse.uses_fragment.extend(gitscheme)
+
+
+def quote_url(url):
+ ''' Convert URIs to strings that only contain digits, letters, % and _.
+
+ NOTE: When changing the code of this function, make sure to also apply
+ the same to the quote_url() function of lorry. Otherwise the git tarballs
+ generated by lorry may no longer be found by morph.
+
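+    For illustration (the example URL is made up):
+
+        >>> quote_url('git://example.com/foo.git')
+        'git___example_com_foo_git'
+
+    ':', '/' and '.' are not in valid_chars, so each is replaced by '_'.
+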
+ '''
+ valid_chars = string.digits + string.letters + '%_'
+ transl = lambda x: x if x in valid_chars else '_'
+ return ''.join([transl(x) for x in url])
+
+
+class NoRemote(morphlib.Error):
+
+ def __init__(self, reponame, errors):
+ self.reponame = reponame
+ self.errors = errors
+
+ def __str__(self):
+ return '\n\t'.join(['Cannot find remote git repository: %s' %
+ self.reponame] + self.errors)
+
+
+class NotCached(morphlib.Error):
+ def __init__(self, reponame):
+ self.reponame = reponame
+
+ def __str__(self): # pragma: no cover
+ return 'Repository %s is not cached yet' % self.reponame
+
+
+class UpdateError(cliapp.AppException): # pragma: no cover
+
+ def __init__(self, repo):
+ cliapp.AppException.__init__(
+ self, 'Failed to update cached version of repo %s' % repo)
+
+
+class CachedRepo(morphlib.gitdir.GitDirectory):
+ '''A locally cached Git repository with an origin remote set up.
+
+    An instance of this class represents a locally cached version of a
+ remote Git repository. This remote repository is set up as the
+ 'origin' remote.
+
+ Cached repositories are bare mirrors of the upstream. Locally created
+ branches will be lost the next time the repository updates.
+
+ '''
+ def __init__(self, path, original_name, url):
+ self.original_name = original_name
+ self.url = url
+ self.is_mirror = not url.startswith('file://')
+ self.already_updated = False
+
+ super(CachedRepo, self).__init__(path)
+
+ def __str__(self): # pragma: no cover
+ return self.url
+
+
+class RepoCache(object):
+ '''Manage a collection of Git repositories.
+
+ When we build stuff, we need a local copy of the git repository.
+ To avoid having to clone the repositories for every build, we
+ maintain a local cache of the repositories: we first clone the
+ remote repository to the cache, and then make a local clone from
+ the cache to the build environment. This class manages the local
+ cached repositories.
+
+ Repositories may be specified either using a full URL, in a form
+ understood by git(1), or as a repository name to which a base url
+ is prepended. The base urls are given to the class when it is
+ created.
+
+ Instead of cloning via a normal 'git clone' directly from the
+ git server, we first try to download a tarball from a url, and
+ if that works, we unpack the tarball.
+
+ Certain questions about a repo can be resolved without cloning the whole
+ thing, if an instance of 'morph-cache-server' is available on the remote
+ Git server. This makes calculating the build graph for the first time
+ a whole lot faster, as we avoid cloning every repo locally. The
+ git_resolve_cache_url parameter enables this feature. Baserock 'Trove'
+ systems run 'morph-cache-server' by default.
+
+ The 'custom_fs' parameter takes a PyFilesystem instance, which you can use
+ to override where 'cachedir' is stored. This should probably only be used
+ for testing.
+
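+    Illustrative usage (a sketch only: 'resolver' stands for any object
+    providing a pull_url(reponame) method, and the paths and URLs below
+    are made up):
+
+        cache = RepoCache('/src/cache/gits', resolver,
+                          tarball_base_url='http://git.example.com/tarballs/')
+        repo = cache.get_updated_repo('git://example.com/foo', ref='master')
+        sha1 = repo.resolve_ref_to_commit('master')
+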
+ '''
+ def __init__(self, cachedir, resolver, tarball_base_url=None,
+ git_resolve_cache_url=None,
+ update_gits=True,
+ runcmd_cb=cliapp.runcmd, status_cb=lambda **kwargs: None,
+ verbose=False, debug=False,
+ custom_fs=None):
+ self.fs = custom_fs or fs.osfs.OSFS('/')
+
+ self.fs.makedir(cachedir, recursive=True, allow_recreate=True)
+
+ self.cachedir = cachedir
+ self._resolver = resolver
+ if tarball_base_url and not tarball_base_url.endswith('/'):
+ tarball_base_url += '/'
+ self._tarball_base_url = tarball_base_url
+ self._cached_repo_objects = {}
+
+ # Corresponds to the app 'no-git-update' setting
+ self.update_gits = update_gits
+
+ self.runcmd_cb = runcmd_cb
+ self.status_cb = status_cb
+ self.verbose = verbose
+ self.debug = debug
+
+ if git_resolve_cache_url: # pragma: no cover
+ self.remote_cache = RemoteRepoCache(git_resolve_cache_url,
+ resolver)
+ else:
+ self.remote_cache = None
+
+ def _git(self, args, **kwargs): # pragma: no cover
+ '''Execute git command.
+
+ This is a method of its own so that unit tests can easily override
+ all use of the external git command.
+
+ '''
+
+ morphlib.git.gitcmd(self.runcmd_cb, *args, **kwargs)
+
+ def _fetch(self, url, path): # pragma: no cover
+ '''Fetch contents of url into a file.
+
+ This method is meant to be overridden by unit tests.
+
+ '''
+ self.status_cb(msg="Trying to fetch %(tarball)s to seed the cache",
+ tarball=url, chatty=True)
+
+ if self.verbose:
+ verbosity_flags = []
+ kwargs = dict(stderr=sys.stderr)
+ else:
+ verbosity_flags = ['--quiet']
+ kwargs = dict()
+
+ def wget_command():
+ return ['wget'] + verbosity_flags + ['-O-', url]
+
+ self.runcmd_cb(wget_command(),
+ ['tar', '--no-same-owner', '-xf', '-'],
+ cwd=path, **kwargs)
+
+ def _mkdtemp(self, dirname): # pragma: no cover
+ '''Creates a temporary directory.
+
+ This method is meant to be overridden by unit tests.
+
+ '''
+ return tempfile.mkdtemp(dir=self.fs.getsyspath(dirname))
+
+ def _escape(self, url):
+ '''Escape a URL so it can be used as a basename in a file.'''
+
+        # FIXME: The following would be a nicer way to do this.
+        # However, for compatibility, we need to use the same escaping
+        # as the tarball server (set up by Lorry) uses.
+ # return urllib.quote(url, safe='')
+
+ return quote_url(url)
+
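+    # For illustration (made-up values): with cachedir '/src/cache/gits',
+    # _cache_name('git://example.com/foo') returns
+    # '/src/cache/gits/git___example_com_foo'; a 'file://' URL is returned
+    # as its local path, not a location inside the cache.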
+ def _cache_name(self, url):
+ scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
+ if scheme != 'file':
+ path = os.path.join(self.cachedir, self._escape(url))
+ return path
+
+ def has_repo(self, reponame):
+ '''Have we already got a cache of a given repo?'''
+ url = self._resolver.pull_url(reponame)
+ path = self._cache_name(url)
+ return self.fs.exists(path)
+
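+    # For illustration (made-up URLs): with tarball_base_url
+    # 'http://git.example.com/tarballs/', a repo at 'git://example.com/foo'
+    # would be seeded from
+    # 'http://git.example.com/tarballs/git___example_com_foo.tar'.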
+ def _clone_with_tarball(self, repourl, path):
+ tarball_url = urlparse.urljoin(self._tarball_base_url,
+ self._escape(repourl)) + '.tar'
+ try:
+ self.fs.makedir(path)
+ self._fetch(tarball_url, path)
+ self._git(['config', 'remote.origin.url', repourl], cwd=path)
+ self._git(['config', 'remote.origin.mirror', 'true'], cwd=path)
+ self._git(['config', 'remote.origin.fetch', '+refs/*:refs/*'],
+ cwd=path)
+ except BaseException as e:
+ if self.fs.exists(path):
+ self.fs.removedir(path, force=True)
+ return False, 'Unable to extract tarball %s: %s' % (
+ tarball_url, e)
+
+ return True, None
+
+ def _new_cached_repo_instance(self, path, reponame, repourl):
+ return CachedRepo(path, reponame, repourl)
+
+ def _cache_repo(self, reponame):
+ '''Clone the given repo into the cache.
+
+ If the repo is already cloned, do nothing.
+
+ '''
+ errors = []
+
+ repourl = self._resolver.pull_url(reponame)
+ path = self._cache_name(repourl)
+ if self._tarball_base_url:
+ ok, error = self._clone_with_tarball(repourl, path)
+ if ok:
+ repo = self._get_repo(reponame)
+ self._update_repo(repo)
+ return repo
+ else:
+ errors.append(error)
+ self.status_cb(msg='Using git clone.')
+
+ target = self._mkdtemp(self.cachedir)
+
+ try:
+ self._git(['clone', '--mirror', '-n', repourl, target],
+ echo_stderr=self.debug)
+ except cliapp.AppException as e:
+ errors.append('Unable to clone from %s to %s: %s' %
+ (repourl, target, e))
+ if self.fs.exists(target):
+ self.fs.removedir(target, force=True)
+ raise NoRemote(reponame, errors)
+
+ self.fs.rename(target, path)
+
+ repo = self._new_cached_repo_instance(path, reponame, repourl)
+ repo.already_updated = True
+ return repo
+
+ def _get_repo(self, reponame):
+ '''Return an object representing a cached repository.'''
+
+ if reponame in self._cached_repo_objects:
+ return self._cached_repo_objects[reponame]
+ else:
+ repourl = self._resolver.pull_url(reponame)
+ path = self._cache_name(repourl)
+ if self.fs.exists(path):
+ repo = self._new_cached_repo_instance(path, reponame, repourl)
+ self._cached_repo_objects[reponame] = repo
+ return repo
+ elif self.update_gits:
+ return self._cache_repo(reponame)
+ else:
+ raise NotCached(reponame)
+
+ def _update_repo(self, cachedrepo): # pragma: no cover
+ try:
+ cachedrepo.update_remotes(
+ echo_stderr=self.verbose)
+ cachedrepo.already_updated = True
+ except cliapp.AppException:
+            raise UpdateError(cachedrepo)
+
+ def get_updated_repo(self, repo_name,
+ ref=None, refs=None):
+ '''Return object representing cached repository.
+
+ If all the specified refs in 'ref' or 'refs' point to SHA1s that are
+ already in the repository, or --no-git-update is set, then the
+ repository won't be updated.
+
+ '''
+
+ if not self.update_gits:
+ self.status_cb(msg='Not updating existing git repository '
+ '%(repo_name)s '
+ 'because of no-git-update being set',
+ chatty=True,
+ repo_name=repo_name)
+ return self._get_repo(repo_name)
+
+ if ref is not None and refs is None:
+ refs = (ref,)
+ else:
+ refs = list(refs)
+
+ if self.has_repo(repo_name):
+ repo = self._get_repo(repo_name)
+ if refs:
+ required_refs = set(refs)
+ missing_refs = set()
+ for required_ref in required_refs: # pragma: no cover
+ if morphlib.git.is_valid_sha1(required_ref):
+ try:
+ repo.resolve_ref_to_commit(required_ref)
+ continue
+ except morphlib.gitdir.InvalidRefError:
+ pass
+ missing_refs.add(required_ref)
+
+ if not missing_refs: # pragma: no cover
+ self.status_cb(
+ msg='Not updating git repository %(repo_name)s '
+ 'because it already contains %(sha1s)s',
+ chatty=True, repo_name=repo_name,
+ sha1s=_word_join_list(tuple(required_refs)))
+ return repo
+
+ if ref:
+ ref_str = 'ref %s' % ref
+ else:
+ ref_str = '%i refs' % len(refs)
+ self.status_cb(msg='Updating %(repo_name)s for %(ref_str)s',
+ repo_name=repo_name, ref_str=ref_str)
+ self._update_repo(repo)
+ return repo
+ else:
+ self.status_cb(msg='Cloning %(repo_name)s', repo_name=repo_name)
+ return self._get_repo(repo_name)
+
+ def ensure_submodules(self, toplevel_repo,
+ toplevel_ref): # pragma: no cover
+ '''Ensure any submodules of a given repo are cached and up to date.'''
+
+ def submodules_for_repo(repo_path, ref):
+ try:
+ submodules = morphlib.git.Submodules(repo_path, ref,
+ runcmd_cb=self.runcmd_cb)
+ submodules.load()
+ return [(submod.url, submod.commit) for submod in submodules]
+ except morphlib.git.NoModulesFileError:
+ return []
+
+ done = set()
+ subs_to_process = submodules_for_repo(toplevel_repo.dirname,
+ toplevel_ref)
+ while subs_to_process:
+ url, ref = subs_to_process.pop()
+ done.add((url, ref))
+
+ cached_repo = self.get_updated_repo(url, ref=ref)
+
+ for submod in submodules_for_repo(cached_repo.dirname, ref):
+ if submod not in done:
+ subs_to_process.append(submod)
+
+ def resolve_ref_to_commit_and_tree(self, repo_name,
+ ref): # pragma: no cover
+ '''Given the name of a ref, returns the commit and tree SHA1.
+
+ If a remote cache server is available, this function can query the
+ remote cache server to avoid needing to clone the entire repo.
+
+ This might break if the ref points to a tag, not a commit.
+
+ '''
+ absref = None
+ tree = None
+
+ if self.has_repo(repo_name):
+ repo = self.get_updated_repo(repo_name, ref)
+ # If the user passed --no-git-update, and the ref is a SHA1 not
+ # available locally, this call will raise an exception.
+ absref = repo.resolve_ref_to_commit(ref)
+ tree = repo.resolve_ref_to_tree(absref)
+ elif self.remote_cache is not None:
+ try:
+ absref, tree = self.remote_cache.resolve_ref(repo_name, ref)
+ if absref is not None:
+ self.status_cb(
+ msg='Resolved %(repo_name)s %(ref)s via remote repo '
+ 'cache', repo_name=repo_name, ref=ref, chatty=True)
+ except BaseException as e:
+ logging.warning('Caught (and ignored) exception: %s' % str(e))
+
+ if absref is None:
+ # As a last resort, clone the repo to resolve the ref.
+ repo = self.get_updated_repo(repo_name, ref)
+ absref = repo.resolve_ref_to_commit(ref)
+ tree = repo.resolve_ref_to_tree(absref)
+
+ return absref, tree
+
+ def ls_tree(self, repo_name, ref): # pragma: no cover
+ '''Lists the files contained in a commit.
+
+ If a remote cache server is available, this function can query the
+ remote cache server to avoid needing to clone the entire repo.
+
+ The list is non-recursive, so you can only see files in the top
+        directory of the repo. To do a recursive operation, use the
+        GitDirectory instance returned by get_updated_repo().
+
+ '''
+ files = []
+
+ if self.has_repo(repo_name):
+ repo = self.get_updated_repo(repo_name, ref)
+ files = repo.list_files(ref=ref, recurse=False)
+ elif self.remote_cache is not None:
+ files = self.remote_cache.ls_tree(repo_name, ref)
+
+ if len(files) == 0:
+            # As a last resort, clone the repo to get the file list.
+ repo = self.get_updated_repo(repo_name, ref)
+ files = repo.list_files(ref=ref, recurse=False)
+
+ return files
+
+ def cat_file(self, repo_name, ref, filename): # pragma: no cover
+ '''Returns a single file from a repo.
+
+ If a remote cache server is available, this function can query the
+ remote cache server to avoid needing to clone the entire repo.
+
+ '''
+ contents = None
+
+ if self.has_repo(repo_name):
+ repo = self.get_updated_repo(repo_name, ref)
+ contents = repo.get_file_from_ref(ref, filename)
+ elif self.remote_cache is not None:
+ contents = self.remote_cache.cat_file(repo_name, ref, filename)
+
+ if not contents:
+            # As a last resort, clone the repo to get the file contents.
+ repo = self.get_updated_repo(repo_name, ref)
+ contents = repo.get_file_from_ref(ref, filename)
+
+ return contents
+
+
+class RemoteResolveRefError(cliapp.AppException):
+
+ def __init__(self, repo_name, ref):
+ cliapp.AppException.__init__(
+ self, 'Failed to resolve ref %s for repo %s from remote cache' %
+ (ref, repo_name))
+
+
+class RemoteCatFileError(cliapp.AppException):
+
+ def __init__(self, repo_name, ref, filename):
+ cliapp.AppException.__init__(
+ self, 'Failed to cat file %s in ref %s of repo %s, from remote '
+ 'cache' % (filename, ref, repo_name))
+
+
+class RemoteLsTreeError(cliapp.AppException):
+
+ def __init__(self, repo_name, ref):
+ cliapp.AppException.__init__(
+            self, 'Failed to list tree in ref %s of repo %s, from remote '
+ 'cache' % (ref, repo_name))
+
+
+class RemoteRepoCache(object):
+
+ def __init__(self, server_url, resolver):
+ self.server_url = server_url
+ self._resolver = resolver
+
+ def resolve_ref(self, repo_name, ref):
+ repo_url = self._resolver.pull_url(repo_name)
+ try:
+ return self._resolve_ref_for_repo_url(repo_url, ref)
+ except BaseException as e:
+ logging.error('Caught exception: %s' % str(e))
+ raise RemoteResolveRefError(repo_name, ref)
+
+ def cat_file(self, repo_name, ref, filename):
+ repo_url = self._resolver.pull_url(repo_name)
+ try:
+ return self._cat_file_for_repo_url(repo_url, ref, filename)
+ except urllib2.HTTPError as e:
+ logging.error('Caught exception: %s' % str(e))
+ if e.code == 404:
+ raise RemoteCatFileError(repo_name, ref, filename)
+ raise # pragma: no cover
+
+ def ls_tree(self, repo_name, ref):
+ repo_url = self._resolver.pull_url(repo_name)
+ try:
+ info = json.loads(self._ls_tree_for_repo_url(repo_url, ref))
+ return info['tree'].keys()
+ except BaseException as e:
+ logging.error('Caught exception: %s' % str(e))
+ raise RemoteLsTreeError(repo_name, ref)
+
+ def _resolve_ref_for_repo_url(self, repo_url, ref): # pragma: no cover
+ data = self._make_request(
+ 'sha1s?repo=%s&ref=%s' % self._quote_strings(repo_url, ref))
+ info = json.loads(data)
+ return info['sha1'], info['tree']
+
+ def _cat_file_for_repo_url(self, repo_url, ref,
+ filename): # pragma: no cover
+ return self._make_request(
+ 'files?repo=%s&ref=%s&filename=%s'
+ % self._quote_strings(repo_url, ref, filename))
+
+ def _ls_tree_for_repo_url(self, repo_url, ref): # pragma: no cover
+ return self._make_request(
+ 'trees?repo=%s&ref=%s' % self._quote_strings(repo_url, ref))
+
+ def _quote_strings(self, *args): # pragma: no cover
+ return tuple(urllib.quote(string) for string in args)
+
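+    # For illustration (server and repo URLs are made up): with server_url
+    # 'http://cache.example.com:8080', resolving ref 'master' of
+    # 'git://example.com/foo' fetches
+    # 'http://cache.example.com:8080/1.0/sha1s?repo=git%3A//example.com/foo&ref=master'
+    # and reads the 'sha1' and 'tree' fields from the JSON reply.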
+ def _make_request(self, path): # pragma: no cover
+ server_url = self.server_url
+ if not server_url.endswith('/'):
+ server_url += '/'
+ url = urlparse.urljoin(server_url, '/1.0/%s' % path)
+ handle = urllib2.urlopen(url)
+ return handle.read()