# Copyright (C) 2012-2016  Codethink Limited
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program.  If not, see http://www.gnu.org/licenses/.


import cliapp
import fs.osfs

import json
import logging
import os
import string
import sys
import tempfile
import urllib2
import urlparse
import urllib

import morphlib
from morphlib.util import word_join_list as _word_join_list


# urlparse.urljoin needs to know details of the URL scheme being used.
# It does not know about git:// by default, so we teach it here.
gitscheme = ['git']
urlparse.uses_relative.extend(gitscheme)
urlparse.uses_netloc.extend(gitscheme)
urlparse.uses_params.extend(gitscheme)
urlparse.uses_query.extend(gitscheme)
urlparse.uses_fragment.extend(gitscheme)


def quote_url(url):
    ''' Convert URIs to strings that only contain digits, letters, % and _.

    Every character of 'url' that is not a digit, a letter, '%' or '_'
    is replaced by '_'.

    NOTE: When changing the code of this function, make sure to also apply
    the same to the quote_url() function of lorry. Otherwise the git tarballs
    generated by lorry may no longer be found by morph.

    '''
    valid_chars = string.digits + string.letters + '%_'
    return ''.join([x if x in valid_chars else '_' for x in url])


class NoRemote(morphlib.Error):

    '''Raised when a repository could not be fetched from any remote.

    'errors' is a list of human-readable messages, one for each fetch
    method that was tried and failed.

    '''

    def __init__(self, reponame, errors):
        self.reponame = reponame
        self.errors = errors

    def __str__(self):
        # One headline followed by one tab-indented line per failure.
        return '\n\t'.join(['Cannot find remote git repository: %s' %
                            self.reponame] + self.errors)


class NotCached(morphlib.Error):

    '''Raised when a repository is not in the local cache.'''

    def __init__(self, reponame):
        self.reponame = reponame

    def __str__(self):  # pragma: no cover
        return 'Repository %s is not cached yet' % self.reponame


class UpdateError(cliapp.AppException):  # pragma: no cover

    '''Raised when updating a cached repository fails.'''

    def __init__(self, repo):
        cliapp.AppException.__init__(
            self, 'Failed to update cached version of repo %s' % repo)


class CachedRepo(morphlib.gitdir.GitDirectory):

    '''A locally cached Git repository with an origin remote set up.

    One instance of this class represents a locally cached version of a
    remote Git repository. This remote repository is set up as the 'origin'
    remote.

    Cached repositories are bare mirrors of the upstream.  Locally created
    branches will be lost the next time the repository updates.

    '''

    def __init__(self, path, original_name, url):
        self.original_name = original_name
        self.url = url
        # Repositories addressed by a file:// URL are not treated as mirrors.
        self.is_mirror = not url.startswith('file://')
        self.already_updated = False

        super(CachedRepo, self).__init__(path)

    def __str__(self):  # pragma: no cover
        return self.url


class RepoCache(object):

    '''Manage a collection of Git repositories.

    When we build stuff, we need a local copy of the git repository.
    To avoid having to clone the repositories for every build, we
    maintain a local cache of the repositories: we first clone the
    remote repository to the cache, and then make a local clone from
    the cache to the build environment. This class manages the local
    cached repositories.

    Repositories may be specified either using a full URL, in a form
    understood by git(1), or as a repository name to which a base url
    is prepended. The base urls are given to the class when it is
    created.

    Instead of cloning via a normal 'git clone' directly from the
    git server, we first try to download a tarball from a url, and
    if that works, we unpack the tarball.

    Certain questions about a repo can be resolved without cloning the whole
    thing, if an instance of 'morph-cache-server' is available on the remote
    Git server. This makes calculating the build graph for the first time a
    whole lot faster, as we avoid cloning every repo locally. The
    git_resolve_cache_url parameter enables this feature. Baserock 'Trove'
    systems run 'morph-cache-server' by default.

    The 'custom_fs' parameter takes a PyFilesystem instance, which you can use
    to override where 'cachedir' is stored. This should probably only be used
    for testing.

    '''

    def __init__(self, cachedir, resolver, tarball_base_url=None,
                 git_resolve_cache_url=None, update_gits=True,
                 runcmd_cb=cliapp.runcmd, status_cb=lambda **kwargs: None,
                 verbose=False, debug=False, custom_fs=None):
        self.fs = custom_fs or fs.osfs.OSFS('/')

        self.fs.makedir(cachedir, recursive=True, allow_recreate=True)

        self.cachedir = cachedir
        self._resolver = resolver
        # Make sure the base URL ends with '/' so that urljoin() appends
        # the tarball name instead of replacing the last path component.
        if tarball_base_url and not tarball_base_url.endswith('/'):
            tarball_base_url += '/'
        self._tarball_base_url = tarball_base_url
        self._cached_repo_objects = {}

        # Corresponds to the app 'no-git-update' setting
        self.update_gits = update_gits

        self.runcmd_cb = runcmd_cb
        self.status_cb = status_cb
        self.verbose = verbose
        self.debug = debug

        if git_resolve_cache_url:  # pragma: no cover
            self.remote_cache = RemoteRepoCache(git_resolve_cache_url,
                                                resolver)
        else:
            self.remote_cache = None

    def _git(self, args, **kwargs):  # pragma: no cover
        '''Execute git command.

        This is a method of its own so that unit tests can easily override
        all use of the external git command.

        '''

        morphlib.git.gitcmd(self.runcmd_cb, *args, **kwargs)

    def _fetch(self, url, path):  # pragma: no cover
        '''Fetch contents of url into a file.

        The download is piped from wget straight into tar, which runs
        with 'path' as its working directory.

        This method is meant to be overridden by unit tests.

        '''

        self.status_cb(msg="Trying to fetch %(tarball)s to seed the cache",
                       tarball=url, chatty=True)

        if self.verbose:
            verbosity_flags = []
            kwargs = dict(stderr=sys.stderr)
        else:
            verbosity_flags = ['--quiet']
            kwargs = dict()

        def wget_command():
            return ['wget'] + verbosity_flags + ['-O-', url]

        self.runcmd_cb(wget_command(),
                       ['tar', '--no-same-owner', '-xf', '-'],
                       cwd=path, **kwargs)

    def _mkdtemp(self, dirname):  # pragma: no cover
        '''Creates a temporary directory.

        This method is meant to be overridden by unit tests.

        '''
        return tempfile.mkdtemp(dir=self.fs.getsyspath(dirname))

    def _escape(self, url):
        '''Escape a URL so it can be used as a basename in a file.'''

        # FIXME: The following is a nicer way to do this.
        # However, for compatibility, we need to use the same as the
        # tarball server (set up by Lorry) uses.
        # return urllib.quote(url, safe='')

        return quote_url(url)

    def _cache_name(self, url):
        '''Return the path at which the repository for 'url' is stored.

        For file:// URLs this is the path component of the URL itself;
        for every other scheme it is the escaped URL inside the cache
        directory.

        '''
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
        if scheme != 'file':
            path = os.path.join(self.cachedir, self._escape(url))
        return path

    def has_repo(self, reponame):
        '''Have we already got a cache of a given repo?'''
        url = self._resolver.pull_url(reponame)
        path = self._cache_name(url)
        return self.fs.exists(path)

    def _clone_with_tarball(self, repourl, path):
        '''Seed the cache entry at 'path' from a tarball of 'repourl'.

        Returns a tuple (ok, error). On success this is (True, None). On
        any failure the partially created directory is removed and the
        result is (False, message).

        '''
        tarball_url = urlparse.urljoin(self._tarball_base_url,
                                       self._escape(repourl)) + '.tar'
        try:
            self.fs.makedir(path)
            self._fetch(tarball_url, path)
            # Point the unpacked repository at its upstream and make it
            # fetch every ref, like a mirror clone.
            self._git(['config', 'remote.origin.url', repourl], cwd=path)
            self._git(['config', 'remote.origin.mirror', 'true'], cwd=path)
            self._git(['config', 'remote.origin.fetch', '+refs/*:refs/*'],
                      cwd=path)
        except BaseException as e:
            if self.fs.exists(path):
                self.fs.removedir(path, force=True)
            return False, 'Unable to extract tarball %s: %s' % (
                tarball_url, e)

        return True, None

    def _new_cached_repo_instance(self, path, reponame, repourl):
        '''Create the object that represents a cached repository.'''
        return CachedRepo(path, reponame, repourl)

    def _cache_repo(self, reponame):
        '''Clone the given repo into the cache.

        The tarball server is tried first when one is configured; if it
        is not configured or fails, the repository is cloned with git.

        Raises NoRemote, carrying every collected error message, when
        the git clone fails.

        '''
        errors = []

        repourl = self._resolver.pull_url(reponame)
        path = self._cache_name(repourl)
        if self._tarball_base_url:
            ok, error = self._clone_with_tarball(repourl, path)

            if ok:
                repo = self._get_repo(reponame)
                self._update_repo(repo)
                return repo
            else:
                errors.append(error)
                self.status_cb(msg='Using git clone.')

        # Clone into a temporary directory and rename it into place only
        # once the clone has succeeded.
        target = self._mkdtemp(self.cachedir)

        try:
            self._git(['clone', '--mirror', '-n', repourl, target],
                      echo_stderr=self.debug)
        except cliapp.AppException as e:
            errors.append('Unable to clone from %s to %s: %s' %
                          (repourl, target, e))
            if self.fs.exists(target):
                self.fs.removedir(target, force=True)
            raise NoRemote(reponame, errors)

        self.fs.rename(target, path)

        repo = self._new_cached_repo_instance(path, reponame, repourl)
        # A fresh clone is by definition up to date.
        repo.already_updated = True
        return repo

    def _get_repo(self, reponame):
        '''Return an object representing a cached repository.

        If the repository is not in the cache yet it is cloned, unless
        updating is disabled, in which case NotCached is raised.

        '''

        if reponame in self._cached_repo_objects:
            return self._cached_repo_objects[reponame]
        else:
            repourl = self._resolver.pull_url(reponame)
            path = self._cache_name(repourl)
            if self.fs.exists(path):
                repo = self._new_cached_repo_instance(path, reponame, repourl)
                self._cached_repo_objects[reponame] = repo
                return repo
            elif self.update_gits:
                return self._cache_repo(reponame)
            else:
                raise NotCached(reponame)

    def _update_repo(self, cachedrepo):  # pragma: no cover
        '''Update the remotes of 'cachedrepo' and mark it as updated.

        Raises UpdateError if the update command fails.

        '''
        try:
            cachedrepo.update_remotes(
                echo_stderr=self.verbose)
            cachedrepo.already_updated = True
        except cliapp.AppException:
            # Report the repository that failed, not this cache object.
            raise UpdateError(cachedrepo)

    def get_updated_repo(self, repo_name, ref=None, refs=None):
        '''Return object representing cached repository.

        If all the specified refs in 'ref' or 'refs' point to SHA1s that are
        already in the repository, or --no-git-update is set, then the
        repository won't be updated.

        'ref' is a single ref and 'refs' an iterable of refs. When both
        are given, 'refs' decides which refs are checked. When neither
        is given, an existing repository is always updated.

        '''

        if not self.update_gits:
            self.status_cb(msg='Not updating existing git repository '
                               '%(repo_name)s '
                               'because of no-git-update being set',
                           chatty=True, repo_name=repo_name)
            return self._get_repo(repo_name)

        if refs is None:
            # Neither argument given means there is nothing to check for.
            refs = [] if ref is None else [ref]
        else:
            refs = list(refs)

        if self.has_repo(repo_name):
            repo = self._get_repo(repo_name)
            if refs:
                required_refs = set(refs)
                missing_refs = set()
                for required_ref in required_refs:  # pragma: no cover
                    # Only a SHA1 can be trusted to still mean the same
                    # commit; named refs may have moved upstream, so they
                    # always count as missing.
                    if morphlib.git.is_valid_sha1(required_ref):
                        try:
                            repo.resolve_ref_to_commit(required_ref)
                            continue
                        except morphlib.gitdir.InvalidRefError:
                            pass
                    missing_refs.add(required_ref)

                if not missing_refs:  # pragma: no cover
                    self.status_cb(
                        msg='Not updating git repository %(repo_name)s '
                            'because it already contains %(sha1s)s',
                        chatty=True, repo_name=repo_name,
                        sha1s=_word_join_list(tuple(required_refs)))
                    return repo

            if ref:
                ref_str = 'ref %s' % ref
            else:
                ref_str = '%i refs' % len(refs)

            self.status_cb(msg='Updating %(repo_name)s for %(ref_str)s',
                           repo_name=repo_name, ref_str=ref_str)
            self._update_repo(repo)
            return repo
        else:
            self.status_cb(msg='Cloning %(repo_name)s', repo_name=repo_name)
            return self._get_repo(repo_name)

    def ensure_submodules(self, toplevel_repo, toplevel_ref,
                          submodules=None):  # pragma: no cover
        '''Ensure any submodules of a given repo are cached and up to date.

        'submodules' optionally maps a submodule name to a dict whose
        'url' entry replaces the URL recorded for that submodule.
        Submodules of submodules are processed as well.

        '''

        if submodules is None:
            submodules = {}

        def submodules_for_repo(repo_path, ref):
            try:
                found = morphlib.git.Submodules(repo_path, ref,
                                                runcmd_cb=self.runcmd_cb)
                found.load()
                return [(submod.name, submod.url, submod.commit)
                        for submod in found]
            except morphlib.git.NoModulesFileError:
                return []

        # Holds the (url, commit) pairs that were already handled.
        done = set()
        subs_to_process = submodules_for_repo(toplevel_repo.dirname,
                                              toplevel_ref)
        while subs_to_process:
            name, url, ref = subs_to_process.pop()
            done.add((url, ref))

            if name in submodules:
                url = submodules[name]['url']

            cached_repo = self.get_updated_repo(url, ref=ref)

            for submod in submodules_for_repo(cached_repo.dirname, ref):
                sub_name, sub_url, sub_ref = submod
                # Compare on the same (url, commit) key that 'done'
                # stores, so that a submodule is not processed twice.
                if (sub_url, sub_ref) not in done:
                    subs_to_process.append(submod)

    def resolve_ref_to_commit_and_tree(self, repo_name,
                                       ref):  # pragma: no cover
        '''Given the name of a ref, returns the commit and tree SHA1.

        If a remote cache server is available, this function can query the
        remote cache server to avoid needing to clone the entire repo.

        This might break if the ref points to a tag, not a commit.

        '''
        absref = None
        tree = None

        if self.has_repo(repo_name):
            repo = self.get_updated_repo(repo_name, ref)
            # If the user passed --no-git-update, and the ref is a SHA1 not
            # available locally, this call will raise an exception.
            absref = repo.resolve_ref_to_commit(ref)
            tree = repo.resolve_ref_to_tree(absref)
        elif self.remote_cache is not None:
            try:
                absref, tree = self.remote_cache.resolve_ref(repo_name, ref)
                if absref is not None:
                    self.status_cb(
                        msg='Resolved %(repo_name)s %(ref)s via remote repo '
                            'cache',
                        repo_name=repo_name, ref=ref, chatty=True)
            except BaseException as e:
                logging.warning('Caught (and ignored) exception: %s' % str(e))

        if absref is None:
            # As a last resort, clone the repo to resolve the ref.
            repo = self.get_updated_repo(repo_name, ref)
            absref = repo.resolve_ref_to_commit(ref)
            tree = repo.resolve_ref_to_tree(absref)

        return absref, tree

    def ls_tree(self, repo_name, ref):  # pragma: no cover
        '''Lists the files contained in a commit.

        If a remote cache server is available, this function can query the
        remote cache server to avoid needing to clone the entire repo.

        The list is non-recursive, so you can only see files in the top
        directory of the repo. To do a recursive operation, use a GitDir
        instance returned by get_updated_repo().

        '''
        files = []

        if self.has_repo(repo_name):
            repo = self.get_updated_repo(repo_name, ref)
            files = repo.list_files(ref=ref, recurse=False)
        elif self.remote_cache is not None:
            files = self.remote_cache.ls_tree(repo_name, ref)

        if len(files) == 0:
            # As a last resort, clone the repo to get the file list.
            repo = self.get_updated_repo(repo_name, ref)
            files = repo.list_files(ref=ref, recurse=False)

        return files

    def cat_file(self, repo_name, ref, filename):  # pragma: no cover
        '''Returns a single file from a repo.

        If a remote cache server is available, this function can query the
        remote cache server to avoid needing to clone the entire repo.

        '''
        contents = None

        if self.has_repo(repo_name):
            repo = self.get_updated_repo(repo_name, ref)
            contents = repo.get_file_from_ref(ref, filename)
        elif self.remote_cache is not None:
            contents = self.remote_cache.cat_file(repo_name, ref, filename)

        if not contents:
            # As a last resort, clone the repo to get the file contents.
            repo = self.get_updated_repo(repo_name, ref)
            contents = repo.get_file_from_ref(ref, filename)

        return contents


class RemoteResolveRefError(cliapp.AppException):

    '''Raised when the remote cache fails to resolve a ref.'''

    def __init__(self, repo_name, ref):
        cliapp.AppException.__init__(
            self, 'Failed to resolve ref %s for repo %s from remote cache' %
            (ref, repo_name))


class RemoteCatFileError(cliapp.AppException):

    '''Raised when the remote cache fails to return a file.'''

    def __init__(self, repo_name, ref, filename):
        cliapp.AppException.__init__(
            self, 'Failed to cat file %s in ref %s of repo %s, from remote '
                  'cache' % (filename, ref, repo_name))


class RemoteLsTreeError(cliapp.AppException):

    '''Raised when the remote cache fails to list a tree.'''

    def __init__(self, repo_name, ref):
        cliapp.AppException.__init__(
            self, 'Failed to list tree in ref %s of repo %s, from remote '
                  'cache' % (ref, repo_name))


class RemoteRepoCache(object):

    '''Query a remote cache server over HTTP about Git repositories.

    'server_url' is the base URL of the server and 'resolver' translates
    repository names into the pull URLs that are sent to the server.

    '''

    def __init__(self, server_url, resolver):
        self.server_url = server_url
        self._resolver = resolver

    def resolve_ref(self, repo_name, ref):
        '''Return the (commit SHA1, tree SHA1) pair that 'ref' points to.

        Raises RemoteResolveRefError on any failure.

        '''
        repo_url = self._resolver.pull_url(repo_name)
        try:
            return self._resolve_ref_for_repo_url(repo_url, ref)
        except BaseException as e:
            logging.error('Caught exception: %s' % str(e))
            raise RemoteResolveRefError(repo_name, ref)

    def cat_file(self, repo_name, ref, filename):
        '''Return the contents of 'filename' at 'ref'.

        Raises RemoteCatFileError if the server answers with HTTP 404.
        Any other HTTP error is re-raised unchanged.

        '''
        repo_url = self._resolver.pull_url(repo_name)
        try:
            return self._cat_file_for_repo_url(repo_url, ref, filename)
        except urllib2.HTTPError as e:
            logging.error('Caught exception: %s' % str(e))
            if e.code == 404:
                raise RemoteCatFileError(repo_name, ref, filename)
            raise  # pragma: no cover

    def ls_tree(self, repo_name, ref):
        '''Return the names of the entries in the tree at 'ref'.

        Raises RemoteLsTreeError on any failure.

        '''
        repo_url = self._resolver.pull_url(repo_name)
        try:
            info = json.loads(self._ls_tree_for_repo_url(repo_url, ref))
            return info['tree'].keys()
        except BaseException as e:
            logging.error('Caught exception: %s' % str(e))
            raise RemoteLsTreeError(repo_name, ref)

    def _resolve_ref_for_repo_url(self, repo_url, ref):  # pragma: no cover
        '''Ask the server for the commit and tree SHA1 of 'ref'.'''
        data = self._make_request(
            'sha1s?repo=%s&ref=%s' % self._quote_strings(repo_url, ref))
        info = json.loads(data)
        return info['sha1'], info['tree']

    def _cat_file_for_repo_url(self, repo_url, ref,
                               filename):  # pragma: no cover
        '''Ask the server for the contents of 'filename' at 'ref'.'''
        return self._make_request(
            'files?repo=%s&ref=%s&filename=%s'
            % self._quote_strings(repo_url, ref, filename))

    def _ls_tree_for_repo_url(self, repo_url, ref):  # pragma: no cover
        '''Ask the server for the tree listing at 'ref' (undecoded JSON).'''
        return self._make_request(
            'trees?repo=%s&ref=%s' % self._quote_strings(repo_url, ref))

    def _quote_strings(self, *args):  # pragma: no cover
        '''Return a tuple with each argument quoted for use in a URL.'''
        return tuple(urllib.quote(arg) for arg in args)

    def _make_request(self, path):  # pragma: no cover
        '''Send a request for 'path' to the server and return the body.'''
        server_url = self.server_url
        if not server_url.endswith('/'):
            server_url += '/'
        url = urlparse.urljoin(server_url, '/1.0/%s' % path)
        handle = urllib2.urlopen(url)
        try:
            return handle.read()
        finally:
            # Release the connection even if reading the body fails.
            handle.close()