From 4f73763ad1ff0bcb109f685afafcc6d1dffcfb0e Mon Sep 17 00:00:00 2001
From: Sam Thursfield
Date: Sat, 14 Jun 2014 18:53:30 +0100
Subject: Read remote morphologies in batches rather than one at a time

This should be a useful performance boost.
---
Review notes (text in this section is ignored by git-am):
 - Debugging print statements are removed from the traversal loop.
 - get_morphologies() uses if/elif/else when choosing between the local
   and the remote repo cache, so a locally cached repo neither raises
   NotcachedError when there is no remote cache nor gets fetched twice.
 - cat_file_multiple() returns [] rather than None for an empty request.

 morphlib/app.py               | 82 +++++++++++++++++++++++++------------------
 morphlib/morphologyfactory.py | 80 +++++++++++++++++++++++++++++++++++++++++
 morphlib/remoterepocache.py   | 30 ++++++++++++++--
 3 files changed, 154 insertions(+), 38 deletions(-)

diff --git a/morphlib/app.py b/morphlib/app.py
index c7fe237d..19f61de9 100644
--- a/morphlib/app.py
+++ b/morphlib/app.py
@@ -359,43 +359,55 @@ class Morph(cliapp.Application):
         resolved_refs = {}
         resolved_morphologies = {}
 
+        def fetch_morphologies(triplets):
+            morph_factory.get_morphologies(
+                resolved_refs, resolved_morphologies, triplets)
+
         while queue:
-            reponame, ref, filename = queue.popleft()
-            update_repo = update and reponame not in updated_repos
-
-            # Resolve the (repo, ref) reference, cache result.
-            reference = (reponame, ref)
-            if not reference in resolved_refs:
-                resolved_refs[reference] = self.resolve_ref(
-                    lrc, rrc, reponame, ref, update_repo)
-            absref, tree = resolved_refs[reference]
-
-            updated_repos.add(reponame)
-
-            # Fetch the (repo, ref, filename) morphology, cache result.
-            reference = (reponame, absref, filename)
-            if not reference in resolved_morphologies:
-                resolved_morphologies[reference] = \
-                    morph_factory.get_morphology(reponame, absref, filename)
-            morphology = resolved_morphologies[reference]
-
-            visit(reponame, ref, filename, absref, tree, morphology)
-            if morphology['kind'] == 'cluster':
-                raise cliapp.AppException(
-                    "Cannot build a morphology of type 'cluster'.")
-            elif morphology['kind'] == 'system':
-                queue.extend((s.get('repo') or reponame,
-                              s.get('ref') or ref,
-                              '%s.morph' % s['morph'])
-                             for s in morphology['strata'])
-            elif morphology['kind'] == 'stratum':
-                if morphology['build-depends']:
+            # Drain the queue, resolving every ref, so that all pending
+            # morphologies can be fetched in a single batch.
+            to_fetch = set()
+            while queue:
+                reponame, ref, filename = queue.popleft()
+                update_repo = update and reponame not in updated_repos
+
+                # Resolve the (repo, ref) reference, cache result.
+                reference = (reponame, ref)
+                if reference not in resolved_refs:
+                    resolved_refs[reference] = self.resolve_ref(
+                        lrc, rrc, reponame, ref, update_repo)
+                absref, tree = resolved_refs[reference]
+
+                updated_repos.add(reponame)
+                to_fetch.add((reponame, ref, filename))
+
+            to_visit = to_fetch
+            if to_fetch:
+                fetch_morphologies(to_fetch)
+
+            # Visit the batch; whatever it references is queued for the
+            # next iteration of the outer loop.
+            while to_visit:
+                reponame, ref, filename = to_visit.pop()
+                absref, tree = resolved_refs[(reponame, ref)]
+                morphology = resolved_morphologies[(reponame, ref, filename)]
+                visit(reponame, ref, filename, absref, tree, morphology)
+                if morphology['kind'] == 'cluster':
+                    raise cliapp.AppException(
+                        "Cannot build a morphology of type 'cluster'.")
+                elif morphology['kind'] == 'system':
                     queue.extend((s.get('repo') or reponame,
-                                  s.get('ref') or ref,
-                                  '%s.morph' % s['morph'])
-                                 for s in morphology['build-depends'])
-                queue.extend((c['repo'], c['ref'], '%s.morph' % c['morph'])
-                             for c in morphology['chunks'])
+                                  s.get('ref') or ref,
+                                  '%s.morph' % s['morph'])
+                                 for s in morphology['strata'])
+                elif morphology['kind'] == 'stratum':
+                    if morphology['build-depends']:
+                        queue.extend((s.get('repo') or reponame,
+                                      s.get('ref') or ref,
+                                      '%s.morph' % s['morph'])
+                                     for s in morphology['build-depends'])
+                    queue.extend((c['repo'], c['ref'], '%s.morph' % c['morph'])
+                                 for c in morphology['chunks'])
 
     def cache_repo_and_submodules(self, cache, url, ref, done):
         subs_to_process = set()
diff --git a/morphlib/morphologyfactory.py b/morphlib/morphologyfactory.py
index 1de42d57..cebc382f 100644
--- a/morphlib/morphologyfactory.py
+++ b/morphlib/morphologyfactory.py
@@ -14,6 +14,9 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
 
+import base64
+import logging
+
 import morphlib
 import cliapp
 
@@ -141,6 +144,83 @@ class MorphologyFactory(object):
                 text)
         return morphology
 
+    def get_morphologies(self, resolved_refs, resolved_morphologies, triplets):
+        '''Fetch morphologies that are not already resolved.
+
+        Each triplet is a (reponame, ref, filename) tuple. Files in repos
+        that the local repo cache has are read from there; all others are
+        requested from the remote repo cache in one batch. If the remote
+        cache returns no data for a file, the file list of its repo is
+        fetched and passed on to _parse_or_generate_morphology() instead.
+
+        The resolved_morphologies dict is updated with the new morphologies.
+
+        All refs in the list of triplets are assumed to be stored in the
+        resolved_refs dict already.
+
+        Raises NotcachedError if a repo is not in the local repo cache and
+        there is no remote repo cache.
+
+        '''
+        text_dict = {}
+        file_list_dict = {}
+        to_read = {}
+        to_autodetect = set()
+        for triplet in triplets:
+            if triplet in resolved_morphologies:
+                continue
+
+            reponame, ref, filename = triplet
+            absref, tree = resolved_refs[(reponame, ref)]
+
+            if self._lrc.has_repo(reponame):
+                text = self._read_local_repo(
+                    reponame, absref, filename)
+                text_dict[triplet] = text
+            elif self._rrc is None:
+                raise NotcachedError(reponame)
+            else:
+                # Replies are matched back to our triplets by
+                # (repo URL, absref, filename).
+                repourl = self._rrc._resolver.pull_url(reponame)
+                remote_triplet = (repourl, absref, filename)
+                to_read[remote_triplet] = triplet
+
+        if to_read:
+            self.status(msg='Fetching %i morphologies from remote repo cache' %
+                        len(to_read))
+            result = self._rrc.cat_file_multiple(to_read.keys())
+            for item in result:
+                remote_triplet = (item['repo'], item['ref'], item['filename'])
+                triplet = to_read[remote_triplet]
+                if 'data' in item:
+                    text = base64.decodestring(item['data'])
+                    text_dict[triplet] = text
+                else:
+                    logging.debug('Remote cache: %s', item)
+                    to_autodetect.add(triplet)
+
+        for triplet in to_autodetect:
+            reponame, ref, filename = triplet
+            self.status(msg='Fetching file list for %s from remote repo cache'
+                        % reponame, chatty=True)
+            absref, tree = resolved_refs[(reponame, ref)]
+            file_list = self._rrc.ls_tree(reponame, absref)
+            assert file_list is not None
+            file_list_dict[triplet] = file_list
+
+        for triplet in triplets:
+            if triplet in resolved_morphologies:
+                continue
+            reponame, ref, filename = triplet
+            absref, tree = resolved_refs[(reponame, ref)]
+            file_list = file_list_dict.get(triplet, [])
+            text = text_dict.get(triplet, None)
+            assert text or file_list
+            morphology = self._parse_or_generate_morphology(
+                reponame, absref, filename, file_list, text)
+            resolved_morphologies[triplet] = morphology
+
     def _check_and_tweak_system(self, morphology, reponame, sha1, filename):
         '''Check and tweak a system morphology.'''
 
diff --git a/morphlib/remoterepocache.py b/morphlib/remoterepocache.py
index b1544b03..f790862e 100644
--- a/morphlib/remoterepocache.py
+++ b/morphlib/remoterepocache.py
@@ -55,7 +55,7 @@ class RemoteRepoCache(object):
         repo_url = self._resolver.pull_url(repo_name)
         try:
             return self._resolve_ref_for_repo_url(repo_url, ref)
-        except BaseException, e:
+        except urllib2.URLError, e:
             logging.error('Caught exception: %s' % str(e))
             raise ResolveRefError(repo_name, ref)
 
@@ -76,6 +76,25 @@ class RemoteRepoCache(object):
             logging.error('Caught exception: %s' % str(e))
             raise LsTreeError(repo_name, ref)
 
+    def cat_file_multiple(self, triplets):
+        '''Fetch several files from the cache server in one request.
+
+        Each (repo_name, ref, filename) triplet is sent with its repo name
+        resolved to a pull URL. Returns the server's decoded JSON reply,
+        or an empty list if there is nothing to fetch.
+
+        '''
+        if not triplets:
+            return []
+        request = []
+        for repo_name, ref, filename in triplets:
+            repo_url = self._resolver.pull_url(repo_name)
+            request.append(
+                dict(repo=repo_url, ref=ref, filename=filename))
+        result = self._make_request(
+            'files', json_post_data=json.dumps(request))
+        return json.loads(result)
+
     def _resolve_ref_for_repo_url(self, repo_url, ref):  # pragma: no cover
         data = self._make_request(
             'sha1s?repo=%s&ref=%s' % self._quote_strings(repo_url, ref))
@@ -95,10 +114,15 @@ class RemoteRepoCache(object):
     def _quote_strings(self, *args):  # pragma: no cover
         return tuple(urllib.quote(string) for string in args)
 
-    def _make_request(self, path):  # pragma: no cover
+    def _make_request(self, path, json_post_data=None):  # pragma: no cover
         server_url = self.server_url
         if not server_url.endswith('/'):
             server_url += '/'
         url = urlparse.urljoin(server_url, '/1.0/%s' % path)
-        handle = urllib2.urlopen(url)
+        if json_post_data is None:
+            headers = {}
+        else:
+            headers = {'Content-type': 'application/json'}
+        request = urllib2.Request(url, data=json_post_data, headers=headers)
+        handle = urllib2.urlopen(request)
         return handle.read()
-- 
cgit v1.2.1