#!/usr/bin/env python
#
# Copyright (C) 2012 Codethink Limited
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


# Remove directories in /src/cache/gits that aren't used in any morphologies.
# Stale directories can occur as a result of switching a chunk from upstream
# git to a lorried repo, or from large scale URI changes.


import glob
import json
import logging
import os
import re
import shutil
import string
import subprocess

import cliapp


### Copied and pasted from morphlib/repoaliasresolver.py !!!

class RepoAliasResolver(object):

    '''Expand shortened "prefix:rest" repository names into full URLs.

    ``aliases`` is an iterable of strings of the form
    ``PREFIX=PULLPATTERN#PUSHPATTERN``. Entries that do not match that
    form are ignored when expanding.

    '''

    # The group names are load-bearing: _expand_reponame() looks up the
    # 'prefix' group and either 'pullpat' or 'pushpat' by name, and
    # _split_reponame() looks up 'prefix' and 'rest'.
    _ALIAS_PATTERN = (r'^(?P<prefix>[a-z0-9]+)'
                      r'=(?P<pullpat>[^#]+)'
                      r'#(?P<pushpat>[^#]+)$')
    _REPONAME_PATTERN = r'^(?P<prefix>[a-z0-9]+):(?P<rest>.*)$'

    def __init__(self, aliases):
        self.aliases = aliases

    def pull_url(self, reponame):
        '''Expand a possibly shortened repo name to a pull url.'''
        return self._expand_reponame(reponame, 'pullpat')

    def push_url(self, reponame):
        '''Expand a possibly shortened repo name to a push url.'''
        return self._expand_reponame(reponame, 'pushpat')

    def _expand_reponame(self, reponame, patname):
        '''Expand reponame using the alias pattern group called patname.

        ``patname`` is 'pullpat' or 'pushpat'. The name is returned
        unchanged if it has no prefix or if no alias defines its prefix.

        '''
        logging.debug('expanding: reponame=%s', reponame)
        logging.debug('expanding: patname=%s', patname)

        prefix, suffix = self._split_reponame(reponame)
        logging.debug('expanding: prefix=%s', prefix)
        logging.debug('expanding: suffix=%s', suffix)

        # There was no prefix.
        if prefix is None:
            logging.debug('expanding: no prefix')
            return reponame

        for alias in self.aliases:
            logging.debug('expanding: alias="%s"', alias)
            m = re.match(self._ALIAS_PATTERN, alias)
            logging.debug('expanding: m=%r', m)
            if m:
                logging.debug('expanding: prefix group=%s',
                              m.group('prefix'))
            if m and m.group('prefix') == prefix:
                pullpat = m.group(patname)
                logging.debug('expanding: pullpat=%s', pullpat)
                return self._apply_url_pattern(pullpat, suffix)

        # Unknown prefix. Which means it may be a real URL instead.
        # Let the caller deal with it.
        logging.debug('expanding: unknown prefix')
        return reponame

    def _split_reponame(self, reponame):
        '''Split reponame into prefix and suffix.

        The prefix is returned as None if there was no prefix.

        '''
        m = re.match(self._REPONAME_PATTERN, reponame)
        if m:
            return m.group('prefix'), m.group('rest')
        return None, reponame

    def _apply_url_pattern(self, pattern, shortname):
        '''Substitute shortname for every '%s' in pattern.

        If the pattern contains no '%s', shortname is appended instead.

        '''
        if '%s' in pattern:
            return pattern.replace('%s', shortname)
        return pattern + shortname


### Copied and pasted from morph/morphlib/localrepocache.py !!!

def quote_url(url):
    ''' Convert URIs to strings that only contain digits, letters, % and _.

    NOTE: When changing the code of this function, make sure to also apply
    the same to the quote_url() function of lorry. Otherwise the git tarballs
    generated by lorry may no longer be found by morph.

    '''
    # ascii_letters rather than letters: the latter is locale-dependent
    # and does not exist on Python 3.
    valid_chars = string.digits + string.ascii_letters + '%_'
    return ''.join(c if c in valid_chars else '_' for c in url)


###

class CleanGitCache(cliapp.Application):

    '''List, and optionally remove, cached git repos no morphology uses.'''

    def add_settings(self):
        '''Register the --remove/-r boolean setting.'''
        self.settings.boolean(
            ['remove', 'r'],
            "don't just list unused gits, remove them too")

    def process_inputs(self, args):
        '''Report or delete the unused repositories in the git cache.

        ``args`` must be [REPO, BRANCH]; otherwise usage help is printed
        and nothing else happens. The cache directory and repo aliases
        are read from the output of "$MORPH --dump-config" (MORPH
        defaults to 'morph').

        '''
        # NOTE: every print() call below takes exactly one string, and
        # blank lines are print(''), so the output is the same whether
        # this runs as a Python 2 print statement or a Python 3 function.
        morph = os.environ.get('MORPH', 'morph')

        if len(args) != 2:
            print("Usage: clean-git-cache REPO BRANCH")
            print('')
            print("For example: clean-git-cache baserock:morphs master")
            print('')
            print("This will detect any repositories that are not referenced ")
            print("by the morphologies available in that repo/branch.")
            return

        print("Reading default config from '%s'" % morph)
        print('')
        if morph == 'morph':
            print("Note: if you use Morph from a git checkout instead of the ")
            print("systemwide version, set MORPH in the environment to point ")
            print("to that version or you may remove the wrong things, due to ")
            print("out of date configuration.")
            print('')

        # Assumes the dump contains "cachedir = ..." and
        # "repo-alias = a, b, ..." lines.
        config = self.runcmd([morph, '--dump-config'])
        cache_dir = re.search('cachedir = (.*)', config).group(1)
        repo_alias = re.search('repo-alias = (.*)', config).group(1)
        repo_alias = repo_alias.split(', ')

        self.resolver = RepoAliasResolver(repo_alias)

        morphs_repo = args[0]
        morphs_branch = args[1]
        git_cache_dir = os.path.join(cache_dir, 'gits')

        unused_gits = self._get_unused_gits(morphs_repo, morphs_branch,
                                            git_cache_dir)

        if not unused_gits:
            print("No unused git repos found.")
        elif self.settings['remove']:
            print("Removing unused git repositories ...")
            for git in unused_gits:
                f = os.path.join(git_cache_dir, git)
                print(f)
                shutil.rmtree(f)
        else:
            print("Unused gits:")
            for git in unused_gits:
                f = os.path.join(git_cache_dir, git)
                print(f)
            print('')
            print("Run again with --remove to delete these repository caches.")

    def _get_unused_gits(self, morphs_repo, morphs_branch, git_cache_dir):
        '''Return names of entries in git_cache_dir that nothing refers to.

        An entry counts as used if it is the morphologies repo itself,
        is named as a chunk repo by a stratum morphology at
        morphs_branch, or appears as a submodule URL in the .gitmodules
        file of a used entry.

        Raises cliapp.AppException if no 'origin' ref matches
        morphs_branch in the cached morphologies repo.

        '''
        morphs_repo = self.resolver.pull_url(morphs_repo)
        morphs_dir = os.path.join(git_cache_dir, quote_url(morphs_repo))

        # Each show-ref line is "<sha1> <refname>"; keep the sha1 of the
        # first line that mentions 'origin'.
        refs = self._show_ref(morphs_dir, morphs_branch).split('\n')
        refs = [x.split() for x in refs if 'origin' in x]
        if not refs:
            raise cliapp.AppException(
                "No 'origin' ref matching '%s' found in %s"
                % (morphs_branch, morphs_dir))
        morphs_ref = refs[0][0]

        gits_in_use = set()
        gits_in_use.add(quote_url(morphs_repo))
        for filename in self._ls_tree(morphs_dir, morphs_ref):
            if not filename.endswith('.morph'):
                continue
            tree = json.loads(self._cat_file(morphs_dir, morphs_ref,
                                             filename))
            if tree['kind'] != 'stratum' or 'chunks' not in tree:
                continue
            for chunk in tree['chunks']:
                repo = chunk['repo']
                gits_in_use.add(quote_url(self.resolver.pull_url(repo)))

        unused_dirs = []
        submodule_dirs = set()
        for git_dir in glob.glob(os.path.join(git_cache_dir, '*')):
            git_dir_base = os.path.basename(git_dir)
            if git_dir_base not in gits_in_use:
                unused_dirs.append(git_dir_base)
                continue
            submodules_file = os.path.join(git_dir, '.gitmodules')
            if os.path.exists(submodules_file):
                # Merge the names in one by one. Storing each returned
                # list as a single element would make the membership
                # test below compare a name against lists and never
                # match.
                submodule_dirs.update(self._get_submodules(submodules_file))

        return [d for d in unused_dirs if d not in submodule_dirs]

    def _get_submodules(self, gitmodules_file):
        '''Return the quoted form of each "url = ..." value in the file.'''
        result = []
        with open(gitmodules_file) as f:
            for line in f:
                if line.strip().startswith('url'):
                    url = line[line.find('=') + 1:].strip()
                    result.append(quote_url(url))
        return result

    def _show_ref(self, cwd, ref):
        '''Return the output of "git show-ref REF" run in cwd.'''
        return self.runcmd(['git', 'show-ref', ref], cwd=cwd)

    def _ls_tree(self, cwd, ref):
        '''Return the lines of "git ls-tree --name-only REF" run in cwd.'''
        result = self.runcmd(['git', 'ls-tree', '--name-only', ref], cwd=cwd)
        return result.split('\n')

    def _cat_file(self, cwd, ref, filename):
        '''Return the contents of filename at ref, via "git cat-file".'''
        return self.runcmd(['git', 'cat-file', 'blob',
                            '%s:%s' % (ref, filename)], cwd=cwd)


CleanGitCache().run()