Diffstat (limited to 'pip/index.py')
-rw-r--r-- | pip/index.py | 421 |
1 files changed, 258 insertions, 163 deletions
diff --git a/pip/index.py b/pip/index.py
index 8e53e44b7..e9ce48630 100644
--- a/pip/index.py
+++ b/pip/index.py
@@ -1,8 +1,10 @@
-"""Routines related to PyPI, indexes"""
-
+"""
+Routines related to PyPI, indexes, PEP381 mirrors
+"""
 import sys
 import os
 import re
+import hashlib
 import gzip
 import mimetypes
 try:
@@ -15,21 +17,31 @@
 import random
 import socket
 import string
 import zlib
+
+import requests
+
+from pip.locations import serverkey_file
 from pip.log import logger
-from pip.util import Inf
-from pip.util import normalize_name, splitext
+from pip.util import Inf, normalize_name, splitext
 from pip.exceptions import DistributionNotFound, BestVersionAlreadyInstalled
 from pip.backwardcompat import (WindowsError, BytesIO,
-                                Queue, httplib, urlparse,
-                                URLError, HTTPError, u,
-                                product, url2pathname)
-from pip.backwardcompat import Empty as QueueEmpty
-from pip.download import urlopen, path_to_url2, url_to_path, geturl, Urllib2HeadRequest
+                                Queue, urlparse,
+                                URLError, HTTPError, b, u,
+                                product, url2pathname,
+                                OrderedDict, _ord as ord,
+                                decode_base64, _long,
+                                Empty as QueueEmpty)
+from pip.download import urlopen, path_to_url2, url_to_path, geturl
+from pip.mirrors import verify, load_key, find_mirrors

 __all__ = ['PackageFinder']

-DEFAULT_MIRROR_URL = "last.pypi.python.org"
+_egg_fragment_re = re.compile(r'#egg=([^&]*)')
+_egg_info_re = re.compile(r'([a-z0-9_.]+)-([a-z0-9_.-]+)', re.I)
+_py_version_re = re.compile(r'-py([123]\.?[0-9]?)$')
+_clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%~_\\|-]', re.I)
+_md5_re = re.compile(r'md5=([a-f0-9]+)')


 class PackageFinder(object):
@@ -38,9 +50,8 @@ class PackageFinder(object):
     This is meant to match easy_install's technique for looking for
     packages, by reading pages and looking for appropriate links
     """
-
     def __init__(self, find_links, index_urls,
-            use_mirrors=False, mirrors=None, main_mirror_url=None):
+            use_mirrors=False, mirrors=None):
         self.find_links = find_links
         self.index_urls = index_urls
         self.dependency_links = []
@@ -48,10 +59,16 @@ class PackageFinder(object):
         # These are boring links that have already been logged somehow:
         self.logged_links = set()
         if use_mirrors:
-            self.mirror_urls = self._get_mirror_urls(mirrors, main_mirror_url)
-            logger.info('Using PyPI mirrors: %s' % ', '.join(self.mirror_urls))
+            self.mirror_urls = self._get_mirror_urls(mirrors)
+            logger.info('Using PyPI mirrors:\n* %s' %
+                        '\n* '.join([url.url for url in self.mirror_urls]))
         else:
-            self.mirror_urls = []
+            self.mirror_urls = ()
+        serverkey_cache = open(serverkey_file, 'rb')
+        try:
+            self.serverkey = load_key(serverkey_cache.read())
+        finally:
+            serverkey_cache.close()

     def add_dependency_links(self, links):
         ## FIXME: this shouldn't be global list this, it should only
@@ -64,76 +81,119 @@ class PackageFinder(object):
     def _sort_locations(locations):
         """
         Sort locations into "files" (archives) and "urls", and return
-        a pair of lists (files,urls)
+        a pair of lists (files, urls)
         """
         files = []
         urls = []

         # puts the url for the given file path into the appropriate
         # list
-        def sort_path(path):
-            url = path_to_url2(path)
-            if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
+        def sort_path(url, path):
+            new_url = path_to_url2(path)
+            mimetype = mimetypes.guess_type(new_url, strict=False)[0]
+            url.url = new_url
+            if mimetype == 'text/html':
                 urls.append(url)
             else:
                 files.append(url)

         for url in locations:
-            if url.startswith('file:'):
-                path = url_to_path(url)
+            if isinstance(url, Link):
+                url = url.copy()
+            else:
+                url = Link(url)
+            if url.url.startswith('file:'):
+                path = url_to_path(url.url)
                 if os.path.isdir(path):
                     path = os.path.realpath(path)
                     for item in os.listdir(path):
-                        sort_path(os.path.join(path, item))
+                        sort_path(url, os.path.join(path, item))
                 elif os.path.isfile(path):
-                    sort_path(path)
+                    sort_path(url, path)
             else:
                 urls.append(url)
         return files, urls

+    def make_package_url(self, url, name):
+        """
+        For maximum compatibility with easy_install, ensure the path
+        ends in a trailing slash. Although this isn't in the spec
+        (and PyPI can handle it without the slash) some other index
+        implementations might break if they relied on easy_install's
+        behavior.
+        """
+        if isinstance(url, Link):
+            package_url = url.copy()
+        else:
+            package_url = Link(url)
+        new_url = posixpath.join(package_url.url, name)
+        if not new_url.endswith('/'):
+            new_url = new_url + '/'
+        package_url.url = new_url
+        return package_url
+
+    def verify(self, requirement, url):
+        """
+        Verifies the URL for the given requirement using the PEP381
+        verification code.
+        """
+        if url.comes_from and url.base_url:
+            try:
+                data = b(url.comes_from.content)
+                serversig = requirement.serversig(url.base_url)
+                if data and serversig:
+                    return verify(self.serverkey, data, serversig)
+            except ValueError:
+                return False
+        return False
+
     def find_requirement(self, req, upgrade):
         url_name = req.url_name
         # Only check main index if index URL is given:
         main_index_url = None
         if self.index_urls:
             # Check that we have the url_name correctly spelled:
-            main_index_url = Link(posixpath.join(self.index_urls[0], url_name))
-            # This will also cache the page, so it's okay that we get it again later:
+            main_index_url = self.make_package_url(self.index_urls[0],
+                                                   url_name)
+            # This will also cache the page,
+            # so it's okay that we get it again later:
             page = self._get_page(main_index_url, req)
             if page is None:
                 url_name = self._find_url_name(Link(self.index_urls[0]), url_name, req) or req.url_name

         # Combine index URLs with mirror URLs here to allow
         # adding more index URLs from requirements files
-        all_index_urls = self.index_urls + self.mirror_urls
-
-        def mkurl_pypi_url(url):
-            loc = posixpath.join(url, url_name)
-            # For maximum compatibility with easy_install, ensure the path
-            # ends in a trailing slash. Although this isn't in the spec
-            # (and PyPI can handle it without the slash) some other index
-            # implementations might break if they relied on easy_install's behavior.
-            if not loc.endswith('/'):
-                loc = loc + '/'
-            return loc
+
+        locations = []
+        indexes_package_urls = []
+        mirrors_package_urls = []
         if url_name is not None:
-            locations = [
-                mkurl_pypi_url(url)
-                for url in all_index_urls] + self.find_links
-        else:
-            locations = list(self.find_links)
-        locations.extend(self.dependency_links)
+            indexes_package_urls = [self.make_package_url(url, url_name)
+                                    for url in self.index_urls]
+            locations.extend(indexes_package_urls)
+            mirrors_package_urls = [self.make_package_url(url, url_name)
+                                    for url in self.mirror_urls]
+            locations.extend(mirrors_package_urls)
+
+        locations.extend(self.find_links + self.dependency_links)
+
         for version in req.absolute_versions:
             if url_name is not None and main_index_url is not None:
-                locations = [
-                    posixpath.join(main_index_url.url, version)] + locations
+                version_url = posixpath.join(main_index_url.url, version)
+                locations = [version_url] + locations

         file_locations, url_locations = self._sort_locations(locations)

-        locations = [Link(url) for url in url_locations]
+        locations = []
+        for url in url_locations:
+            if isinstance(url, Link):
+                locations.append(url)
+            else:
+                locations.append(Link(url))
         logger.debug('URLs to search for versions for %s:' % req)
         for location in locations:
             logger.debug('* %s' % location)
+
         found_versions = []
         found_versions.extend(
             self._package_versions(
@@ -143,60 +203,109 @@ class PackageFinder(object):
             logger.debug('Analyzing links from page %s' % page.url)
             logger.indent += 2
             try:
-                page_versions.extend(self._package_versions(page.links, req.name.lower()))
+                page_versions.extend(self._package_versions(
+                    page.links, req.name.lower()))
             finally:
                 logger.indent -= 2
+
         dependency_versions = list(self._package_versions(
             [Link(url) for url in self.dependency_links], req.name.lower()))
         if dependency_versions:
-            logger.info('dependency_links found: %s' % ', '.join([link.url for parsed, link, version in dependency_versions]))
+            dependency_urls = [link.url for _, link, _ in dependency_versions]
+            logger.info('dependency_links found: %s' %
+                        ', '.join(dependency_urls))
+
         file_versions = list(self._package_versions(
             [Link(url) for url in file_locations], req.name.lower()))
-        if not found_versions and not page_versions and not dependency_versions and not file_versions:
-            logger.fatal('Could not find any downloads that satisfy the requirement %s' % req)
-            raise DistributionNotFound('No distributions at all found for %s' % req)
+        if (not found_versions and not page_versions and
+                not dependency_versions and not file_versions):
+            logger.fatal('Could not find any downloads that satisfy '
+                         'the requirement %s' % req)
+            raise DistributionNotFound('No distributions at all found for %s'
+                                       % req)
+
         if req.satisfied_by is not None:
-            found_versions.append((req.satisfied_by.parsed_version, Inf, req.satisfied_by.version))
+            found_versions.append((req.satisfied_by.parsed_version,
+                                   Inf, req.satisfied_by.version))
+
         if file_versions:
             file_versions.sort(reverse=True)
-            logger.info('Local files found: %s' % ', '.join([url_to_path(link.url) for parsed, link, version in file_versions]))
+            file_urls = [url_to_path(link.url) for _, link, _ in file_versions]
+            logger.info('Local files found: %s' % ', '.join(file_urls))
             found_versions = file_versions + found_versions
+
         all_versions = found_versions + page_versions + dependency_versions
-        applicable_versions = []
-        for (parsed_version, link, version) in all_versions:
+
+        applicable_versions = OrderedDict()
+        for parsed_version, link, version in all_versions:
             if version not in req.req:
-                logger.info("Ignoring link %s, version %s doesn't match %s"
-                            % (link, version, ','.join([''.join(s) for s in req.req.specs])))
+                req_specs = [''.join(s) for s in req.req.specs]
+                logger.info("Ignoring link %s, version %s doesn't match %s" %
+                            (link, version, ','.join(req_specs)))
                 continue
-            applicable_versions.append((link, version))
-        applicable_versions = sorted(applicable_versions, key=lambda v: pkg_resources.parse_version(v[1]), reverse=True)
-        existing_applicable = bool([link for link, version in applicable_versions if link is Inf])
+            link_comes_from = None
+            mirror_urls = [mirror.url for mirror in mirrors_package_urls]
+            if link is not Inf:
+                link_comes_from = getattr(link, 'comes_from', None)
+                if link_comes_from is not None:
+                    link.is_mirror = link_comes_from.url in mirror_urls
+            applicable_versions.setdefault(version, []).append(link)
+
+        for version in applicable_versions:
+            random.shuffle(applicable_versions[version])
+
+        sorted_applicable_versions = sorted(applicable_versions.items(),
+            key=lambda v: pkg_resources.parse_version(v[0]), reverse=True)
+        applicable_versions = OrderedDict(sorted_applicable_versions)
+
+        all_links = [link for link in [links
+                     for links in applicable_versions.values()]
+                     if link is Inf]
+        existing_applicable = bool(all_links)
+
         if not upgrade and existing_applicable:
-            if applicable_versions[0][1] is Inf:
-                logger.info('Existing installed version (%s) is most up-to-date and satisfies requirement'
-                            % req.satisfied_by.version)
+            if Inf in applicable_versions[0][1]:
+                logger.info('Existing installed version (%s) is most '
+                            'up-to-date and satisfies requirement' %
+                            req.satisfied_by.version)
                 raise BestVersionAlreadyInstalled
             else:
-                logger.info('Existing installed version (%s) satisfies requirement (most up-to-date version is %s)'
-                            % (req.satisfied_by.version, applicable_versions[0][1]))
+                logger.info('Existing installed version (%s) satisfies '
+                            'requirement (most up-to-date version is %s)' %
+                            (req.satisfied_by.version,
+                             applicable_versions[0][1]))
             return None
+
         if not applicable_versions:
-            logger.fatal('Could not find a version that satisfies the requirement %s (from versions: %s)'
-                         % (req, ', '.join([version for parsed_version, link, version in found_versions])))
-            raise DistributionNotFound('No distributions matching the version for %s' % req)
-        if applicable_versions[0][0] is Inf:
-            # We have an existing version, and its the best version
-            logger.info('Installed version (%s) is most up-to-date (past versions: %s)'
-                        % (req.satisfied_by.version, ', '.join([version for link, version in applicable_versions[1:]]) or 'none'))
+            show_versions = [version for _, _, version in found_versions]
+            logger.fatal('Could not find a version that satisfies '
+                         'the requirement %s (from versions: %s)' %
+                         (req, ', '.join(show_versions)))
+            raise DistributionNotFound('No distributions matching '
+                                       'the version for %s' % req)
+
+        newest = list(applicable_versions.keys())[0]
+
+        if Inf in applicable_versions:
+            # We have an existing version, and it's the best version
+            show_versions = [vers for vers in applicable_versions.keys()[1:]]
+            logger.info('Installed version (%s) is most up-to-date '
+                        '(past versions: %s)' %
+                        (req.satisfied_by.version,
+                         ', '.join(show_versions) or 'none'))
             raise BestVersionAlreadyInstalled
+
         if len(applicable_versions) > 1:
             logger.info('Using version %s (newest of versions: %s)' %
-                        (applicable_versions[0][1], ', '.join([version for link, version in applicable_versions])))
-        return applicable_versions[0][0]
+                        (newest, ', '.join(applicable_versions.keys())))
+
+        return applicable_versions[newest]

     def _find_url_name(self, index_url, url_name, req):
-        """Finds the true URL name of a package, when the given name isn't quite correct.
-        This is usually used to implement case-insensitivity."""
+        """
+        Finds the true URL name of a package, when the given name isn't
+        quite correct. This is usually used to implement case-insensitivity.
+        """
         if not index_url.url.endswith('/'):
             # Vaguely part of the PyPI API... weird but true.
             ## FIXME: bad to modify this?
@@ -249,12 +358,11 @@ class PackageFinder(object):
             for link in page.rel_links():
                 pending_queue.put(link)

-    _egg_fragment_re = re.compile(r'#egg=([^&]*)')
-    _egg_info_re = re.compile(r'([a-z0-9_.]+)-([a-z0-9_.-]+)', re.I)
-    _py_version_re = re.compile(r'-py([123]\.?[0-9]?)$')
-
     def _sort_links(self, links):
-        "Returns elements of links in order, non-egg links first, egg links second, while eliminating duplicates"
+        """
+        Returns elements of links in order, non-egg links first,
+        egg links second, while eliminating duplicates
+        """
         eggs, no_eggs = [], []
         seen = set()
         for link in links:
@@ -306,7 +414,7 @@ class PackageFinder(object):
         if version is None:
             logger.debug('Skipping link %s; wrong project name (not %s)' % (link, search_name))
             return []
-        match = self._py_version_re.search(version)
+        match = _py_version_re.search(version)
         if match:
             version = version[:match.start()]
             py_version = match.group(1)
@@ -319,7 +427,7 @@ class PackageFinder(object):
                 version)]

     def _egg_info_matches(self, egg_info, search_name, link):
-        match = self._egg_info_re.search(egg_info)
+        match = _egg_info_re.search(egg_info)
         if not match:
             logger.debug('Could not parse version from link: %s' % link)
             return None
@@ -334,25 +442,26 @@ class PackageFinder(object):
     def _get_page(self, link, req):
         return HTMLPage.get_page(link, req, cache=self.cache)

-    def _get_mirror_urls(self, mirrors=None, main_mirror_url=None):
-        """Retrieves a list of URLs from the main mirror DNS entry
+    def _get_mirror_urls(self, mirrors=None):
+        """
+        Retrieves a list of URLs from the main mirror DNS entry
         unless a list of mirror URLs are passed.
         """
         if not mirrors:
-            mirrors = get_mirrors(main_mirror_url)
-            # Should this be made "less random"? E.g. netselect like?
-            random.shuffle(mirrors)
+            mirrors = find_mirrors(amount=10, start_with='b', prefer_fastest=False)
+            mirrors = [mirror[0] for mirror in mirrors]

-        mirror_urls = set()
+        mirror_urls = []
         for mirror_url in mirrors:
             # Make sure we have a valid URL
-            if not ("http://" or "https://" or "file://") in mirror_url:
+            if not mirror_url.startswith(("http://", "https://", "file://")):
                 mirror_url = "http://%s" % mirror_url
             if not mirror_url.endswith("/simple"):
                 mirror_url = "%s/simple/" % mirror_url
-            mirror_urls.add(mirror_url)
+            if mirror_url not in mirror_urls:
+                mirror_urls.append(mirror_url)

-        return list(mirror_urls)
+        return tuple(Link(url, is_mirror=True) for url in mirror_urls)


 class PageCache(object):
@@ -415,7 +524,8 @@ class HTMLPage(object):
             from pip.vcs import VcsSupport
             for scheme in VcsSupport.schemes:
                 if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-                    logger.debug('Cannot look at %(scheme)s URL %(link)s' % locals())
+                    logger.debug('Cannot look at %s URL %s' %
+                                 (scheme, link))
                     return None

             if cache is not None:
@@ -441,37 +551,30 @@ class HTMLPage(object):
             logger.debug('Getting page %s' % url)

             # Tack index.html onto file:// URLs that point to directories
-            (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
+            parsed_url = urlparse.urlparse(url)
+            scheme, netloc, path, params, query, fragment = parsed_url
             if scheme == 'file' and os.path.isdir(url2pathname(path)):
-                # add trailing slash if not present so urljoin doesn't trim final segment
+                # add trailing slash if not present so urljoin
+                # doesn't trim final segment
                 if not url.endswith('/'):
                     url += '/'
                 url = urlparse.urljoin(url, 'index.html')
                 logger.debug(' file: URL is directory, getting %s' % url)

-            resp = urlopen(url)
-
-            real_url = geturl(resp)
-            headers = resp.info()
-            contents = resp.read()
-            encoding = headers.get('Content-Encoding', None)
-            #XXX need to handle exceptions and add testing for this
-            if encoding is not None:
-                if encoding == 'gzip':
-                    contents = gzip.GzipFile(fileobj=BytesIO(contents)).read()
-                if encoding == 'deflate':
-                    contents = zlib.decompress(contents)
-            inst = cls(u(contents), real_url, headers)
-        except (HTTPError, URLError, socket.timeout, socket.error, OSError, WindowsError):
+            response = urlopen(url)
+            inst = cls(u(response.content), response.url, response.headers)
+        except (HTTPError, URLError, socket.timeout,
+                socket.error, OSError, WindowsError,
+                requests.RequestException):
             e = sys.exc_info()[1]
             desc = str(e)
-            if isinstance(e, socket.timeout):
+            if isinstance(e, (socket.timeout, requests.Timeout)):
                 log_meth = logger.info
-                level =1
+                level = 1
                 desc = 'timed out'
             elif isinstance(e, URLError):
                 log_meth = logger.info
-                if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
+                if hasattr(e, 'reason') and isinstance(e.reason, (socket.timeout, requests.Timeout)):
                     desc = 'timed out'
                     level = 1
                 else:
@@ -489,7 +592,7 @@ class HTMLPage(object):
                 cache.add_page_failure(url, level)
             return None
         if cache is not None:
-            cache.add_page([url, real_url], inst)
+            cache.add_page([url, response.url], inst)
         return inst

     @staticmethod
@@ -500,15 +603,11 @@ class HTMLPage(object):
             ## FIXME: some warning or something?
             ## assertion error?
             return ''
-        req = Urllib2HeadRequest(url, headers={'Host': netloc})
-        resp = urlopen(req)
-        try:
-            if hasattr(resp, 'code') and resp.code != 200 and scheme not in ('ftp', 'ftps'):
-                ## FIXME: doesn't handle redirects
-                return ''
-            return resp.info().get('content-type', '')
-        finally:
-            resp.close()
+        response = urlopen(url, method='head')
+        if hasattr(response, 'status_code') and response.status_code != 200 and scheme not in ('ftp', 'ftps'):
+            ## FIXME: doesn't handle redirects
+            return ''
+        return response.headers.get('content-type', '')

     @property
     def base_url(self):
@@ -522,7 +621,9 @@ class HTMLPage(object):

     @property
     def links(self):
-        """Yields all links in the page"""
+        """
+        Yields all links in the page
+        """
         for match in self._href_re.finditer(self.content):
             url = match.group(1) or match.group(2) or match.group(3)
             url = self.clean_link(urlparse.urljoin(self.base_url, url))
@@ -535,7 +636,9 @@ class HTMLPage(object):
             yield url

     def explicit_rel_links(self, rels=('homepage', 'download')):
-        """Yields all links with the given relations"""
+        """
+        Yields all links with the given relations
+        """
         for match in self._rel_re.finditer(self.content):
             found_rels = match.group(1).lower().split()
             for rel in rels:
@@ -564,21 +667,30 @@ class HTMLPage(object):
             url = self.clean_link(urlparse.urljoin(self.base_url, url))
             yield Link(url, self)

-    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)
-
     def clean_link(self, url):
-        """Makes sure a link is fully encoded. That is, if a ' ' shows up in
+        """
+        Makes sure a link is fully encoded. That is, if a ' ' shows up in
         the link, it will be rewritten to %20 (while not over-quoting
-        % or other characters)."""
-        return self._clean_re.sub(
-            lambda match: '%%%2x' % ord(match.group(0)), url)
+        % or other characters).
+        """
+        def replacer(match):
+            matched_group = match.group(0)
+            return '%%%2x' % ord(matched_group)
+        return _clean_re.sub(replacer, url.strip())


 class Link(object):

-    def __init__(self, url, comes_from=None):
+    def __init__(self, url, comes_from=None, is_mirror=False, mirror_urls=None):
         self.url = url
         self.comes_from = comes_from
+        self.is_mirror = is_mirror
+        if mirror_urls is not None:
+            for mirror in mirror_urls:
+                if not isinstance(mirror, Link):
+                    mirror = Link(mirror)
+                if self.url.startswith(mirror.base_url):
+                    self.is_mirror = True

     def __str__(self):
         if self.comes_from:
@@ -603,6 +715,14 @@ class Link(object):
         return name

     @property
+    def netloc(self):
+        return urlparse.urlsplit(self.url)[1]
+
+    @property
+    def base_url(self):
+        return urlparse.urlunsplit((self.scheme, self.netloc, '', '', ''))
+
+    @property
     def scheme(self):
         return urlparse.urlsplit(self.url)[0]

@@ -621,20 +741,16 @@ class Link(object):
             url = url.rstrip('/')
         return url

-    _egg_fragment_re = re.compile(r'#egg=([^&]*)')
-
     @property
     def egg_fragment(self):
-        match = self._egg_fragment_re.search(self.url)
+        match = _egg_fragment_re.search(self.url)
        if not match:
             return None
         return match.group(1)

-    _md5_re = re.compile(r'md5=([a-f0-9]+)')
-
     @property
     def md5_hash(self):
-        match = self._md5_re.search(self.url)
+        match = _md5_re.search(self.url)
         if match:
             return match.group(1)
         return None
@@ -643,6 +759,10 @@ class Link(object):
     def show_url(self):
         return posixpath.basename(self.url.split('#', 1)[0].split('?', 1)[0])

+    def copy(self):
+        return self.__class__(self.url, comes_from=self.comes_from,
+                              is_mirror=self.is_mirror)
+

 def get_requirement_from_url(url):
     """Get a requirement from the URL, if possible. This looks for #egg
@@ -669,33 +789,9 @@ def package_to_requirement(package_name):
     return name


-def get_mirrors(hostname=None):
-    """Return the list of mirrors from the last record found on the DNS
-    entry::
-
-        >>> from pip.index import get_mirrors
-        >>> get_mirrors()
-        ['a.pypi.python.org', 'b.pypi.python.org', 'c.pypi.python.org',
-         'd.pypi.python.org']
-
-    Originally written for the distutils2 project by Alexis Metaireau.
-    """
-    if hostname is None:
-        hostname = DEFAULT_MIRROR_URL
-
-    # return the last mirror registered on PyPI.
-    try:
-        hostname = socket.gethostbyname_ex(hostname)[0]
-    except socket.gaierror:
-        return []
-    end_letter = hostname.split(".", 1)
-
-    # determine the list from the last one.
-    return ["%s.%s" % (s, end_letter[1]) for s in string_range(end_letter[0])]
-
-
 def string_range(last):
-    """Compute the range of string between "a" and last.
+    """
+    Compute the range of string between "a" and last.

     This works for simple "a to z" lists, but also for "a to zz" lists.
     """
@@ -705,4 +801,3 @@ def string_range(last):
             yield result
             if result == last:
                 return
-
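The reworked PackageFinder no longer takes a main_mirror_url; with use_mirrors=True it discovers PEP 381 mirrors itself via pip.mirrors.find_mirrors and verifies mirror pages against the cached PyPI serverkey. A hedged usage sketch of the new signature follows; the index URL and mirror hostname are illustrative, and constructing the finder assumes a cached serverkey already exists at pip.locations.serverkey_file (and triggers mirror discovery when no mirrors are passed).

from pip.index import PackageFinder

# Mirrors are optional: omitting `mirrors` lets find_mirrors() pick them.
finder = PackageFinder(
    find_links=[],
    index_urls=['http://pypi.python.org/simple/'],
    use_mirrors=True,
    mirrors=['b.pypi.python.org'],  # normalized to http://.../simple/ by _get_mirror_urls
)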
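find_requirement now keeps candidate links grouped per version in an OrderedDict, shuffles the links inside each version so downloads are spread across mirrors, and orders the versions newest-first before picking `newest`. A minimal, self-contained sketch of that selection shape; the versions and URLs are made up, and collections.OrderedDict stands in for pip.backwardcompat.OrderedDict.

import random
from collections import OrderedDict
from pkg_resources import parse_version

candidates = [
    ('1.0', 'http://pypi.python.org/packages/pkg-1.0.tar.gz'),
    ('1.0', 'http://b.pypi.python.org/packages/pkg-1.0.tar.gz'),
    ('0.9', 'http://pypi.python.org/packages/pkg-0.9.tar.gz'),
]

# Group links by version, keeping the insertion order of versions.
applicable_versions = OrderedDict()
for version, link in candidates:
    applicable_versions.setdefault(version, []).append(link)

# Shuffle within each version so no single mirror is always hit first.
for version in applicable_versions:
    random.shuffle(applicable_versions[version])

# Reorder newest-first, as find_requirement does before choosing `newest`.
applicable_versions = OrderedDict(
    sorted(applicable_versions.items(),
           key=lambda item: parse_version(item[0]), reverse=True))

newest = list(applicable_versions.keys())[0]
print(newest, applicable_versions[newest])  # '1.0' plus its shuffled links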
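HTMLPage.clean_link now relies on the module-level _clean_re with a named replacer and strips surrounding whitespace before quoting. A self-contained sketch of that behaviour (the example URL is illustrative):

import re

_clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%~_\\|-]', re.I)

def clean_link(url):
    # Percent-encode anything outside the allowed character set,
    # e.g. a stray space in a scraped href becomes %20.
    def replacer(match):
        return '%%%2x' % ord(match.group(0))
    return _clean_re.sub(replacer, url.strip())

print(clean_link(' http://example.com/some file.tar.gz '))
# -> http://example.com/some%20file.tar.gz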