diff options
Diffstat (limited to 'Lib/urllib/parse.py')
-rw-r--r-- | Lib/urllib/parse.py | 630 |
1 files changed, 630 insertions, 0 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py new file mode 100644 index 0000000000..71cc36908d --- /dev/null +++ b/Lib/urllib/parse.py @@ -0,0 +1,630 @@ +"""Parse (absolute and relative) URLs. + +See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, +UC Irvine, June 1995. +""" + +__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", + "urlsplit", "urlunsplit"] + +# A classification of schemes ('' means apply by default) +uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', + 'wais', 'file', 'https', 'shttp', 'mms', + 'prospero', 'rtsp', 'rtspu', '', 'sftp'] +uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', + 'imap', 'wais', 'file', 'mms', 'https', 'shttp', + 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', + 'svn', 'svn+ssh', 'sftp'] +non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', + 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] +uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', + 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', + 'mms', '', 'sftp'] +uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', + 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', ''] +uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', + 'nntp', 'wais', 'https', 'shttp', 'snews', + 'file', 'prospero', ''] + +# Characters valid in scheme names +scheme_chars = ('abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + '0123456789' + '+-.') + +MAX_CACHE_SIZE = 20 +_parse_cache = {} + +def clear_cache(): + """Clear the parse cache.""" + _parse_cache.clear() + + +class ResultMixin(object): + """Shared methods for the parsed result objects.""" + + @property + def username(self): + netloc = self.netloc + if "@" in netloc: + userinfo = netloc.rsplit("@", 1)[0] + if ":" in userinfo: + userinfo = userinfo.split(":", 1)[0] + return userinfo + return None + + @property + def password(self): + netloc = self.netloc + if "@" in netloc: + userinfo = netloc.rsplit("@", 1)[0] + if ":" in userinfo: + return userinfo.split(":", 1)[1] + return None + + @property + def hostname(self): + netloc = self.netloc + if "@" in netloc: + netloc = netloc.rsplit("@", 1)[1] + if ":" in netloc: + netloc = netloc.split(":", 1)[0] + return netloc.lower() or None + + @property + def port(self): + netloc = self.netloc + if "@" in netloc: + netloc = netloc.rsplit("@", 1)[1] + if ":" in netloc: + port = netloc.split(":", 1)[1] + return int(port, 10) + return None + +from collections import namedtuple + +class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin): + + __slots__ = () + + def geturl(self): + return urlunsplit(self) + + +class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin): + + __slots__ = () + + def geturl(self): + return urlunparse(self) + + +def urlparse(url, scheme='', allow_fragments=True): + """Parse a URL into 6 components: + <scheme>://<netloc>/<path>;<params>?<query>#<fragment> + Return a 6-tuple: (scheme, netloc, path, params, query, fragment). + Note that we don't break the components up in smaller bits + (e.g. netloc is a single string) and we don't expand % escapes.""" + tuple = urlsplit(url, scheme, allow_fragments) + scheme, netloc, url, query, fragment = tuple + if scheme in uses_params and ';' in url: + url, params = _splitparams(url) + else: + params = '' + return ParseResult(scheme, netloc, url, params, query, fragment) + +def _splitparams(url): + if '/' in url: + i = url.find(';', url.rfind('/')) + if i < 0: + return url, '' + else: + i = url.find(';') + return url[:i], url[i+1:] + +def _splitnetloc(url, start=0): + delim = len(url) # position of end of domain part of url, default is end + for c in '/?#': # look for delimiters; the order is NOT important + wdelim = url.find(c, start) # find first of this delim + if wdelim >= 0: # if found + delim = min(delim, wdelim) # use earliest delim position + return url[start:delim], url[delim:] # return (domain, rest) + +def urlsplit(url, scheme='', allow_fragments=True): + """Parse a URL into 5 components: + <scheme>://<netloc>/<path>?<query>#<fragment> + Return a 5-tuple: (scheme, netloc, path, query, fragment). + Note that we don't break the components up in smaller bits + (e.g. netloc is a single string) and we don't expand % escapes.""" + allow_fragments = bool(allow_fragments) + key = url, scheme, allow_fragments, type(url), type(scheme) + cached = _parse_cache.get(key, None) + if cached: + return cached + if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth + clear_cache() + netloc = query = fragment = '' + i = url.find(':') + if i > 0: + if url[:i] == 'http': # optimize the common case + scheme = url[:i].lower() + url = url[i+1:] + if url[:2] == '//': + netloc, url = _splitnetloc(url, 2) + if allow_fragments and '#' in url: + url, fragment = url.split('#', 1) + if '?' in url: + url, query = url.split('?', 1) + v = SplitResult(scheme, netloc, url, query, fragment) + _parse_cache[key] = v + return v + for c in url[:i]: + if c not in scheme_chars: + break + else: + scheme, url = url[:i].lower(), url[i+1:] + if scheme in uses_netloc and url[:2] == '//': + netloc, url = _splitnetloc(url, 2) + if allow_fragments and scheme in uses_fragment and '#' in url: + url, fragment = url.split('#', 1) + if scheme in uses_query and '?' in url: + url, query = url.split('?', 1) + v = SplitResult(scheme, netloc, url, query, fragment) + _parse_cache[key] = v + return v + +def urlunparse(components): + """Put a parsed URL back together again. This may result in a + slightly different, but equivalent URL, if the URL that was parsed + originally had redundant delimiters, e.g. a ? with an empty query + (the draft states that these are equivalent).""" + scheme, netloc, url, params, query, fragment = components + if params: + url = "%s;%s" % (url, params) + return urlunsplit((scheme, netloc, url, query, fragment)) + +def urlunsplit(components): + scheme, netloc, url, query, fragment = components + if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): + if url and url[:1] != '/': url = '/' + url + url = '//' + (netloc or '') + url + if scheme: + url = scheme + ':' + url + if query: + url = url + '?' + query + if fragment: + url = url + '#' + fragment + return url + +def urljoin(base, url, allow_fragments=True): + """Join a base URL and a possibly relative URL to form an absolute + interpretation of the latter.""" + if not base: + return url + if not url: + return base + bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ + urlparse(base, '', allow_fragments) + scheme, netloc, path, params, query, fragment = \ + urlparse(url, bscheme, allow_fragments) + if scheme != bscheme or scheme not in uses_relative: + return url + if scheme in uses_netloc: + if netloc: + return urlunparse((scheme, netloc, path, + params, query, fragment)) + netloc = bnetloc + if path[:1] == '/': + return urlunparse((scheme, netloc, path, + params, query, fragment)) + if not (path or params or query): + return urlunparse((scheme, netloc, bpath, + bparams, bquery, fragment)) + segments = bpath.split('/')[:-1] + path.split('/') + # XXX The stuff below is bogus in various ways... + if segments[-1] == '.': + segments[-1] = '' + while '.' in segments: + segments.remove('.') + while 1: + i = 1 + n = len(segments) - 1 + while i < n: + if (segments[i] == '..' + and segments[i-1] not in ('', '..')): + del segments[i-1:i+1] + break + i = i+1 + else: + break + if segments == ['', '..']: + segments[-1] = '' + elif len(segments) >= 2 and segments[-1] == '..': + segments[-2:] = [''] + return urlunparse((scheme, netloc, '/'.join(segments), + params, query, fragment)) + +def urldefrag(url): + """Removes any existing fragment from URL. + + Returns a tuple of the defragmented URL and the fragment. If + the URL contained no fragments, the second element is the + empty string. + """ + if '#' in url: + s, n, p, a, q, frag = urlparse(url) + defrag = urlunparse((s, n, p, a, q, '')) + return defrag, frag + else: + return url, '' + + +_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) +_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) + +def unquote(s): + """unquote('abc%20def') -> 'abc def'.""" + res = s.split('%') + for i in range(1, len(res)): + item = res[i] + try: + res[i] = _hextochr[item[:2]] + item[2:] + except KeyError: + res[i] = '%' + item + except UnicodeDecodeError: + res[i] = chr(int(item[:2], 16)) + item[2:] + return "".join(res) + +def unquote_plus(s): + """unquote('%7e/abc+def') -> '~/abc def'""" + s = s.replace('+', ' ') + return unquote(s) + +always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '_.-') +_safe_quoters= {} + +class Quoter: + def __init__(self, safe): + self.cache = {} + self.safe = safe + always_safe + + def __call__(self, c): + try: + return self.cache[c] + except KeyError: + if ord(c) < 256: + res = (c in self.safe) and c or ('%%%02X' % ord(c)) + self.cache[c] = res + return res + else: + return "".join(['%%%02X' % i for i in c.encode("utf-8")]) + +def quote(s, safe = '/'): + """quote('abc def') -> 'abc%20def' + + Each part of a URL, e.g. the path info, the query, etc., has a + different set of reserved characters that must be quoted. + + RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + the following reserved characters. + + reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | + "$" | "," + + Each of these characters is reserved in some component of a URL, + but not necessarily in all of them. + + By default, the quote function is intended for quoting the path + section of a URL. Thus, it will not encode '/'. This character + is reserved, but in typical usage the quote function is being + called on a path where the existing slash characters are used as + reserved characters. + """ + cachekey = (safe, always_safe) + try: + quoter = _safe_quoters[cachekey] + except KeyError: + quoter = Quoter(safe) + _safe_quoters[cachekey] = quoter + res = map(quoter, s) + return ''.join(res) + +def quote_plus(s, safe = ''): + """Quote the query fragment of a URL; replacing ' ' with '+'""" + if ' ' in s: + s = quote(s, safe + ' ') + return s.replace(' ', '+') + return quote(s, safe) + +def urlencode(query,doseq=0): + """Encode a sequence of two-element tuples or dictionary into a URL query string. + + If any values in the query arg are sequences and doseq is true, each + sequence element is converted to a separate parameter. + + If the query arg is a sequence of two-element tuples, the order of the + parameters in the output will match the order of parameters in the + input. + """ + + if hasattr(query,"items"): + # mapping objects + query = query.items() + else: + # it's a bother at times that strings and string-like objects are + # sequences... + try: + # non-sequence items should not work with len() + # non-empty strings will fail this + if len(query) and not isinstance(query[0], tuple): + raise TypeError + # zero-length sequences of all types will get here and succeed, + # but that's a minor nit - since the original implementation + # allowed empty dicts that type of behavior probably should be + # preserved for consistency + except TypeError: + ty,va,tb = sys.exc_info() + raise TypeError("not a valid non-string sequence or mapping object").with_traceback(tb) + + l = [] + if not doseq: + # preserve old behavior + for k, v in query: + k = quote_plus(str(k)) + v = quote_plus(str(v)) + l.append(k + '=' + v) + else: + for k, v in query: + k = quote_plus(str(k)) + if isinstance(v, str): + v = quote_plus(v) + l.append(k + '=' + v) + elif isinstance(v, str): + # is there a reasonable way to convert to ASCII? + # encode generates a string, but "replace" or "ignore" + # lose information and "strict" can raise UnicodeError + v = quote_plus(v.encode("ASCII","replace")) + l.append(k + '=' + v) + else: + try: + # is this a sufficient test for sequence-ness? + x = len(v) + except TypeError: + # not a sequence + v = quote_plus(str(v)) + l.append(k + '=' + v) + else: + # loop over the sequence + for elt in v: + l.append(k + '=' + quote_plus(str(elt))) + return '&'.join(l) + +# Utilities to parse URLs (most of these return None for missing parts): +# unwrap('<URL:type://host/path>') --> 'type://host/path' +# splittype('type:opaquestring') --> 'type', 'opaquestring' +# splithost('//host[:port]/path') --> 'host[:port]', '/path' +# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' +# splitpasswd('user:passwd') -> 'user', 'passwd' +# splitport('host:port') --> 'host', 'port' +# splitquery('/path?query') --> '/path', 'query' +# splittag('/path#tag') --> '/path', 'tag' +# splitattr('/path;attr1=value1;attr2=value2;...') -> +# '/path', ['attr1=value1', 'attr2=value2', ...] +# splitvalue('attr=value') --> 'attr', 'value' +# urllib.parse.unquote('abc%20def') -> 'abc def' +# quote('abc def') -> 'abc%20def') + +def toBytes(url): + """toBytes(u"URL") --> 'URL'.""" + # Most URL schemes require ASCII. If that changes, the conversion + # can be relaxed. + # XXX get rid of toBytes() + if isinstance(url, str): + try: + url = url.encode("ASCII").decode() + except UnicodeError: + raise UnicodeError("URL " + repr(url) + + " contains non-ASCII characters") + return url + +def unwrap(url): + """unwrap('<URL:type://host/path>') --> 'type://host/path'.""" + url = str(url).strip() + if url[:1] == '<' and url[-1:] == '>': + url = url[1:-1].strip() + if url[:4] == 'URL:': url = url[4:].strip() + return url + +_typeprog = None +def splittype(url): + """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" + global _typeprog + if _typeprog is None: + import re + _typeprog = re.compile('^([^/:]+):') + + match = _typeprog.match(url) + if match: + scheme = match.group(1) + return scheme.lower(), url[len(scheme) + 1:] + return None, url + +_hostprog = None +def splithost(url): + """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" + global _hostprog + if _hostprog is None: + import re + _hostprog = re.compile('^//([^/?]*)(.*)$') + + match = _hostprog.match(url) + if match: return match.group(1, 2) + return None, url + +_userprog = None +def splituser(host): + """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" + global _userprog + if _userprog is None: + import re + _userprog = re.compile('^(.*)@(.*)$') + + match = _userprog.match(host) + if match: return map(unquote, match.group(1, 2)) + return None, host + +_passwdprog = None +def splitpasswd(user): + """splitpasswd('user:passwd') -> 'user', 'passwd'.""" + global _passwdprog + if _passwdprog is None: + import re + _passwdprog = re.compile('^([^:]*):(.*)$') + + match = _passwdprog.match(user) + if match: return match.group(1, 2) + return user, None + +# splittag('/path#tag') --> '/path', 'tag' +_portprog = None +def splitport(host): + """splitport('host:port') --> 'host', 'port'.""" + global _portprog + if _portprog is None: + import re + _portprog = re.compile('^(.*):([0-9]+)$') + + match = _portprog.match(host) + if match: return match.group(1, 2) + return host, None + +_nportprog = None +def splitnport(host, defport=-1): + """Split host and port, returning numeric port. + Return given default port if no ':' found; defaults to -1. + Return numerical port if a valid number are found after ':'. + Return None if ':' but not a valid number.""" + global _nportprog + if _nportprog is None: + import re + _nportprog = re.compile('^(.*):(.*)$') + + match = _nportprog.match(host) + if match: + host, port = match.group(1, 2) + try: + if not port: raise ValueError("no digits") + nport = int(port) + except ValueError: + nport = None + return host, nport + return host, defport + +_queryprog = None +def splitquery(url): + """splitquery('/path?query') --> '/path', 'query'.""" + global _queryprog + if _queryprog is None: + import re + _queryprog = re.compile('^(.*)\?([^?]*)$') + + match = _queryprog.match(url) + if match: return match.group(1, 2) + return url, None + +_tagprog = None +def splittag(url): + """splittag('/path#tag') --> '/path', 'tag'.""" + global _tagprog + if _tagprog is None: + import re + _tagprog = re.compile('^(.*)#([^#]*)$') + + match = _tagprog.match(url) + if match: return match.group(1, 2) + return url, None + +def splitattr(url): + """splitattr('/path;attr1=value1;attr2=value2;...') -> + '/path', ['attr1=value1', 'attr2=value2', ...].""" + words = url.split(';') + return words[0], words[1:] + +_valueprog = None +def splitvalue(attr): + """splitvalue('attr=value') --> 'attr', 'value'.""" + global _valueprog + if _valueprog is None: + import re + _valueprog = re.compile('^([^=]*)=(.*)$') + + match = _valueprog.match(attr) + if match: return match.group(1, 2) + return attr, None + +test_input = """ + http://a/b/c/d + + g:h = <URL:g:h> + http:g = <URL:http://a/b/c/g> + http: = <URL:http://a/b/c/d> + g = <URL:http://a/b/c/g> + ./g = <URL:http://a/b/c/g> + g/ = <URL:http://a/b/c/g/> + /g = <URL:http://a/g> + //g = <URL:http://g> + ?y = <URL:http://a/b/c/d?y> + g?y = <URL:http://a/b/c/g?y> + g?y/./x = <URL:http://a/b/c/g?y/./x> + . = <URL:http://a/b/c/> + ./ = <URL:http://a/b/c/> + .. = <URL:http://a/b/> + ../ = <URL:http://a/b/> + ../g = <URL:http://a/b/g> + ../.. = <URL:http://a/> + ../../g = <URL:http://a/g> + ../../../g = <URL:http://a/../g> + ./../g = <URL:http://a/b/g> + ./g/. = <URL:http://a/b/c/g/> + /./g = <URL:http://a/./g> + g/./h = <URL:http://a/b/c/g/h> + g/../h = <URL:http://a/b/c/h> + http:g = <URL:http://a/b/c/g> + http: = <URL:http://a/b/c/d> + http:?y = <URL:http://a/b/c/d?y> + http:g?y = <URL:http://a/b/c/g?y> + http:g?y/./x = <URL:http://a/b/c/g?y/./x> +""" + +def test(): + import sys + base = '' + if sys.argv[1:]: + fn = sys.argv[1] + if fn == '-': + fp = sys.stdin + else: + fp = open(fn) + else: + from io import StringIO + fp = StringIO(test_input) + for line in fp: + words = line.split() + if not words: + continue + url = words[0] + parts = urlparse(url) + print('%-10s : %s' % (url, parts)) + abs = urljoin(base, url) + if not base: + base = abs + wrapped = '<URL:%s>' % abs + print('%-10s = %s' % (url, wrapped)) + if len(words) == 3 and words[1] == '=': + if wrapped != words[2]: + print('EXPECTED', words[2], '!!!!!!!!!!') + +if __name__ == '__main__': + test() |