summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Seth Michael Larson <sethmichaellarson@gmail.com> 2019-04-20 20:43:17 -0500
committer GitHub <noreply@github.com> 2019-04-20 20:43:17 -0500
commit 5d523706c7b03f947dc50a7e783758a2bfff0532 (patch)
tree 816f4b2d83d681fc936c233fb6b26dd5d437c40a
parent 1d3e60e86fce8938845fdc052f47ed9ef3da8859 (diff)
download urllib3-5d523706c7b03f947dc50a7e783758a2bfff0532.tar.gz
Use rfc3986.validator.Validator for parse_url (#1531)
-rw-r--r-- src/urllib3/connectionpool.py 18
-rw-r--r-- src/urllib3/packages/rfc3986/__init__.py 6
-rw-r--r-- src/urllib3/packages/rfc3986/_mixin.py 353
-rw-r--r-- src/urllib3/packages/rfc3986/abnf_regexp.py 95
-rw-r--r-- src/urllib3/packages/rfc3986/api.py 15
-rw-r--r-- src/urllib3/packages/rfc3986/exceptions.py 9
-rw-r--r-- src/urllib3/packages/rfc3986/iri.py 143
-rw-r--r-- src/urllib3/packages/rfc3986/misc.py 46
-rw-r--r-- src/urllib3/packages/rfc3986/normalizers.py 17
-rw-r--r-- src/urllib3/packages/rfc3986/uri.py 373
-rw-r--r-- src/urllib3/packages/rfc3986/validators.py 24
-rw-r--r-- src/urllib3/util/url.py 84
-rw-r--r-- test/test_util.py 71
13 files changed, 841 insertions, 413 deletions
diff --git a/src/urllib3/connectionpool.py b/src/urllib3/connectionpool.py
index 57502c33..157568a3 100644
--- a/src/urllib3/connectionpool.py
+++ b/src/urllib3/connectionpool.py
@@ -26,6 +26,7 @@ from .exceptions import (
from .packages.ssl_match_hostname import CertificateError
from .packages import six
from .packages.six.moves import queue
+from .packages.rfc3986.normalizers import normalize_host
from .connection import (
port_by_scheme,
DummyConnection,
@@ -65,7 +66,7 @@ class ConnectionPool(object):
if not host:
raise LocationValueError("No host specified.")
- self.host = _ipv6_host(host, self.scheme)
+ self.host = _normalize_host(host, scheme=self.scheme)
self._proxy_host = host.lower()
self.port = port
@@ -434,8 +435,8 @@ class HTTPConnectionPool(ConnectionPool, RequestMethods):
# TODO: Add optional support for socket.gethostbyname checking.
scheme, host, port = get_host(url)
-
- host = _ipv6_host(host, self.scheme)
+ if host is not None:
+ host = _normalize_host(host, scheme=scheme)
# Use explicit default port for comparison when none is given
if self.port and not port:
@@ -878,9 +879,9 @@ def connection_from_url(url, **kw):
return HTTPConnectionPool(host, port=port, **kw)
-def _ipv6_host(host, scheme):
+def _normalize_host(host, scheme):
"""
- Process IPv6 address literals
+ Normalize hosts for comparisons and use with sockets.
"""
# httplib doesn't like it when we include brackets in IPv6 addresses
@@ -889,11 +890,8 @@ def _ipv6_host(host, scheme):
# Instead, we need to make sure we never pass ``None`` as the port.
# However, for backward compatibility reasons we can't actually
# *assert* that. See http://bugs.python.org/issue28539
- #
- # Also if an IPv6 address literal has a zone identifier, the
- # percent sign might be URIencoded, convert it back into ASCII
if host.startswith('[') and host.endswith(']'):
- host = host.replace('%25', '%').strip('[]')
+ host = host.strip('[]')
if scheme in NORMALIZABLE_SCHEMES:
- host = host.lower()
+ host = normalize_host(host)
return host
diff --git a/src/urllib3/packages/rfc3986/__init__.py b/src/urllib3/packages/rfc3986/__init__.py
index 9719d6f7..13a786df 100644
--- a/src/urllib3/packages/rfc3986/__init__.py
+++ b/src/urllib3/packages/rfc3986/__init__.py
@@ -22,6 +22,8 @@ See http://rfc3986.readthedocs.io/ for detailed documentation.
:license: Apache v2.0, see LICENSE for details
"""
+from .api import iri_reference
+from .api import IRIReference
from .api import is_valid_uri
from .api import normalize_uri
from .api import uri_reference
@@ -34,14 +36,16 @@ __author__ = 'Ian Stapleton Cordasco'
__author_email__ = 'graffatcolmingov@gmail.com'
__license__ = 'Apache v2.0'
__copyright__ = 'Copyright 2014 Rackspace'
-__version__ = '1.2.0'
+__version__ = '1.3.0'
__all__ = (
'ParseResult',
'URIReference',
+ 'IRIReference',
'is_valid_uri',
'normalize_uri',
'uri_reference',
+ 'iri_reference',
'urlparse',
'__title__',
'__author__',
diff --git a/src/urllib3/packages/rfc3986/_mixin.py b/src/urllib3/packages/rfc3986/_mixin.py
new file mode 100644
index 00000000..543925cd
--- /dev/null
+++ b/src/urllib3/packages/rfc3986/_mixin.py
@@ -0,0 +1,353 @@
+"""Module containing the implementation of the URIMixin class."""
+import warnings
+
+from . import exceptions as exc
+from . import misc
+from . import normalizers
+from . import validators
+
+
+class URIMixin(object):
+ """Mixin with all shared methods for URIs and IRIs."""
+
+ __hash__ = tuple.__hash__
+
+ def authority_info(self):
+ """Return a dictionary with the ``userinfo``, ``host``, and ``port``.
+
+ If the authority is not valid, it will raise a
+ :class:`~rfc3986.exceptions.InvalidAuthority` Exception.
+
+ :returns:
+ ``{'userinfo': 'username:password', 'host': 'www.example.com',
+ 'port': '80'}``
+ :rtype: dict
+ :raises rfc3986.exceptions.InvalidAuthority:
+ If the authority is not ``None`` and can not be parsed.
+ """
+ if not self.authority:
+ return {'userinfo': None, 'host': None, 'port': None}
+
+ match = self._match_subauthority()
+
+ if match is None:
+ # In this case, we have an authority that was parsed from the URI
+ # Reference, but it cannot be further parsed by our
+ # misc.SUBAUTHORITY_MATCHER. In this case it must not be a valid
+ # authority.
+ raise exc.InvalidAuthority(self.authority.encode(self.encoding))
+
+ # We had a match, now let's ensure that it is actually a valid host
+ # address if it is IPv4
+ matches = match.groupdict()
+ host = matches.get('host')
+
+ if (host and misc.IPv4_MATCHER.match(host) and not
+ validators.valid_ipv4_host_address(host)):
+ # If we have a host, it appears to be IPv4 and it does not have
+ # valid bytes, it is an InvalidAuthority.
+ raise exc.InvalidAuthority(self.authority.encode(self.encoding))
+
+ return matches
+
+ def _match_subauthority(self):
+ return misc.SUBAUTHORITY_MATCHER.match(self.authority)
+
+ @property
+ def host(self):
+ """If present, a string representing the host."""
+ try:
+ authority = self.authority_info()
+ except exc.InvalidAuthority:
+ return None
+ return authority['host']
+
+ @property
+ def port(self):
+ """If present, the port extracted from the authority."""
+ try:
+ authority = self.authority_info()
+ except exc.InvalidAuthority:
+ return None
+ return authority['port']
+
+ @property
+ def userinfo(self):
+ """If present, the userinfo extracted from the authority."""
+ try:
+ authority = self.authority_info()
+ except exc.InvalidAuthority:
+ return None
+ return authority['userinfo']
+
+ def is_absolute(self):
+ """Determine if this URI Reference is an absolute URI.
+
+ See http://tools.ietf.org/html/rfc3986#section-4.3 for explanation.
+
+ :returns: ``True`` if it is an absolute URI, ``False`` otherwise.
+ :rtype: bool
+ """
+ return bool(misc.ABSOLUTE_URI_MATCHER.match(self.unsplit()))
+
+ def is_valid(self, **kwargs):
+ """Determine if the URI is valid.
+
+ .. deprecated:: 1.1.0
+
+ Use the :class:`~rfc3986.validators.Validator` object instead.
+
+ :param bool require_scheme: Set to ``True`` if you wish to require the
+ presence of the scheme component.
+ :param bool require_authority: Set to ``True`` if you wish to require
+ the presence of the authority component.
+ :param bool require_path: Set to ``True`` if you wish to require the
+ presence of the path component.
+ :param bool require_query: Set to ``True`` if you wish to require the
+ presence of the query component.
+ :param bool require_fragment: Set to ``True`` if you wish to require
+ the presence of the fragment component.
+ :returns: ``True`` if the URI is valid. ``False`` otherwise.
+ :rtype: bool
+ """
+ warnings.warn("Please use rfc3986.validators.Validator instead. "
+ "This method will be eventually removed.",
+ DeprecationWarning)
+ validators = [
+ (self.scheme_is_valid, kwargs.get('require_scheme', False)),
+ (self.authority_is_valid, kwargs.get('require_authority', False)),
+ (self.path_is_valid, kwargs.get('require_path', False)),
+ (self.query_is_valid, kwargs.get('require_query', False)),
+ (self.fragment_is_valid, kwargs.get('require_fragment', False)),
+ ]
+ return all(v(r) for v, r in validators)
+
+ def authority_is_valid(self, require=False):
+ """Determine if the authority component is valid.
+
+ .. deprecated:: 1.1.0
+
+ Use the :class:`~rfc3986.validators.Validator` object instead.
+
+ :param bool require:
+ Set to ``True`` to require the presence of this component.
+ :returns:
+ ``True`` if the authority is valid. ``False`` otherwise.
+ :rtype:
+ bool
+ """
+ warnings.warn("Please use rfc3986.validators.Validator instead. "
+ "This method will be eventually removed.",
+ DeprecationWarning)
+ try:
+ self.authority_info()
+ except exc.InvalidAuthority:
+ return False
+
+ return validators.authority_is_valid(
+ self.authority,
+ host=self.host,
+ require=require,
+ )
+
+ def scheme_is_valid(self, require=False):
+ """Determine if the scheme component is valid.
+
+ .. deprecated:: 1.1.0
+
+ Use the :class:`~rfc3986.validators.Validator` object instead.
+
+ :param bool require: Set to ``True`` to require the presence of this
+ component.
+ :returns: ``True`` if the scheme is valid. ``False`` otherwise.
+ :rtype: bool
+ """
+ warnings.warn("Please use rfc3986.validators.Validator instead. "
+ "This method will be eventually removed.",
+ DeprecationWarning)
+ return validators.scheme_is_valid(self.scheme, require)
+
+ def path_is_valid(self, require=False):
+ """Determine if the path component is valid.
+
+ .. deprecated:: 1.1.0
+
+ Use the :class:`~rfc3986.validators.Validator` object instead.
+
+ :param bool require: Set to ``True`` to require the presence of this
+ component.
+ :returns: ``True`` if the path is valid. ``False`` otherwise.
+ :rtype: bool
+ """
+ warnings.warn("Please use rfc3986.validators.Validator instead. "
+ "This method will be eventually removed.",
+ DeprecationWarning)
+ return validators.path_is_valid(self.path, require)
+
+ def query_is_valid(self, require=False):
+ """Determine if the query component is valid.
+
+ .. deprecated:: 1.1.0
+
+ Use the :class:`~rfc3986.validators.Validator` object instead.
+
+ :param bool require: Set to ``True`` to require the presence of this
+ component.
+ :returns: ``True`` if the query is valid. ``False`` otherwise.
+ :rtype: bool
+ """
+ warnings.warn("Please use rfc3986.validators.Validator instead. "
+ "This method will be eventually removed.",
+ DeprecationWarning)
+ return validators.query_is_valid(self.query, require)
+
+ def fragment_is_valid(self, require=False):
+ """Determine if the fragment component is valid.
+
+ .. deprecated:: 1.1.0
+
+ Use the :class:`~rfc3986.validators.Validator` object instead.
+
+ :param bool require: Set to ``True`` to require the presence of this
+ component.
+ :returns: ``True`` if the fragment is valid. ``False`` otherwise.
+ :rtype: bool
+ """
+ warnings.warn("Please use rfc3986.validators.Validator instead. "
+ "This method will be eventually removed.",
+ DeprecationWarning)
+ return validators.fragment_is_valid(self.fragment, require)
+
+ def normalized_equality(self, other_ref):
+ """Compare this URIReference to another URIReference.
+
+ :param URIReference other_ref: (required), The reference with which
+ we're comparing.
+ :returns: ``True`` if the references are equal, ``False`` otherwise.
+ :rtype: bool
+ """
+ return tuple(self.normalize()) == tuple(other_ref.normalize())
+
+ def resolve_with(self, base_uri, strict=False):
+ """Use an absolute URI Reference to resolve this relative reference.
+
+ Assuming this is a relative reference that you would like to resolve,
+ use the provided base URI to resolve it.
+
+ See http://tools.ietf.org/html/rfc3986#section-5 for more information.
+
+ :param base_uri: Either a string or URIReference. It must be an
+ absolute URI or it will raise an exception.
+ :returns: A new URIReference which is the result of resolving this
+ reference using ``base_uri``.
+ :rtype: :class:`URIReference`
+ :raises rfc3986.exceptions.ResolutionError:
+ If the ``base_uri`` is not an absolute URI.
+ """
+ if not isinstance(base_uri, URIMixin):
+ base_uri = type(self).from_string(base_uri)
+
+ if not base_uri.is_absolute():
+ raise exc.ResolutionError(base_uri)
+
+ # This is optional per
+ # http://tools.ietf.org/html/rfc3986#section-5.2.1
+ base_uri = base_uri.normalize()
+
+ # The reference we're resolving
+ resolving = self
+
+ if not strict and resolving.scheme == base_uri.scheme:
+ resolving = resolving.copy_with(scheme=None)
+
+ # http://tools.ietf.org/html/rfc3986#page-32
+ if resolving.scheme is not None:
+ target = resolving.copy_with(
+ path=normalizers.normalize_path(resolving.path)
+ )
+ else:
+ if resolving.authority is not None:
+ target = resolving.copy_with(
+ scheme=base_uri.scheme,
+ path=normalizers.normalize_path(resolving.path)
+ )
+ else:
+ if resolving.path is None:
+ if resolving.query is not None:
+ query = resolving.query
+ else:
+ query = base_uri.query
+ target = resolving.copy_with(
+ scheme=base_uri.scheme,
+ authority=base_uri.authority,
+ path=base_uri.path,
+ query=query
+ )
+ else:
+ if resolving.path.startswith('/'):
+ path = normalizers.normalize_path(resolving.path)
+ else:
+ path = normalizers.normalize_path(
+ misc.merge_paths(base_uri, resolving.path)
+ )
+ target = resolving.copy_with(
+ scheme=base_uri.scheme,
+ authority=base_uri.authority,
+ path=path,
+ query=resolving.query
+ )
+ return target
+
+ def unsplit(self):
+ """Create a URI string from the components.
+
+ :returns: The URI Reference reconstituted as a string.
+ :rtype: str
+ """
+ # See http://tools.ietf.org/html/rfc3986#section-5.3
+ result_list = []
+ if self.scheme:
+ result_list.extend([self.scheme, ':'])
+ if self.authority:
+ result_list.extend(['//', self.authority])
+ if self.path:
+ result_list.append(self.path)
+ if self.query is not None:
+ result_list.extend(['?', self.query])
+ if self.fragment is not None:
+ result_list.extend(['#', self.fragment])
+ return ''.join(result_list)
+
+ def copy_with(self, scheme=misc.UseExisting, authority=misc.UseExisting,
+ path=misc.UseExisting, query=misc.UseExisting,
+ fragment=misc.UseExisting):
+ """Create a copy of this reference with the new components.
+
+ :param str scheme:
+ (optional) The scheme to use for the new reference.
+ :param str authority:
+ (optional) The authority to use for the new reference.
+ :param str path:
+ (optional) The path to use for the new reference.
+ :param str query:
+ (optional) The query to use for the new reference.
+ :param str fragment:
+ (optional) The fragment to use for the new reference.
+ :returns:
+ New URIReference with provided components.
+ :rtype:
+ URIReference
+ """
+ attributes = {
+ 'scheme': scheme,
+ 'authority': authority,
+ 'path': path,
+ 'query': query,
+ 'fragment': fragment,
+ }
+ for key, value in list(attributes.items()):
+ if value is misc.UseExisting:
+ del attributes[key]
+ uri = self._replace(**attributes)
+ uri.encoding = self.encoding
+ return uri
diff --git a/src/urllib3/packages/rfc3986/abnf_regexp.py b/src/urllib3/packages/rfc3986/abnf_regexp.py
index 5b6da177..24c9c3d0 100644
--- a/src/urllib3/packages/rfc3986/abnf_regexp.py
+++ b/src/urllib3/packages/rfc3986/abnf_regexp.py
@@ -13,6 +13,8 @@
# limitations under the License.
"""Module for the regular expressions crafted from ABNF."""
+import sys
+
# https://tools.ietf.org/html/rfc3986#page-13
GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
@@ -25,7 +27,7 @@ RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
DIGIT = '0123456789'
# https://tools.ietf.org/html/rfc3986#section-2.3
-UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + '._!-'
+UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + r'._!-'
UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET)
# We need to escape the '-' in this case:
@@ -75,7 +77,7 @@ REGULAR_NAME_RE = REG_NAME = '((?:{0}|[{1}])*)'.format(
'%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + UNRESERVED_RE
)
# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1,
-IPv4_RE = '([0-9]{1,3}.){3}[0-9]{1,3}'
+IPv4_RE = r'([0-9]{1,3}\.){3}[0-9]{1,3}'
# Hexadecimal characters used in each piece of an IPv6 address
HEXDIG_RE = '[0-9A-Fa-f]{1,4}'
# Least-significant 32 bits of an IPv6 address
@@ -111,18 +113,18 @@ IPv6_RE = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7})|({8}))'.format(
*variations
)
-IPv_FUTURE_RE = 'v[0-9A-Fa-f]+.[%s]+' % (
+IPv_FUTURE_RE = r'v[0-9A-Fa-f]+\.[%s]+' % (
UNRESERVED_RE + SUB_DELIMITERS_RE + ':'
)
-
# RFC 6874 Zone ID ABNF
ZONE_ID = '(?:[' + UNRESERVED_RE + ']|' + PCT_ENCODED + ')+'
-IPv6_ADDRZ_RE = IPv6_RE + '%25' + ZONE_ID
-IP_LITERAL_RE = r'\[({0}|(?:{1})|{2})\]'.format(
- IPv6_RE,
- IPv6_ADDRZ_RE,
+IPv6_ADDRZ_RFC4007_RE = IPv6_RE + '(?:(?:%25|%)' + ZONE_ID + ')?'
+IPv6_ADDRZ_RE = IPv6_RE + '(?:%25' + ZONE_ID + ')?'
+
+IP_LITERAL_RE = r'\[({0}|{1})\]'.format(
+ IPv6_ADDRZ_RFC4007_RE,
IPv_FUTURE_RE,
)
@@ -186,3 +188,80 @@ HIER_PART_RE = '(//%s%s|%s|%s|%s)' % (
PATH_ROOTLESS,
PATH_EMPTY,
)
+
+# ###############
+# IRIs / RFC 3987
+# ###############
+
+# Only wide-unicode gets the high-ranges of UCSCHAR
+if sys.maxunicode > 0xFFFF: # pragma: no cover
+ IPRIVATE = u'\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD'
+ UCSCHAR_RE = (
+ u'\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'
+ u'\U00010000-\U0001FFFD\U00020000-\U0002FFFD'
+ u'\U00030000-\U0003FFFD\U00040000-\U0004FFFD'
+ u'\U00050000-\U0005FFFD\U00060000-\U0006FFFD'
+ u'\U00070000-\U0007FFFD\U00080000-\U0008FFFD'
+ u'\U00090000-\U0009FFFD\U000A0000-\U000AFFFD'
+ u'\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD'
+ u'\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD'
+ )
+else: # pragma: no cover
+ IPRIVATE = u'\uE000-\uF8FF'
+ UCSCHAR_RE = (
+ u'\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'
+ )
+
+IUNRESERVED_RE = u'A-Za-z0-9\\._~\\-' + UCSCHAR_RE
+IPCHAR = u'([' + IUNRESERVED_RE + SUB_DELIMITERS_RE + u':@]|%s)' % PCT_ENCODED
+
+isegments = {
+ 'isegment': IPCHAR + u'*',
+ # Non-zero length segment
+ 'isegment-nz': IPCHAR + u'+',
+ # Non-zero length segment without ":"
+ 'isegment-nz-nc': IPCHAR.replace(':', '') + u'+'
+}
+
+IPATH_ROOTLESS = u'%(isegment-nz)s(/%(isegment)s)*' % isegments
+IPATH_NOSCHEME = u'%(isegment-nz-nc)s(/%(isegment)s)*' % isegments
+IPATH_ABSOLUTE = u'/(?:%s)?' % IPATH_ROOTLESS
+IPATH_ABEMPTY = u'(?:/%(isegment)s)*' % isegments
+IPATH_RE = u'^(?:%s|%s|%s|%s|%s)$' % (
+ IPATH_ABEMPTY, IPATH_ABSOLUTE, IPATH_NOSCHEME, IPATH_ROOTLESS, PATH_EMPTY
+)
+
+IREGULAR_NAME_RE = IREG_NAME = u'(?:{0}|[{1}])*'.format(
+ u'%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + IUNRESERVED_RE
+)
+
+IHOST_RE = IHOST_PATTERN = u'({0}|{1}|{2})'.format(
+ IREG_NAME,
+ IPv4_RE,
+ IP_LITERAL_RE,
+)
+
+IUSERINFO_RE = u'^(?:[' + IUNRESERVED_RE + SUB_DELIMITERS_RE + u':]|%s)+' % (
+ PCT_ENCODED
+)
+
+IFRAGMENT_RE = (u'^(?:[/?:@' + IUNRESERVED_RE + SUB_DELIMITERS_RE
+ + u']|%s)*$' % PCT_ENCODED)
+IQUERY_RE = (u'^(?:[/?:@' + IUNRESERVED_RE + SUB_DELIMITERS_RE
+ + IPRIVATE + u']|%s)*$' % PCT_ENCODED)
+
+IRELATIVE_PART_RE = u'(//%s%s|%s|%s|%s)' % (
+ COMPONENT_PATTERN_DICT['authority'],
+ IPATH_ABEMPTY,
+ IPATH_ABSOLUTE,
+ IPATH_NOSCHEME,
+ PATH_EMPTY,
+)
+
+IHIER_PART_RE = u'(//%s%s|%s|%s|%s)' % (
+ COMPONENT_PATTERN_DICT['authority'],
+ IPATH_ABEMPTY,
+ IPATH_ABSOLUTE,
+ IPATH_ROOTLESS,
+ PATH_EMPTY,
+)
diff --git a/src/urllib3/packages/rfc3986/api.py b/src/urllib3/packages/rfc3986/api.py
index 17f4daf9..ddc4a1cd 100644
--- a/src/urllib3/packages/rfc3986/api.py
+++ b/src/urllib3/packages/rfc3986/api.py
@@ -19,6 +19,7 @@ This module defines functions and provides access to the public attributes
and classes of rfc3986.
"""
+from .iri import IRIReference
from .parseresult import ParseResult
from .uri import URIReference
@@ -37,6 +38,20 @@ def uri_reference(uri, encoding='utf-8'):
return URIReference.from_string(uri, encoding)
+def iri_reference(iri, encoding='utf-8'):
+ """Parse an IRI string into an IRIReference.
+
+ This is a convenience function. You could achieve the same end by using
+ ``IRIReference.from_string(iri)``.
+
+ :param str iri: The IRI which needs to be parsed into a reference.
+ :param str encoding: The encoding of the string provided
+ :returns: A parsed IRI
+ :rtype: :class:`IRIReference`
+ """
+ return IRIReference.from_string(iri, encoding)
+
+
def is_valid_uri(uri, encoding='utf-8', **kwargs):
"""Determine if the URI given is valid.
diff --git a/src/urllib3/packages/rfc3986/exceptions.py b/src/urllib3/packages/rfc3986/exceptions.py
index e0886a5f..da8ca7cb 100644
--- a/src/urllib3/packages/rfc3986/exceptions.py
+++ b/src/urllib3/packages/rfc3986/exceptions.py
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
"""Exceptions module for rfc3986."""
+from . import compat
+
class RFC3986Exception(Exception):
"""Base class for all rfc3986 exception classes."""
@@ -14,7 +16,8 @@ class InvalidAuthority(RFC3986Exception):
def __init__(self, authority):
"""Initialize the exception with the invalid authority."""
super(InvalidAuthority, self).__init__(
- "The authority ({0}) is not valid.".format(authority))
+ u"The authority ({0}) is not valid.".format(
+ compat.to_str(authority)))
class InvalidPort(RFC3986Exception):
@@ -109,3 +112,7 @@ class InvalidComponentsError(ValidationError):
uri,
self.components,
)
+
+
+class MissingDependencyError(RFC3986Exception):
+ """Exception raised when an IRI is encoded without the 'idna' module."""
diff --git a/src/urllib3/packages/rfc3986/iri.py b/src/urllib3/packages/rfc3986/iri.py
new file mode 100644
index 00000000..2c708d85
--- /dev/null
+++ b/src/urllib3/packages/rfc3986/iri.py
@@ -0,0 +1,143 @@
+"""Module containing the implementation of the IRIReference class."""
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Rackspace
+# Copyright (c) 2015 Ian Stapleton Cordasco
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import namedtuple
+
+from . import compat
+from . import exceptions
+from . import misc
+from . import normalizers
+from . import uri
+
+
+try:
+ import idna
+except ImportError: # pragma: no cover
+ idna = None
+
+
+class IRIReference(namedtuple('IRIReference', misc.URI_COMPONENTS),
+ uri.URIMixin):
+ """Immutable object representing a parsed IRI Reference.
+
+ Can be encoded into a URIReference object via the procedure
+ specified in RFC 3987 Section 3.1
+
+ .. note::
+ The IRI submodule is a new interface and may possibly change in
+ the future. Check for changes to the interface when upgrading.
+ """
+
+ slots = ()
+
+ def __new__(cls, scheme, authority, path, query, fragment,
+ encoding='utf-8'):
+ """Create a new IRIReference."""
+ ref = super(IRIReference, cls).__new__(
+ cls,
+ scheme or None,
+ authority or None,
+ path or None,
+ query,
+ fragment)
+ ref.encoding = encoding
+ return ref
+
+ def __eq__(self, other):
+ """Compare this reference to another."""
+ other_ref = other
+ if isinstance(other, tuple):
+ other_ref = self.__class__(*other)
+ elif not isinstance(other, IRIReference):
+ try:
+ other_ref = self.__class__.from_string(other)
+ except TypeError:
+ raise TypeError(
+ 'Unable to compare {0}() to {1}()'.format(
+ type(self).__name__, type(other).__name__))
+
+ # See http://tools.ietf.org/html/rfc3986#section-6.2
+ return tuple(self) == tuple(other_ref)
+
+ def _match_subauthority(self):
+ return misc.ISUBAUTHORITY_MATCHER.match(self.authority)
+
+ @classmethod
+ def from_string(cls, iri_string, encoding='utf-8'):
+ """Parse an IRI reference from the given unicode IRI string.
+
+ :param str iri_string: Unicode IRI to be parsed into a reference.
+ :param str encoding: The encoding of the string provided
+ :returns: :class:`IRIReference` or subclass thereof
+ """
+ iri_string = compat.to_str(iri_string, encoding)
+
+ split_iri = misc.IRI_MATCHER.match(iri_string).groupdict()
+ return cls(
+ split_iri['scheme'], split_iri['authority'],
+ normalizers.encode_component(split_iri['path'], encoding),
+ normalizers.encode_component(split_iri['query'], encoding),
+ normalizers.encode_component(split_iri['fragment'], encoding),
+ encoding,
+ )
+
+ def encode(self, idna_encoder=None):
+ """Encode an IRIReference into a URIReference instance.
+
+ If the ``idna`` module is installed or the ``rfc3986[idna]``
+ extra is used then unicode characters in the IRI host
+ component will be encoded with IDNA2008.
+
+ :param idna_encoder:
+ Function that encodes each part of the host component.
+ If not given, the ``idna`` module is used; an exception is
+ raised if it is not installed and the IRI has an authority.
+ :rtype: uri.URIReference
+ :returns: A URI reference
+ """
+ authority = self.authority
+ if authority:
+ if idna_encoder is None:
+ if idna is None: # pragma: no cover
+ raise exceptions.MissingDependencyError(
+ "Could not import the 'idna' module "
+ "and the IRI hostname requires encoding"
+ )
+ else:
+ def idna_encoder(x):
+ try:
+ return idna.encode(x, strict=True, std3_rules=True).lower()
+ except idna.IDNAError:
+ raise exceptions.InvalidAuthority(self.authority)
+
+ authority = ""
+ if self.host:
+ authority = ".".join([compat.to_str(idna_encoder(part))
+ for part in self.host.split(".")])
+
+ if self.userinfo is not None:
+ authority = (normalizers.encode_component(
+ self.userinfo, self.encoding) + '@' + authority)
+
+ if self.port is not None:
+ authority += ":" + str(self.port)
+
+ return uri.URIReference(self.scheme,
+ authority,
+ path=self.path,
+ query=self.query,
+ fragment=self.fragment,
+ encoding=self.encoding)
diff --git a/src/urllib3/packages/rfc3986/misc.py b/src/urllib3/packages/rfc3986/misc.py
index 697039a9..00f9f3b9 100644
--- a/src/urllib3/packages/rfc3986/misc.py
+++ b/src/urllib3/packages/rfc3986/misc.py
@@ -58,7 +58,14 @@ SUBAUTHORITY_MATCHER = re.compile((
abnf_regexp.PORT_RE))
+HOST_MATCHER = re.compile('^' + abnf_regexp.HOST_RE + '$')
IPv4_MATCHER = re.compile('^' + abnf_regexp.IPv4_RE + '$')
+IPv6_MATCHER = re.compile(r'^\[' + abnf_regexp.IPv6_ADDRZ_RFC4007_RE + r'\]$')
+
+# Used by host validator
+IPv6_NO_RFC4007_MATCHER = re.compile(r'^\[%s\]$' % (
+ abnf_regexp.IPv6_ADDRZ_RE
+))
# Matcher used to validate path components
PATH_MATCHER = re.compile(abnf_regexp.PATH_RE)
@@ -76,7 +83,8 @@ FRAGMENT_MATCHER = QUERY_MATCHER
SCHEME_MATCHER = re.compile('^{0}$'.format(abnf_regexp.SCHEME_RE))
RELATIVE_REF_MATCHER = re.compile(r'^%s(\?%s)?(#%s)?$' % (
- abnf_regexp.RELATIVE_PART_RE, abnf_regexp.QUERY_RE,
+ abnf_regexp.RELATIVE_PART_RE,
+ abnf_regexp.QUERY_RE,
abnf_regexp.FRAGMENT_RE,
))
@@ -87,6 +95,42 @@ ABSOLUTE_URI_MATCHER = re.compile(r'^%s:%s(\?%s)?$' % (
abnf_regexp.QUERY_RE[1:-1],
))
+# ###############
+# IRIs / RFC 3987
+# ###############
+
+IRI_MATCHER = re.compile(abnf_regexp.URL_PARSING_RE, re.UNICODE)
+
+ISUBAUTHORITY_MATCHER = re.compile((
+ u'^(?:(?P<userinfo>{0})@)?' # iuserinfo
+ u'(?P<host>{1})' # ihost
+ u':?(?P<port>{2})?$' # port
+ ).format(abnf_regexp.IUSERINFO_RE,
+ abnf_regexp.IHOST_RE,
+ abnf_regexp.PORT_RE), re.UNICODE)
+
+
+IHOST_MATCHER = re.compile('^' + abnf_regexp.IHOST_RE + '$', re.UNICODE)
+
+IPATH_MATCHER = re.compile(abnf_regexp.IPATH_RE, re.UNICODE)
+
+IQUERY_MATCHER = re.compile(abnf_regexp.IQUERY_RE, re.UNICODE)
+
+IFRAGMENT_MATCHER = re.compile(abnf_regexp.IFRAGMENT_RE, re.UNICODE)
+
+
+RELATIVE_IRI_MATCHER = re.compile(u'^%s(?:\\?%s)?(?:%s)?$' % (
+ abnf_regexp.IRELATIVE_PART_RE,
+ abnf_regexp.IQUERY_RE,
+ abnf_regexp.IFRAGMENT_RE
+), re.UNICODE)
+
+ABSOLUTE_IRI_MATCHER = re.compile(u'^%s:%s(?:\\?%s)?$' % (
+ abnf_regexp.COMPONENT_PATTERN_DICT['scheme'],
+ abnf_regexp.IHIER_PART_RE,
+ abnf_regexp.IQUERY_RE[1:-1]
+), re.UNICODE)
+
# Path merger as defined in http://tools.ietf.org/html/rfc3986#section-5.2.3
def merge_paths(base_uri, relative_path):
diff --git a/src/urllib3/packages/rfc3986/normalizers.py b/src/urllib3/packages/rfc3986/normalizers.py
index ea6c6e18..2eb1bb36 100644
--- a/src/urllib3/packages/rfc3986/normalizers.py
+++ b/src/urllib3/packages/rfc3986/normalizers.py
@@ -49,6 +49,21 @@ def normalize_password(password):
def normalize_host(host):
"""Normalize a host string."""
+ if misc.IPv6_MATCHER.match(host):
+ percent = host.find('%')
+ if percent != -1:
+ percent_25 = host.find('%25')
+
+ # Replace RFC 4007 IPv6 Zone ID delimiter '%' with '%25'
+ # from RFC 6874. If the host is '[<IPv6 addr>%25]' then we
+ # assume RFC 4007 and normalize to '[<IPv6 addr>%2525]'
+ if percent_25 == -1 or percent < percent_25 or \
+ (percent == percent_25 and percent_25 == len(host) - 4):
+ host = host.replace('%', '%25', 1)
+
+ # Don't normalize the casing of the Zone ID
+ return host[:percent].lower() + host[percent:]
+
return host.lower()
@@ -147,6 +162,6 @@ def encode_component(uri_component, encoding):
or (byte_ord < 128 and byte.decode() in misc.NON_PCT_ENCODED)):
encoded_uri.extend(byte)
continue
- encoded_uri.extend('%{0:02x}'.format(byte_ord).encode())
+ encoded_uri.extend('%{0:02x}'.format(byte_ord).encode().upper())
return encoded_uri.decode(encoding)
diff --git a/src/urllib3/packages/rfc3986/uri.py b/src/urllib3/packages/rfc3986/uri.py
index 244fff55..d1d71505 100644
--- a/src/urllib3/packages/rfc3986/uri.py
+++ b/src/urllib3/packages/rfc3986/uri.py
@@ -15,16 +15,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
-import warnings
from . import compat
-from . import exceptions as exc
from . import misc
from . import normalizers
-from . import validators
+from ._mixin import URIMixin
-class URIReference(namedtuple('URIReference', misc.URI_COMPONENTS)):
+class URIReference(namedtuple('URIReference', misc.URI_COMPONENTS), URIMixin):
"""Immutable object representing a parsed URI Reference.
.. note::
@@ -116,228 +114,6 @@ class URIReference(namedtuple('URIReference', misc.URI_COMPONENTS)):
naive_equality = tuple(self) == tuple(other_ref)
return naive_equality or self.normalized_equality(other_ref)
- @classmethod
- def from_string(cls, uri_string, encoding='utf-8'):
- """Parse a URI reference from the given unicode URI string.
-
- :param str uri_string: Unicode URI to be parsed into a reference.
- :param str encoding: The encoding of the string provided
- :returns: :class:`URIReference` or subclass thereof
- """
- uri_string = compat.to_str(uri_string, encoding)
-
- split_uri = misc.URI_MATCHER.match(uri_string).groupdict()
- return cls(
- split_uri['scheme'], split_uri['authority'],
- normalizers.encode_component(split_uri['path'], encoding),
- normalizers.encode_component(split_uri['query'], encoding),
- normalizers.encode_component(split_uri['fragment'], encoding),
- encoding,
- )
-
- def authority_info(self):
- """Return a dictionary with the ``userinfo``, ``host``, and ``port``.
-
- If the authority is not valid, it will raise a
- :class:`~rfc3986.exceptions.InvalidAuthority` Exception.
-
- :returns:
- ``{'userinfo': 'username:password', 'host': 'www.example.com',
- 'port': '80'}``
- :rtype: dict
- :raises rfc3986.exceptions.InvalidAuthority:
- If the authority is not ``None`` and can not be parsed.
- """
- if not self.authority:
- return {'userinfo': None, 'host': None, 'port': None}
-
- match = misc.SUBAUTHORITY_MATCHER.match(self.authority)
-
- if match is None:
- # In this case, we have an authority that was parsed from the URI
- # Reference, but it cannot be further parsed by our
- # misc.SUBAUTHORITY_MATCHER. In this case it must not be a valid
- # authority.
- raise exc.InvalidAuthority(self.authority.encode(self.encoding))
-
- # We had a match, now let's ensure that it is actually a valid host
- # address if it is IPv4
- matches = match.groupdict()
- host = matches.get('host')
-
- if (host and misc.IPv4_MATCHER.match(host) and not
- validators.valid_ipv4_host_address(host)):
- # If we have a host, it appears to be IPv4 and it does not have
- # valid bytes, it is an InvalidAuthority.
- raise exc.InvalidAuthority(self.authority.encode(self.encoding))
-
- return matches
-
- @property
- def host(self):
- """If present, a string representing the host."""
- try:
- authority = self.authority_info()
- except exc.InvalidAuthority:
- return None
- return authority['host']
-
- @property
- def port(self):
- """If present, the port extracted from the authority."""
- try:
- authority = self.authority_info()
- except exc.InvalidAuthority:
- return None
- return authority['port']
-
- @property
- def userinfo(self):
- """If present, the userinfo extracted from the authority."""
- try:
- authority = self.authority_info()
- except exc.InvalidAuthority:
- return None
- return authority['userinfo']
-
- def is_absolute(self):
- """Determine if this URI Reference is an absolute URI.
-
- See http://tools.ietf.org/html/rfc3986#section-4.3 for explanation.
-
- :returns: ``True`` if it is an absolute URI, ``False`` otherwise.
- :rtype: bool
- """
- return bool(misc.ABSOLUTE_URI_MATCHER.match(self.unsplit()))
-
- def is_valid(self, **kwargs):
- """Determine if the URI is valid.
-
- .. deprecated:: 1.1.0
-
- Use the :class:`~rfc3986.validators.Validator` object instead.
-
- :param bool require_scheme: Set to ``True`` if you wish to require the
- presence of the scheme component.
- :param bool require_authority: Set to ``True`` if you wish to require
- the presence of the authority component.
- :param bool require_path: Set to ``True`` if you wish to require the
- presence of the path component.
- :param bool require_query: Set to ``True`` if you wish to require the
- presence of the query component.
- :param bool require_fragment: Set to ``True`` if you wish to require
- the presence of the fragment component.
- :returns: ``True`` if the URI is valid. ``False`` otherwise.
- :rtype: bool
- """
- warnings.warn("Please use rfc3986.validators.Validator instead. "
- "This method will be eventually removed.",
- DeprecationWarning)
- validators = [
- (self.scheme_is_valid, kwargs.get('require_scheme', False)),
- (self.authority_is_valid, kwargs.get('require_authority', False)),
- (self.path_is_valid, kwargs.get('require_path', False)),
- (self.query_is_valid, kwargs.get('require_query', False)),
- (self.fragment_is_valid, kwargs.get('require_fragment', False)),
- ]
- return all(v(r) for v, r in validators)
-
- def authority_is_valid(self, require=False):
- """Determine if the authority component is valid.
-
- .. deprecated:: 1.1.0
-
- Use the :class:`~rfc3986.validators.Validator` object instead.
-
- :param bool require:
- Set to ``True`` to require the presence of this component.
- :returns:
- ``True`` if the authority is valid. ``False`` otherwise.
- :rtype:
- bool
- """
- warnings.warn("Please use rfc3986.validators.Validator instead. "
- "This method will be eventually removed.",
- DeprecationWarning)
- try:
- self.authority_info()
- except exc.InvalidAuthority:
- return False
-
- return validators.authority_is_valid(
- self.authority,
- host=self.host,
- require=require,
- )
-
- def scheme_is_valid(self, require=False):
- """Determine if the scheme component is valid.
-
- .. deprecated:: 1.1.0
-
- Use the :class:`~rfc3986.validators.Validator` object instead.
-
- :param str require: Set to ``True`` to require the presence of this
- component.
- :returns: ``True`` if the scheme is valid. ``False`` otherwise.
- :rtype: bool
- """
- warnings.warn("Please use rfc3986.validators.Validator instead. "
- "This method will be eventually removed.",
- DeprecationWarning)
- return validators.scheme_is_valid(self.scheme, require)
-
- def path_is_valid(self, require=False):
- """Determine if the path component is valid.
-
- .. deprecated:: 1.1.0
-
- Use the :class:`~rfc3986.validators.Validator` object instead.
-
- :param str require: Set to ``True`` to require the presence of this
- component.
- :returns: ``True`` if the path is valid. ``False`` otherwise.
- :rtype: bool
- """
- warnings.warn("Please use rfc3986.validators.Validator instead. "
- "This method will be eventually removed.",
- DeprecationWarning)
- return validators.path_is_valid(self.path, require)
-
- def query_is_valid(self, require=False):
- """Determine if the query component is valid.
-
- .. deprecated:: 1.1.0
-
- Use the :class:`~rfc3986.validators.Validator` object instead.
-
- :param str require: Set to ``True`` to require the presence of this
- component.
- :returns: ``True`` if the query is valid. ``False`` otherwise.
- :rtype: bool
- """
- warnings.warn("Please use rfc3986.validators.Validator instead. "
- "This method will be eventually removed.",
- DeprecationWarning)
- return validators.query_is_valid(self.query, require)
-
- def fragment_is_valid(self, require=False):
- """Determine if the fragment component is valid.
-
- .. deprecated:: 1.1.0
-
- Use the Validator object instead.
-
- :param str require: Set to ``True`` to require the presence of this
- component.
- :returns: ``True`` if the fragment is valid. ``False`` otherwise.
- :rtype: bool
- """
- warnings.warn("Please use rfc3986.validators.Validator instead. "
- "This method will be eventually removed.",
- DeprecationWarning)
- return validators.fragment_is_valid(self.fragment, require)
-
def normalize(self):
"""Normalize this reference as described in Section 6.2.2.
@@ -357,136 +133,21 @@ class URIReference(namedtuple('URIReference', misc.URI_COMPONENTS)):
normalizers.normalize_fragment(self.fragment),
self.encoding)
- def normalized_equality(self, other_ref):
- """Compare this URIReference to another URIReference.
+ @classmethod
+ def from_string(cls, uri_string, encoding='utf-8'):
+ """Parse a URI reference from the given unicode URI string.
- :param URIReference other_ref: (required), The reference with which
- we're comparing.
- :returns: ``True`` if the references are equal, ``False`` otherwise.
- :rtype: bool
+ :param str uri_string: Unicode URI to be parsed into a reference.
+ :param str encoding: The encoding of the string provided
+ :returns: :class:`URIReference` or subclass thereof
"""
- return tuple(self.normalize()) == tuple(other_ref.normalize())
-
- def resolve_with(self, base_uri, strict=False):
- """Use an absolute URI Reference to resolve this relative reference.
-
- Assuming this is a relative reference that you would like to resolve,
- use the provided base URI to resolve it.
-
- See http://tools.ietf.org/html/rfc3986#section-5 for more information.
+ uri_string = compat.to_str(uri_string, encoding)
- :param base_uri: Either a string or URIReference. It must be an
- absolute URI or it will raise an exception.
- :returns: A new URIReference which is the result of resolving this
- reference using ``base_uri``.
- :rtype: :class:`URIReference`
- :raises rfc3986.exceptions.ResolutionError:
- If the ``base_uri`` is not an absolute URI.
- """
- if not isinstance(base_uri, URIReference):
- base_uri = URIReference.from_string(base_uri)
-
- if not base_uri.is_absolute():
- raise exc.ResolutionError(base_uri)
-
- # This is optional per
- # http://tools.ietf.org/html/rfc3986#section-5.2.1
- base_uri = base_uri.normalize()
-
- # The reference we're resolving
- resolving = self
-
- if not strict and resolving.scheme == base_uri.scheme:
- resolving = resolving.copy_with(scheme=None)
-
- # http://tools.ietf.org/html/rfc3986#page-32
- if resolving.scheme is not None:
- target = resolving.copy_with(
- path=normalizers.normalize_path(resolving.path)
- )
- else:
- if resolving.authority is not None:
- target = resolving.copy_with(
- scheme=base_uri.scheme,
- path=normalizers.normalize_path(resolving.path)
- )
- else:
- if resolving.path is None:
- if resolving.query is not None:
- query = resolving.query
- else:
- query = base_uri.query
- target = resolving.copy_with(
- scheme=base_uri.scheme,
- authority=base_uri.authority,
- path=base_uri.path,
- query=query
- )
- else:
- if resolving.path.startswith('/'):
- path = normalizers.normalize_path(resolving.path)
- else:
- path = normalizers.normalize_path(
- misc.merge_paths(base_uri, resolving.path)
- )
- target = resolving.copy_with(
- scheme=base_uri.scheme,
- authority=base_uri.authority,
- path=path,
- query=resolving.query
- )
- return target
-
- def unsplit(self):
- """Create a URI string from the components.
-
- :returns: The URI Reference reconstituted as a string.
- :rtype: str
- """
- # See http://tools.ietf.org/html/rfc3986#section-5.3
- result_list = []
- if self.scheme:
- result_list.extend([self.scheme, ':'])
- if self.authority:
- result_list.extend(['//', self.authority])
- if self.path:
- result_list.append(self.path)
- if self.query is not None:
- result_list.extend(['?', self.query])
- if self.fragment is not None:
- result_list.extend(['#', self.fragment])
- return ''.join(result_list)
-
- def copy_with(self, scheme=misc.UseExisting, authority=misc.UseExisting,
- path=misc.UseExisting, query=misc.UseExisting,
- fragment=misc.UseExisting):
- """Create a copy of this reference with the new components.
-
- :param str scheme:
- (optional) The scheme to use for the new reference.
- :param str authority:
- (optional) The authority to use for the new reference.
- :param str path:
- (optional) The path to use for the new reference.
- :param str query:
- (optional) The query to use for the new reference.
- :param str fragment:
- (optional) The fragment to use for the new reference.
- :returns:
- New URIReference with provided components.
- :rtype:
- URIReference
- """
- attributes = {
- 'scheme': scheme,
- 'authority': authority,
- 'path': path,
- 'query': query,
- 'fragment': fragment,
- }
- for key, value in list(attributes.items()):
- if value is misc.UseExisting:
- del attributes[key]
- uri = self._replace(**attributes)
- uri.encoding = self.encoding
- return uri
+ split_uri = misc.URI_MATCHER.match(uri_string).groupdict()
+ return cls(
+ split_uri['scheme'], split_uri['authority'],
+ normalizers.encode_component(split_uri['path'], encoding),
+ normalizers.encode_component(split_uri['query'], encoding),
+ normalizers.encode_component(split_uri['fragment'], encoding),
+ encoding,
+ )
diff --git a/src/urllib3/packages/rfc3986/validators.py b/src/urllib3/packages/rfc3986/validators.py
index c781325e..7fc97215 100644
--- a/src/urllib3/packages/rfc3986/validators.py
+++ b/src/urllib3/packages/rfc3986/validators.py
@@ -304,8 +304,28 @@ def authority_is_valid(authority, host=None, require=False):
bool
"""
validated = is_valid(authority, misc.SUBAUTHORITY_MATCHER, require)
+ if validated and host is not None:
+ return host_is_valid(host, require)
+ return validated
+
+
+def host_is_valid(host, require=False):
+ """Determine if the host string is valid.
+
+ :param str host:
+ The host to validate.
+ :param bool require:
+ (optional) Specify if host must not be None.
+ :returns:
+ ``True`` if valid, ``False`` otherwise
+ :rtype:
+ bool
+ """
+ validated = is_valid(host, misc.HOST_MATCHER, require)
if validated and host is not None and misc.IPv4_MATCHER.match(host):
return valid_ipv4_host_address(host)
+ elif validated and host is not None and misc.IPv6_MATCHER.match(host):
+ return misc.IPv6_NO_RFC4007_MATCHER.match(host) is not None
return validated
@@ -395,7 +415,9 @@ def subauthority_component_is_valid(uri, component):
# If we can parse the authority into sub-components and we're validating
# neither the host nor the port, we can assume it's valid.
- if component != 'port':
+ if component == 'host':
+ return host_is_valid(subauthority_dict['host'])
+ elif component != 'port':
return True
try:
diff --git a/src/urllib3/util/url.py b/src/urllib3/util/url.py
index e12278b5..0127e2fe 100644
--- a/src/urllib3/util/url.py
+++ b/src/urllib3/util/url.py
@@ -4,7 +4,8 @@ from collections import namedtuple
from ..exceptions import LocationParseError
from ..packages import six, rfc3986
-from ..packages.rfc3986.exceptions import RFC3986Exception
+from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError
+from ..packages.rfc3986.validators import Validator
url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
@@ -14,12 +15,12 @@ url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
NORMALIZABLE_SCHEMES = ('http', 'https', None)
# Regex for detecting URLs with schemes. RFC 3986 Section 3.1
-SCHEME_REGEX = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://")
+SCHEME_REGEX = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+\-]*:|/)")
class Url(namedtuple('Url', url_attrs)):
"""
- Datastructure for representing an HTTP URL. Used as a return value for
+ Data structure for representing an HTTP URL. Used as a return value for
:func:`parse_url`. Both the scheme and host are normalized as they are
both case-insensitive according to RFC 3986.
"""
@@ -29,10 +30,8 @@ class Url(namedtuple('Url', url_attrs)):
query=None, fragment=None):
if path and not path.startswith('/'):
path = '/' + path
- if scheme:
+ if scheme is not None:
scheme = scheme.lower()
- if host and scheme in NORMALIZABLE_SCHEMES:
- host = host.lower()
return super(Url, cls).__new__(cls, scheme, auth, host, port, path,
query, fragment)
@@ -78,23 +77,23 @@ class Url(namedtuple('Url', url_attrs)):
'http://username:password@host.com:80/path?query#fragment'
"""
scheme, auth, host, port, path, query, fragment = self
- url = ''
+ url = u''
# We use "is not None" because we want things to happen with empty strings (or 0 port)
if scheme is not None:
- url += scheme + '://'
+ url += scheme + u'://'
if auth is not None:
- url += auth + '@'
+ url += auth + u'@'
if host is not None:
url += host
if port is not None:
- url += ':' + str(port)
+ url += u':' + str(port)
if path is not None:
url += path
if query is not None:
- url += '?' + query
+ url += u'?' + query
if fragment is not None:
- url += '#' + fragment
+ url += u'#' + fragment
return url
@@ -104,7 +103,7 @@ class Url(namedtuple('Url', url_attrs)):
def split_first(s, delims):
"""
- Deprecated. No longer used by parse_url().
+ .. deprecated:: 1.25
Given a string and an iterable of delimiters, split on the first found
delimiter. Return two split parts and the matched delimiter.
@@ -161,6 +160,8 @@ def parse_url(url):
return Url()
is_string = not isinstance(url, six.binary_type)
+ if not is_string:
+ url = url.decode("utf-8")
# RFC 3986 doesn't like URLs that have a host but don't start
# with a scheme and we support URLs like that so we need to
@@ -171,22 +172,53 @@ def parse_url(url):
url = "//" + url
try:
- parse_result = rfc3986.urlparse(url, encoding="utf-8")
+ iri_ref = rfc3986.IRIReference.from_string(url, encoding="utf-8")
except (ValueError, RFC3986Exception):
+ six.raise_from(LocationParseError(url), None)
+
+ def idna_encode(name):
+ if name and any([ord(x) > 128 for x in name]):
+ try:
+ import idna
+ except ImportError:
+ raise LocationParseError("Unable to parse URL without the 'idna' module")
+ try:
+ return idna.encode(name, strict=True, std3_rules=True).lower()
+ except idna.IDNAError:
+ raise LocationParseError(u"Name '%s' is not a valid IDNA label" % name)
+ return name
+
+ has_authority = iri_ref.authority is not None
+ uri_ref = iri_ref.encode(idna_encoder=idna_encode)
+
+ # rfc3986 strips the authority if it's invalid
+ if has_authority and uri_ref.authority is None:
raise LocationParseError(url)
- # RFC 3986 doesn't assert ports must be non-negative.
- if parse_result.port and parse_result.port < 0:
- raise LocationParseError(url)
+ # Only normalize schemes we understand to not break http+unix
+ # or other schemes that don't follow RFC 3986.
+ if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES:
+ uri_ref = uri_ref.normalize()
+
+ # Validate all URIReference components and ensure that all
+ # components that were set before are still set after
+ # normalization has completed.
+ validator = Validator()
+ try:
+ validator.check_validity_of(
+ *validator.COMPONENT_NAMES
+ ).validate(uri_ref)
+ except ValidationError:
+ six.raise_from(LocationParseError(url), None)
# For the sake of backwards compatibility we put empty
# string values for path if there are any defined values
# beyond the path in the URL.
# TODO: Remove this when we break backwards compatibility.
- path = parse_result.path
+ path = uri_ref.path
if not path:
- if (parse_result.query is not None
- or parse_result.fragment is not None):
+ if (uri_ref.query is not None
+ or uri_ref.fragment is not None):
path = ""
else:
path = None
@@ -201,13 +233,13 @@ def parse_url(url):
return x
return Url(
- scheme=to_input_type(parse_result.scheme),
- auth=to_input_type(parse_result.userinfo),
- host=to_input_type(parse_result.hostname),
- port=parse_result.port,
+ scheme=to_input_type(uri_ref.scheme),
+ auth=to_input_type(uri_ref.userinfo),
+ host=to_input_type(uri_ref.host),
+ port=int(uri_ref.port) if uri_ref.port is not None else None,
path=to_input_type(path),
- query=to_input_type(parse_result.query),
- fragment=to_input_type(parse_result.fragment)
+ query=to_input_type(uri_ref.query),
+ fragment=to_input_type(uri_ref.fragment)
)
diff --git a/test/test_util.py b/test/test_util.py
index ac527355..b8ab2e68 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -131,12 +131,24 @@ class TestUtil(object):
with pytest.raises(LocationParseError):
get_host(location)
+ @pytest.mark.parametrize('url', [
+ 'http://user\\@google.com',
+ 'http://google\\.com',
+ 'user\\@google.com',
+ 'http://google.com#fragment#',
+ 'http://user@user@google.com/',
+ ])
+ def test_invalid_url(self, url):
+ with pytest.raises(LocationParseError):
+ parse_url(url)
+
@pytest.mark.parametrize('url, expected_normalized_url', [
('HTTP://GOOGLE.COM/MAIL/', 'http://google.com/MAIL/'),
('HTTP://JeremyCline:Hunter2@Example.com:8080/',
'http://JeremyCline:Hunter2@example.com:8080/'),
('HTTPS://Example.Com/?Key=Value', 'https://example.com/?Key=Value'),
('Https://Example.Com/#Fragment', 'https://example.com/#Fragment'),
+ ('[::Ff%etH0%Ff]/%ab%Af', '[::ff%25etH0%Ff]/%AB%AF'),
])
def test_parse_url_normalization(self, url, expected_normalized_url):
"""Assert parse_url normalizes the scheme/host, and only the scheme/host"""
@@ -155,8 +167,7 @@ class TestUtil(object):
# Path/query/fragment
('', Url()),
('/', Url(path='/')),
- ('/abc/../def', Url(path="/abc/../def")),
- ('#?/!google.com/?foo#bar', Url(path='', fragment='?/!google.com/?foo#bar')),
+ ('#?/!google.com/?foo', Url(path='', fragment='?/!google.com/?foo')),
('/foo', Url(path='/foo')),
('/foo?bar=baz', Url(path='/foo', query='bar=baz')),
('/foo?bar=baz#banana?apple/orange', Url(path='/foo',
@@ -173,10 +184,10 @@ class TestUtil(object):
# Auth
('http://foo:bar@localhost/', Url('http', auth='foo:bar', host='localhost', path='/')),
('http://foo@localhost/', Url('http', auth='foo', host='localhost', path='/')),
- ('http://foo:bar@baz@localhost/', Url('http',
- auth='foo:bar@baz',
- host='localhost',
- path='/')),
+ ('http://foo:bar@localhost/', Url('http',
+ auth='foo:bar',
+ host='localhost',
+ path='/')),
# Unicode type (Python 2.x)
(u'http://foo:bar@localhost/', Url(u'http',
@@ -194,6 +205,9 @@ class TestUtil(object):
('?', Url(path='', query='')),
('#', Url(path='', fragment='')),
+ # Path normalization
+ ('/abc/../def', Url(path="/def")),
+
# Empty Port
('http://google.com:', Url('http', host='google.com')),
('http://google.com:/', Url('http', host='google.com', path='/')),
@@ -211,6 +225,23 @@ class TestUtil(object):
def test_unparse_url(self, url, expected_url):
assert url == expected_url.url
+ @pytest.mark.parametrize(
+ ['url', 'expected_url'],
+ [
+ # RFC 3986 5.2.4
+ ('/abc/../def', Url(path="/def")),
+ ('/..', Url(path="/")),
+ ('/./abc/./def/', Url(path='/abc/def/')),
+ ('/.', Url(path='/')),
+ ('/./', Url(path='/')),
+ ('/abc/./.././d/././e/.././f/./../../ghi', Url(path='/ghi'))
+ ]
+ )
+ def test_parse_and_normalize_url_paths(self, url, expected_url):
+ actual_url = parse_url(url)
+ assert actual_url == expected_url
+ assert actual_url.url == expected_url.url
+
def test_parse_url_invalid_IPv6(self):
with pytest.raises(LocationParseError):
parse_url('[::1')
@@ -260,12 +291,36 @@ class TestUtil(object):
# CVE-2016-5699
("http://127.0.0.1%0d%0aConnection%3a%20keep-alive",
- Url("http", host="127.0.0.1%0d%0aConnection%3a%20keep-alive")),
+ Url("http", host="127.0.0.1%0d%0aconnection%3a%20keep-alive")),
# NodeJS unicode -> double dot
(u"http://google.com/\uff2e\uff2e/abc", Url("http",
host="google.com",
- path='/%ef%bc%ae%ef%bc%ae/abc'))
+ path='/%EF%BC%AE%EF%BC%AE/abc')),
+
+ # Scheme without ://
+ ("javascript:a='@google.com:12345/';alert(0)",
+ Url(scheme="javascript",
+ path="a='@google.com:12345/';alert(0)")),
+
+ ("//google.com/a/b/c", Url(host="google.com", path="/a/b/c")),
+
+ # International URLs
+ (u'http://ヒ:キ@ヒ.abc.ニ/ヒ?キ#ワ', Url(u'http',
+ host=u'xn--pdk.abc.xn--idk',
+ auth=u'%E3%83%92:%E3%82%AD',
+ path=u'/%E3%83%92',
+ query=u'%E3%82%AD',
+ fragment=u'%E3%83%AF')),
+
+ # Injected headers (CVE-2016-5699, CVE-2019-9740, CVE-2019-9947)
+ ("10.251.0.83:7777?a=1 HTTP/1.1\r\nX-injected: header",
+ Url(host='10.251.0.83', port=7777, path='',
+ query='a=1%20HTTP/1.1%0D%0AX-injected:%20header')),
+
+ ("http://127.0.0.1:6379?\r\nSET test failure12\r\n:8080/test/?test=a",
+ Url(scheme='http', host='127.0.0.1', port=6379, path='',
+ query='%0D%0ASET%20test%20failure12%0D%0A:8080/test/?test=a')),
]
@pytest.mark.parametrize("url, expected_url", url_vulnerabilities)