diff options
author | Robb <robbt@azone.org> | 2019-03-22 23:52:12 -0400 |
---|---|---|
committer | Seth Michael Larson <sethmichaellarson@gmail.com> | 2019-03-22 22:52:12 -0500 |
commit | 46331f94275a4c3b4c71a358a495b1caeaececa0 (patch) | |
tree | 77944873c5b238d7af23662a5526457717485030 | |
parent | edfd3450c6ab8a9d6e1ecc441c665b0afd4084ba (diff) | |
download | urllib3-46331f94275a4c3b4c71a358a495b1caeaececa0.tar.gz |
Encode field names using HTML5 by default instead of RFC 2231 (#1492)
-rw-r--r-- | CHANGES.rst | 2 | ||||
-rw-r--r-- | dummyserver/handlers.py | 60 | ||||
-rw-r--r-- | src/urllib3/fields.py | 140 | ||||
-rw-r--r-- | test/test_fields.py | 47 | ||||
-rw-r--r-- | tox.ini | 2 |
5 files changed, 169 insertions, 82 deletions
diff --git a/CHANGES.rst b/CHANGES.rst index 29c2676f..013d119b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -30,6 +30,8 @@ dev (master) * Add support for IPv6 addresses in subjectAltName section of certificates. (Issue #1269) +* Switched the default multipart header encoder from RFC 2231 to HTML 5 working draft. (Issue #303, PR #1492) + * ... [Short description of non-trivial change.] (Issue #) diff --git a/dummyserver/handlers.py b/dummyserver/handlers.py index f570d881..146241dc 100644 --- a/dummyserver/handlers.py +++ b/dummyserver/handlers.py @@ -17,6 +17,7 @@ from datetime import timedelta from urllib3.packages.six.moves.http_client import responses from urllib3.packages.six.moves.urllib.parse import urlsplit +from urllib3.packages.six import binary_type log = logging.getLogger(__name__) @@ -157,10 +158,15 @@ class TestingApp(RequestHandler): return Response("Wrong size: %d != %d" % (size, len(data)), status='400 Bad Request') - if filename != file_['filename']: - return Response("Wrong filename: %s != %s" % - (filename, file_.filename), - status='400 Bad Request') + got_filename = file_['filename'] + if(isinstance(got_filename, binary_type)): + got_filename = got_filename.decode('utf-8') + + # Tornado can leave the trailing \n in place on the filename. + if filename != got_filename: + return Response( + u"Wrong filename: %s != %s" % (filename, file_.filename), + status='400 Bad Request') return Response() @@ -304,49 +310,3 @@ class TestingApp(RequestHandler): def shutdown(self, request): sys.exit() - - -# RFC2231-aware replacement of internal tornado function -def _parse_header(line): - r"""Parse a Content-type like header. - - Return the main content-type and a dictionary of options. - - >>> d = _parse_header("CD: fd; foo=\"bar\"; file*=utf-8''T%C3%A4st")[1] - >>> d['file'] == 'T\u00e4st' - True - >>> d['foo'] - 'bar' - """ - import tornado.httputil - import email.utils - from urllib3.packages import six - if not six.PY3: - line = line.encode('utf-8') - parts = tornado.httputil._parseparam(';' + line) - key = next(parts) - # decode_params treats first argument special, but we already stripped key - params = [('Dummy', 'value')] - for p in parts: - i = p.find('=') - if i >= 0: - name = p[:i].strip().lower() - value = p[i + 1:].strip() - params.append((name, value)) - params = email.utils.decode_params(params) - params.pop(0) # get rid of the dummy again - pdict = {} - for name, value in params: - value = email.utils.collapse_rfc2231_value(value) - if len(value) >= 2 and value[0] == '"' and value[-1] == '"': - value = value[1:-1] - pdict[name] = value - return key, pdict - - -# TODO: make the following conditional as soon as we know a version -# which does not require this fix. -# See https://github.com/facebook/tornado/issues/868 -if True: - import tornado.httputil - tornado.httputil._parse_header = _parse_header diff --git a/src/urllib3/fields.py b/src/urllib3/fields.py index 37fe64a3..6a9a5a7f 100644 --- a/src/urllib3/fields.py +++ b/src/urllib3/fields.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import email.utils import mimetypes +import re from .packages import six @@ -19,57 +20,147 @@ def guess_content_type(filename, default='application/octet-stream'): return default -def format_header_param(name, value): +def format_header_param_rfc2231(name, value): """ - Helper function to format and quote a single header parameter. + Helper function to format and quote a single header parameter using the + strategy defined in RFC 2231. Particularly useful for header parameters which might contain - non-ASCII values, like file names. This follows RFC 2231, as - suggested by RFC 2388 Section 4.4. + non-ASCII values, like file names. This follows RFC 2388 Section 4.4. :param name: The name of the parameter, a string expected to be ASCII only. :param value: - The value of the parameter, provided as a unicode string. + The value of the parameter, provided as ``bytes`` or `str``. + :ret: + An RFC-2231-formatted unicode string. """ + if isinstance(value, six.binary_type): + value = value.decode("utf-8") + if not any(ch in value for ch in '"\\\r\n'): - result = '%s="%s"' % (name, value) + result = u'%s="%s"' % (name, value) try: result.encode('ascii') except (UnicodeEncodeError, UnicodeDecodeError): pass else: return result - if not six.PY3 and isinstance(value, six.text_type): # Python 2: + + if not six.PY3: # Python 2: value = value.encode('utf-8') + + # encode_rfc2231 accepts an encoded string and returns an ascii-encoded + # string in Python 2 but accepts and returns unicode strings in Python 3 value = email.utils.encode_rfc2231(value, 'utf-8') value = '%s*=%s' % (name, value) + + if not six.PY3: # Python 2: + value = value.decode('utf-8') + return value +_HTML5_REPLACEMENTS = { + u"\u0022": u"%22", + # Replace "\" with "\\". + u"\u005C": u"\u005C\u005C", + u"\u005C": u"\u005C\u005C", +} + +# All control characters from 0x00 to 0x1F *except* 0x1B. +_HTML5_REPLACEMENTS.update({ + six.unichr(cc): u"%{:02X}".format(cc) + for cc + in range(0x00, 0x1F+1) + if cc not in (0x1B,) +}) + + +def _replace_multiple(value, needles_and_replacements): + + def replacer(match): + return needles_and_replacements[match.group(0)] + + pattern = re.compile( + r"|".join([ + re.escape(needle) for needle in needles_and_replacements.keys() + ]) + ) + + result = pattern.sub(replacer, value) + + return result + + +def format_header_param_html5(name, value): + """ + Helper function to format and quote a single header parameter using the + HTML5 strategy. + + Particularly useful for header parameters which might contain + non-ASCII values, like file names. This follows the `HTML5 Working Draft + Section 4.10.22.7`_ and matches the behavior of curl and modern browsers. + + .. _HTML5 Working Draft Section 4.10.22.7: + https://w3c.github.io/html/sec-forms.html#multipart-form-data + + :param name: + The name of the parameter, a string expected to be ASCII only. + :param value: + The value of the parameter, provided as ``bytes`` or `str``. + :ret: + A unicode string, stripped of troublesome characters. + """ + if isinstance(value, six.binary_type): + value = value.decode("utf-8") + + value = _replace_multiple(value, _HTML5_REPLACEMENTS) + + return u'%s="%s"' % (name, value) + + +# For backwards-compatibility. +format_header_param = format_header_param_html5 + + class RequestField(object): """ A data container for request body parameters. :param name: - The name of this request field. + The name of this request field. Must be unicode. :param data: The data/value body. :param filename: - An optional filename of the request field. + An optional filename of the request field. Must be unicode. :param headers: An optional dict-like object of headers to initially use for the field. + :param header_formatter: + An optional callable that is used to encode and format the headers. By + default, this is :func:`format_header_param_html5`. """ - def __init__(self, name, data, filename=None, headers=None): + def __init__( + self, + name, + data, + filename=None, + headers=None, + header_formatter=format_header_param_html5): self._name = name self._filename = filename self.data = data self.headers = {} if headers: self.headers = dict(headers) + self.header_formatter = header_formatter @classmethod - def from_tuples(cls, fieldname, value): + def from_tuples( + cls, + fieldname, + value, + header_formatter=format_header_param_html5): """ A :class:`~urllib3.fields.RequestField` factory from old-style tuple parameters. @@ -97,21 +188,24 @@ class RequestField(object): content_type = None data = value - request_param = cls(fieldname, data, filename=filename) + request_param = cls( + fieldname, data, filename=filename, header_formatter=header_formatter) request_param.make_multipart(content_type=content_type) return request_param def _render_part(self, name, value): """ - Overridable helper function to format a single header parameter. + Overridable helper function to format a single header parameter. By + default, this calls ``self.header_formatter``. :param name: The name of the parameter, a string expected to be ASCII only. :param value: The value of the parameter, provided as a unicode string. """ - return format_header_param(name, value) + + return self.header_formatter(name, value) def _render_parts(self, header_parts): """ @@ -133,7 +227,7 @@ class RequestField(object): if value is not None: parts.append(self._render_part(name, value)) - return '; '.join(parts) + return u'; '.join(parts) def render_headers(self): """ @@ -144,15 +238,15 @@ class RequestField(object): sort_keys = ['Content-Disposition', 'Content-Type', 'Content-Location'] for sort_key in sort_keys: if self.headers.get(sort_key, False): - lines.append('%s: %s' % (sort_key, self.headers[sort_key])) + lines.append(u'%s: %s' % (sort_key, self.headers[sort_key])) for header_name, header_value in self.headers.items(): if header_name not in sort_keys: if header_value: - lines.append('%s: %s' % (header_name, header_value)) + lines.append(u'%s: %s' % (header_name, header_value)) - lines.append('\r\n') - return '\r\n'.join(lines) + lines.append(u'\r\n') + return u'\r\n'.join(lines) def make_multipart(self, content_disposition=None, content_type=None, content_location=None): @@ -168,10 +262,10 @@ class RequestField(object): The 'Content-Location' of the request body. """ - self.headers['Content-Disposition'] = content_disposition or 'form-data' - self.headers['Content-Disposition'] += '; '.join([ - '', self._render_parts( - (('name', self._name), ('filename', self._filename)) + self.headers['Content-Disposition'] = content_disposition or u'form-data' + self.headers['Content-Disposition'] += u'; '.join([ + u'', self._render_parts( + ((u'name', self._name), (u'filename', self._filename)) ) ]) self.headers['Content-Type'] = content_type diff --git a/test/test_fields.py b/test/test_fields.py index e944ec43..72e70b8e 100644 --- a/test/test_fields.py +++ b/test/test_fields.py @@ -1,8 +1,7 @@ import pytest -from urllib3.fields import guess_content_type, RequestField +from urllib3.fields import format_header_param_rfc2231, guess_content_type, RequestField from urllib3.packages.six import u -from . import onlyPy2 class TestRequestField(object): @@ -53,13 +52,45 @@ class TestRequestField(object): parts = field._render_parts([('name', 'value'), ('filename', 'value')]) assert parts == 'name="value"; filename="value"' - def test_render_part(self): - field = RequestField('somename', 'data') + def test_render_part_rfc2231_unicode(self): + field = RequestField('somename', 'data', header_formatter=format_header_param_rfc2231) param = field._render_part('filename', u('n\u00e4me')) assert param == "filename*=utf-8''n%C3%A4me" - @onlyPy2 - def test_render_unicode_bytes_py2(self): + def test_render_part_rfc2231_ascii(self): + field = RequestField('somename', 'data', header_formatter=format_header_param_rfc2231) + param = field._render_part('filename', b'name') + assert param == 'filename="name"' + + def test_render_part_html5_unicode(self): field = RequestField('somename', 'data') - param = field._render_part('filename', 'n\xc3\xa4me') - assert param == "filename*=utf-8''n%C3%A4me" + param = field._render_part('filename', u('n\u00e4me')) + assert param == u('filename="n\u00e4me"') + + def test_render_part_html5_ascii(self): + field = RequestField('somename', 'data') + param = field._render_part('filename', b'name') + assert param == 'filename="name"' + + def test_render_part_html5_unicode_escape(self): + field = RequestField('somename', 'data') + param = field._render_part('filename', u('hello\\world\u0022')) + assert param == u('filename="hello\\\\world%22"') + + def test_render_part_html5_unicode_with_control_character(self): + field = RequestField('somename', 'data') + param = field._render_part('filename', u('hello\x1A\x1B\x1C')) + assert param == u('filename="hello%1A\x1B%1C"') + + def test_from_tuples_rfc2231(self): + field = RequestField.from_tuples( + u('fieldname'), + (u('filen\u00e4me'), 'data'), + header_formatter=format_header_param_rfc2231) + cd = field.headers['Content-Disposition'] + assert (cd == u("form-data; name=\"fieldname\"; filename*=utf-8''filen%C3%A4me")) + + def test_from_tuples_html5(self): + field = RequestField.from_tuples(u('fieldname'), (u('filen\u00e4me'), 'data')) + cd = field.headers['Content-Disposition'] + assert (cd == u('form-data; name="fieldname"; filename="filen\u00e4me"')) @@ -41,7 +41,7 @@ setenv = passenv = TRAVIS TRAVIS_INFRA [testenv:flake8-py3] -basepython = python3.4 +basepython = python3 deps= flake8 commands= |