summary refs log tree commit diff
diff options
context:
space:
mode:
author Robb <robbt@azone.org> 2019-03-22 23:52:12 -0400
committer Seth Michael Larson <sethmichaellarson@gmail.com> 2019-03-22 22:52:12 -0500
commit 46331f94275a4c3b4c71a358a495b1caeaececa0 (patch)
tree 77944873c5b238d7af23662a5526457717485030
parent edfd3450c6ab8a9d6e1ecc441c665b0afd4084ba (diff)
download urllib3-46331f94275a4c3b4c71a358a495b1caeaececa0.tar.gz
Encode field names using HTML5 by default instead of RFC 2231 (#1492)
-rw-r--r-- CHANGES.rst 2
-rw-r--r-- dummyserver/handlers.py 60
-rw-r--r-- src/urllib3/fields.py 140
-rw-r--r-- test/test_fields.py 47
-rw-r--r-- tox.ini 2
5 files changed, 169 insertions, 82 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 29c2676f..013d119b 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -30,6 +30,8 @@ dev (master)
* Add support for IPv6 addresses in subjectAltName section of certificates. (Issue #1269)
+* Switched the default multipart header encoder from RFC 2231 to HTML 5 working draft. (Issue #303, PR #1492)
+
* ... [Short description of non-trivial change.] (Issue #)
diff --git a/dummyserver/handlers.py b/dummyserver/handlers.py
index f570d881..146241dc 100644
--- a/dummyserver/handlers.py
+++ b/dummyserver/handlers.py
@@ -17,6 +17,7 @@ from datetime import timedelta
from urllib3.packages.six.moves.http_client import responses
from urllib3.packages.six.moves.urllib.parse import urlsplit
+from urllib3.packages.six import binary_type
log = logging.getLogger(__name__)
@@ -157,10 +158,15 @@ class TestingApp(RequestHandler):
return Response("Wrong size: %d != %d" %
(size, len(data)), status='400 Bad Request')
- if filename != file_['filename']:
- return Response("Wrong filename: %s != %s" %
- (filename, file_.filename),
- status='400 Bad Request')
+ got_filename = file_['filename']
+ if(isinstance(got_filename, binary_type)):
+ got_filename = got_filename.decode('utf-8')
+
+ # Tornado can leave the trailing \n in place on the filename.
+ if filename != got_filename:
+ return Response(
+ u"Wrong filename: %s != %s" % (filename, file_.filename),
+ status='400 Bad Request')
return Response()
@@ -304,49 +310,3 @@ class TestingApp(RequestHandler):
def shutdown(self, request):
sys.exit()
-
-
-# RFC2231-aware replacement of internal tornado function
-def _parse_header(line):
- r"""Parse a Content-type like header.
-
- Return the main content-type and a dictionary of options.
-
- >>> d = _parse_header("CD: fd; foo=\"bar\"; file*=utf-8''T%C3%A4st")[1]
- >>> d['file'] == 'T\u00e4st'
- True
- >>> d['foo']
- 'bar'
- """
- import tornado.httputil
- import email.utils
- from urllib3.packages import six
- if not six.PY3:
- line = line.encode('utf-8')
- parts = tornado.httputil._parseparam(';' + line)
- key = next(parts)
- # decode_params treats first argument special, but we already stripped key
- params = [('Dummy', 'value')]
- for p in parts:
- i = p.find('=')
- if i >= 0:
- name = p[:i].strip().lower()
- value = p[i + 1:].strip()
- params.append((name, value))
- params = email.utils.decode_params(params)
- params.pop(0) # get rid of the dummy again
- pdict = {}
- for name, value in params:
- value = email.utils.collapse_rfc2231_value(value)
- if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
- value = value[1:-1]
- pdict[name] = value
- return key, pdict
-
-
-# TODO: make the following conditional as soon as we know a version
-# which does not require this fix.
-# See https://github.com/facebook/tornado/issues/868
-if True:
- import tornado.httputil
- tornado.httputil._parse_header = _parse_header
diff --git a/src/urllib3/fields.py b/src/urllib3/fields.py
index 37fe64a3..6a9a5a7f 100644
--- a/src/urllib3/fields.py
+++ b/src/urllib3/fields.py
@@ -1,6 +1,7 @@
from __future__ import absolute_import
import email.utils
import mimetypes
+import re
from .packages import six
@@ -19,57 +20,147 @@ def guess_content_type(filename, default='application/octet-stream'):
return default
-def format_header_param(name, value):
+def format_header_param_rfc2231(name, value):
"""
- Helper function to format and quote a single header parameter.
+ Helper function to format and quote a single header parameter using the
+ strategy defined in RFC 2231.
Particularly useful for header parameters which might contain
- non-ASCII values, like file names. This follows RFC 2231, as
- suggested by RFC 2388 Section 4.4.
+ non-ASCII values, like file names. This follows RFC 2388 Section 4.4.
:param name:
The name of the parameter, a string expected to be ASCII only.
:param value:
- The value of the parameter, provided as a unicode string.
+ The value of the parameter, provided as ``bytes`` or ``str``.
+ :ret:
+ An RFC-2231-formatted unicode string.
"""
+ if isinstance(value, six.binary_type):
+ value = value.decode("utf-8")
+
if not any(ch in value for ch in '"\\\r\n'):
- result = '%s="%s"' % (name, value)
+ result = u'%s="%s"' % (name, value)
try:
result.encode('ascii')
except (UnicodeEncodeError, UnicodeDecodeError):
pass
else:
return result
- if not six.PY3 and isinstance(value, six.text_type): # Python 2:
+
+ if not six.PY3: # Python 2:
value = value.encode('utf-8')
+
+ # encode_rfc2231 accepts an encoded string and returns an ascii-encoded
+ # string in Python 2 but accepts and returns unicode strings in Python 3
value = email.utils.encode_rfc2231(value, 'utf-8')
value = '%s*=%s' % (name, value)
+
+ if not six.PY3: # Python 2:
+ value = value.decode('utf-8')
+
return value
+_HTML5_REPLACEMENTS = {
+ u"\u0022": u"%22",
+ # Replace "\" with "\\".
+ u"\u005C": u"\u005C\u005C",
+ u"\u005C": u"\u005C\u005C",
+}
+
+# All control characters from 0x00 to 0x1F *except* 0x1B.
+_HTML5_REPLACEMENTS.update({
+ six.unichr(cc): u"%{:02X}".format(cc)
+ for cc
+ in range(0x00, 0x1F+1)
+ if cc not in (0x1B,)
+})
+
+
+def _replace_multiple(value, needles_and_replacements):
+
+ def replacer(match):
+ return needles_and_replacements[match.group(0)]
+
+ pattern = re.compile(
+ r"|".join([
+ re.escape(needle) for needle in needles_and_replacements.keys()
+ ])
+ )
+
+ result = pattern.sub(replacer, value)
+
+ return result
+
+
+def format_header_param_html5(name, value):
+ """
+ Helper function to format and quote a single header parameter using the
+ HTML5 strategy.
+
+ Particularly useful for header parameters which might contain
+ non-ASCII values, like file names. This follows the `HTML5 Working Draft
+ Section 4.10.22.7`_ and matches the behavior of curl and modern browsers.
+
+ .. _HTML5 Working Draft Section 4.10.22.7:
+ https://w3c.github.io/html/sec-forms.html#multipart-form-data
+
+ :param name:
+ The name of the parameter, a string expected to be ASCII only.
+ :param value:
+ The value of the parameter, provided as ``bytes`` or ``str``.
+ :ret:
+ A unicode string, stripped of troublesome characters.
+ """
+ if isinstance(value, six.binary_type):
+ value = value.decode("utf-8")
+
+ value = _replace_multiple(value, _HTML5_REPLACEMENTS)
+
+ return u'%s="%s"' % (name, value)
+
+
+# For backwards-compatibility.
+format_header_param = format_header_param_html5
+
+
class RequestField(object):
"""
A data container for request body parameters.
:param name:
- The name of this request field.
+ The name of this request field. Must be unicode.
:param data:
The data/value body.
:param filename:
- An optional filename of the request field.
+ An optional filename of the request field. Must be unicode.
:param headers:
An optional dict-like object of headers to initially use for the field.
+ :param header_formatter:
+ An optional callable that is used to encode and format the headers. By
+ default, this is :func:`format_header_param_html5`.
"""
- def __init__(self, name, data, filename=None, headers=None):
+ def __init__(
+ self,
+ name,
+ data,
+ filename=None,
+ headers=None,
+ header_formatter=format_header_param_html5):
self._name = name
self._filename = filename
self.data = data
self.headers = {}
if headers:
self.headers = dict(headers)
+ self.header_formatter = header_formatter
@classmethod
- def from_tuples(cls, fieldname, value):
+ def from_tuples(
+ cls,
+ fieldname,
+ value,
+ header_formatter=format_header_param_html5):
"""
A :class:`~urllib3.fields.RequestField` factory from old-style tuple parameters.
@@ -97,21 +188,24 @@ class RequestField(object):
content_type = None
data = value
- request_param = cls(fieldname, data, filename=filename)
+ request_param = cls(
+ fieldname, data, filename=filename, header_formatter=header_formatter)
request_param.make_multipart(content_type=content_type)
return request_param
def _render_part(self, name, value):
"""
- Overridable helper function to format a single header parameter.
+ Overridable helper function to format a single header parameter. By
+ default, this calls ``self.header_formatter``.
:param name:
The name of the parameter, a string expected to be ASCII only.
:param value:
The value of the parameter, provided as a unicode string.
"""
- return format_header_param(name, value)
+
+ return self.header_formatter(name, value)
def _render_parts(self, header_parts):
"""
@@ -133,7 +227,7 @@ class RequestField(object):
if value is not None:
parts.append(self._render_part(name, value))
- return '; '.join(parts)
+ return u'; '.join(parts)
def render_headers(self):
"""
@@ -144,15 +238,15 @@ class RequestField(object):
sort_keys = ['Content-Disposition', 'Content-Type', 'Content-Location']
for sort_key in sort_keys:
if self.headers.get(sort_key, False):
- lines.append('%s: %s' % (sort_key, self.headers[sort_key]))
+ lines.append(u'%s: %s' % (sort_key, self.headers[sort_key]))
for header_name, header_value in self.headers.items():
if header_name not in sort_keys:
if header_value:
- lines.append('%s: %s' % (header_name, header_value))
+ lines.append(u'%s: %s' % (header_name, header_value))
- lines.append('\r\n')
- return '\r\n'.join(lines)
+ lines.append(u'\r\n')
+ return u'\r\n'.join(lines)
def make_multipart(self, content_disposition=None, content_type=None,
content_location=None):
@@ -168,10 +262,10 @@ class RequestField(object):
The 'Content-Location' of the request body.
"""
- self.headers['Content-Disposition'] = content_disposition or 'form-data'
- self.headers['Content-Disposition'] += '; '.join([
- '', self._render_parts(
- (('name', self._name), ('filename', self._filename))
+ self.headers['Content-Disposition'] = content_disposition or u'form-data'
+ self.headers['Content-Disposition'] += u'; '.join([
+ u'', self._render_parts(
+ ((u'name', self._name), (u'filename', self._filename))
)
])
self.headers['Content-Type'] = content_type
diff --git a/test/test_fields.py b/test/test_fields.py
index e944ec43..72e70b8e 100644
--- a/test/test_fields.py
+++ b/test/test_fields.py
@@ -1,8 +1,7 @@
import pytest
-from urllib3.fields import guess_content_type, RequestField
+from urllib3.fields import format_header_param_rfc2231, guess_content_type, RequestField
from urllib3.packages.six import u
-from . import onlyPy2
class TestRequestField(object):
@@ -53,13 +52,45 @@ class TestRequestField(object):
parts = field._render_parts([('name', 'value'), ('filename', 'value')])
assert parts == 'name="value"; filename="value"'
- def test_render_part(self):
- field = RequestField('somename', 'data')
+ def test_render_part_rfc2231_unicode(self):
+ field = RequestField('somename', 'data', header_formatter=format_header_param_rfc2231)
param = field._render_part('filename', u('n\u00e4me'))
assert param == "filename*=utf-8''n%C3%A4me"
- @onlyPy2
- def test_render_unicode_bytes_py2(self):
+ def test_render_part_rfc2231_ascii(self):
+ field = RequestField('somename', 'data', header_formatter=format_header_param_rfc2231)
+ param = field._render_part('filename', b'name')
+ assert param == 'filename="name"'
+
+ def test_render_part_html5_unicode(self):
field = RequestField('somename', 'data')
- param = field._render_part('filename', 'n\xc3\xa4me')
- assert param == "filename*=utf-8''n%C3%A4me"
+ param = field._render_part('filename', u('n\u00e4me'))
+ assert param == u('filename="n\u00e4me"')
+
+ def test_render_part_html5_ascii(self):
+ field = RequestField('somename', 'data')
+ param = field._render_part('filename', b'name')
+ assert param == 'filename="name"'
+
+ def test_render_part_html5_unicode_escape(self):
+ field = RequestField('somename', 'data')
+ param = field._render_part('filename', u('hello\\world\u0022'))
+ assert param == u('filename="hello\\\\world%22"')
+
+ def test_render_part_html5_unicode_with_control_character(self):
+ field = RequestField('somename', 'data')
+ param = field._render_part('filename', u('hello\x1A\x1B\x1C'))
+ assert param == u('filename="hello%1A\x1B%1C"')
+
+ def test_from_tuples_rfc2231(self):
+ field = RequestField.from_tuples(
+ u('fieldname'),
+ (u('filen\u00e4me'), 'data'),
+ header_formatter=format_header_param_rfc2231)
+ cd = field.headers['Content-Disposition']
+ assert (cd == u("form-data; name=\"fieldname\"; filename*=utf-8''filen%C3%A4me"))
+
+ def test_from_tuples_html5(self):
+ field = RequestField.from_tuples(u('fieldname'), (u('filen\u00e4me'), 'data'))
+ cd = field.headers['Content-Disposition']
+ assert (cd == u('form-data; name="fieldname"; filename="filen\u00e4me"'))
diff --git a/tox.ini b/tox.ini
index 8764a349..f2e46a0b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -41,7 +41,7 @@ setenv =
passenv = TRAVIS TRAVIS_INFRA
[testenv:flake8-py3]
-basepython = python3.4
+basepython = python3
deps=
flake8
commands=