diff options
| author | Tim Burke <tim.burke@gmail.com> | 2019-02-28 12:16:21 -0800 |
|---|---|---|
| committer | Jakub Stasiak <jakub@stasiak.at> | 2019-02-28 22:51:56 +0100 |
| commit | f0bc79e1d973806866e2eb74db82adfda64ab1e5 (patch) | |
| tree | 2c1dc765ba7acf617440a290b46e20c99aa21fe2 | |
| parent | a915bb642dd6cd4e92c959addff30509977a637c (diff) | |
| download | eventlet-f0bc79e1d973806866e2eb74db82adfda64ab1e5.tar.gz | |
wsgi: Stop replacing invalid UTF-8 on py3
For more context, see #467 and #497.
On py3, urllib.parse.unquote() defaults to decoding via UTF-8 and
replacing invalid UTF-8 sequences with "\N{REPLACEMENT CHARACTER}".
This causes a few problems:
- Since WSGI requires that bytes be decoded as Latin-1 on py3, we
  have to do an extra re-encode/decode cycle in encode_dance().
- Applications written for Latin-1 are broken, as there are valid
  Latin-1 sequences that are mangled because of the replacement.
- Applications written for UTF-8 cannot differentiate between a
  replacement character that was intentionally sent by the client
  and an invalid byte sequence.
Fortunately, unquote() allows us to specify the encoding that should
be used. By specifying Latin-1, we can drop encode_dance() entirely
and preserve as much information from the wire as we can.
| -rw-r--r-- | eventlet/wsgi.py | 13 | ||||
| -rw-r--r-- | tests/wsgi_test.py | 23 |
2 files changed, 19 insertions, 17 deletions
diff --git a/eventlet/wsgi.py b/eventlet/wsgi.py index 84bb43c..086c6ea 100644 --- a/eventlet/wsgi.py +++ b/eventlet/wsgi.py @@ -59,14 +59,6 @@ def addr_to_host_port(addr): return (host, port) -def encode_dance(s): - if not isinstance(s, bytes): - s = s.encode('utf-8', 'replace') - if six.PY2: - return s - return s.decode('latin1') - - # Collections of error codes to compare against. Not all attributes are set # on errno module on all platforms, so some are literals :( BAD_SOCK = set((errno.EBADF, 10053)) @@ -646,7 +638,10 @@ class HttpProtocol(BaseHTTPServer.BaseHTTPRequestHandler): pq = self.path.split('?', 1) env['RAW_PATH_INFO'] = pq[0] - env['PATH_INFO'] = encode_dance(urllib.parse.unquote(pq[0])) + if six.PY2: + env['PATH_INFO'] = urllib.parse.unquote(pq[0]) + else: + env['PATH_INFO'] = urllib.parse.unquote(pq[0], encoding='latin1') if len(pq) > 1: env['QUERY_STRING'] = pq[1] diff --git a/tests/wsgi_test.py b/tests/wsgi_test.py index 901035b..7f3856c 100644 --- a/tests/wsgi_test.py +++ b/tests/wsgi_test.py @@ -1443,21 +1443,28 @@ class TestHttpd(_TestBase): self.site.application = wsgi_app sock = eventlet.connect(self.server_addr) - sock.sendall(b'GET /%E4%BD%A0%E5%A5%BD HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n') + # This is a properly-quoted request for the UTF-8 path /你好 + sock.sendall(b'GET /%E4%BD%A0%E5%A5%BD HTTP/1.1\r\nHost: localhost\r\n\r\n') result = read_http(sock) assert result.status == 'HTTP/1.1 200 OK' - # that was only preparation, actual test below + # Like above, but the octets are reversed before being quoted, + # so the result should *not* be interpreted as UTF-8 + sock.sendall(b'GET /%BD%A5%E5%A0%BD%E4 HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n') + result = read_http(sock) + assert result.status == 'HTTP/1.1 200 OK' + + # that was only preparation, actual tests below # Per PEP-0333 https://www.python.org/dev/peps/pep-0333/#unicode-issues # in all WSGI environment strings application must observe either bytes 
in latin-1 (ISO-8859-1) # or unicode code points \u0000..\u00ff - # wsgi_decoding_dance from Werkzeug to emulate concerned application msg = 'Expected PATH_INFO to be a native string, not {0}'.format(type(g[0])) assert isinstance(g[0], str), msg - if six.PY2: - assert g[0] == u'/你好'.encode('utf-8') - else: - decoded = g[0].encode('latin1').decode('utf-8', 'replace') - assert decoded == u'/你好' + # Fortunately, WSGI strings have the same literal representation on both py2 and py3 + assert g[0] == '/\xe4\xbd\xa0\xe5\xa5\xbd' + + msg = 'Expected PATH_INFO to be a native string, not {0}'.format(type(g[1])) + assert isinstance(g[1], str), msg + assert g[1] == '/\xbd\xa5\xe5\xa0\xbd\xe4' @tests.skip_if_no_ipv6 def test_ipv6(self): |
