summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Tim Burke <tim.burke@gmail.com> 2019-02-28 12:16:21 -0800
committer: Jakub Stasiak <jakub@stasiak.at> 2019-02-28 22:51:56 +0100
commit: f0bc79e1d973806866e2eb74db82adfda64ab1e5 (patch)
tree: 2c1dc765ba7acf617440a290b46e20c99aa21fe2
parent: a915bb642dd6cd4e92c959addff30509977a637c (diff)
download: eventlet-f0bc79e1d973806866e2eb74db82adfda64ab1e5.tar.gz
wsgi: Stop replacing invalid UTF-8 on py3
For more context, see #467 and #497.

On py3, urllib.parse.unquote() defaults to decoding via UTF-8 and replacing invalid UTF-8 sequences with "\N{REPLACEMENT CHARACTER}". This causes a few problems:

- Since WSGI requires that bytes be decoded as Latin-1 on py3, we have to do an extra re-encode/decode cycle in encode_dance().
- Applications written for Latin-1 are broken, as there are valid Latin-1 sequences that are mangled because of the replacement.
- Applications written for UTF-8 cannot differentiate between a replacement character that was intentionally sent by the client versus an invalid byte sequence.

Fortunately, unquote() allows us to specify the encoding that should be used. By specifying Latin-1, we can drop encode_dance() entirely and preserve as much information from the wire as we can.
-rw-r--r-- eventlet/wsgi.py 13
-rw-r--r-- tests/wsgi_test.py 23
2 files changed, 19 insertions, 17 deletions
diff --git a/eventlet/wsgi.py b/eventlet/wsgi.py
index 84bb43c..086c6ea 100644
--- a/eventlet/wsgi.py
+++ b/eventlet/wsgi.py
@@ -59,14 +59,6 @@ def addr_to_host_port(addr):
return (host, port)
-def encode_dance(s):
- if not isinstance(s, bytes):
- s = s.encode('utf-8', 'replace')
- if six.PY2:
- return s
- return s.decode('latin1')
-
-
# Collections of error codes to compare against. Not all attributes are set
# on errno module on all platforms, so some are literals :(
BAD_SOCK = set((errno.EBADF, 10053))
@@ -646,7 +638,10 @@ class HttpProtocol(BaseHTTPServer.BaseHTTPRequestHandler):
pq = self.path.split('?', 1)
env['RAW_PATH_INFO'] = pq[0]
- env['PATH_INFO'] = encode_dance(urllib.parse.unquote(pq[0]))
+ if six.PY2:
+ env['PATH_INFO'] = urllib.parse.unquote(pq[0])
+ else:
+ env['PATH_INFO'] = urllib.parse.unquote(pq[0], encoding='latin1')
if len(pq) > 1:
env['QUERY_STRING'] = pq[1]
diff --git a/tests/wsgi_test.py b/tests/wsgi_test.py
index 901035b..7f3856c 100644
--- a/tests/wsgi_test.py
+++ b/tests/wsgi_test.py
@@ -1443,21 +1443,28 @@ class TestHttpd(_TestBase):
self.site.application = wsgi_app
sock = eventlet.connect(self.server_addr)
- sock.sendall(b'GET /%E4%BD%A0%E5%A5%BD HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n')
+ # This is a properly-quoted request for the UTF-8 path /你好
+ sock.sendall(b'GET /%E4%BD%A0%E5%A5%BD HTTP/1.1\r\nHost: localhost\r\n\r\n')
result = read_http(sock)
assert result.status == 'HTTP/1.1 200 OK'
- # that was only preparation, actual test below
+ # Like above, but the octets are reversed before being quoted,
+ # so the result should *not* be interpreted as UTF-8
+ sock.sendall(b'GET /%BD%A5%E5%A0%BD%E4 HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n')
+ result = read_http(sock)
+ assert result.status == 'HTTP/1.1 200 OK'
+
+ # that was only preparation, actual tests below
# Per PEP-0333 https://www.python.org/dev/peps/pep-0333/#unicode-issues
# in all WSGI environment strings application must observe either bytes in latin-1 (ISO-8859-1)
# or unicode code points \u0000..\u00ff
- # wsgi_decoding_dance from Werkzeug to emulate concerned application
msg = 'Expected PATH_INFO to be a native string, not {0}'.format(type(g[0]))
assert isinstance(g[0], str), msg
- if six.PY2:
- assert g[0] == u'/你好'.encode('utf-8')
- else:
- decoded = g[0].encode('latin1').decode('utf-8', 'replace')
- assert decoded == u'/你好'
+ # Fortunately, WSGI strings have the same literal representation on both py2 and py3
+ assert g[0] == '/\xe4\xbd\xa0\xe5\xa5\xbd'
+
+ msg = 'Expected PATH_INFO to be a native string, not {0}'.format(type(g[1]))
+ assert isinstance(g[1], str), msg
+ assert g[1] == '/\xbd\xa5\xe5\xa0\xbd\xe4'
@tests.skip_if_no_ipv6
def test_ipv6(self):