add tests, add docs describing all changes [feature.pathinfogeddon]

author: Chris McDonough <chrism@plope.com> 2012-01-08 16:24:58 -0500
committer: Chris McDonough <chrism@plope.com> 2012-01-08 16:24:58 -0500
commit: 2fd0945e7fac522e76ca180fe53067f46d728278 (patch)
tree: e6daaed503e66c2f33cb9edbf759d38ad1f9a2ef
parent: 63302b67d06d519c569f325528c98bae5bcbf9d5 (diff)
download: webob-feature.pathinfogeddon.tar.gz
6 files changed, 113 insertions, 13 deletions
diff --git a/docs/news.txt b/docs/news.txt
index caa27af..280038f 100644
--- a/docs/news.txt
+++ b/docs/news.txt
@@ -15,6 +15,75 @@ Next release
 
 * Removed (non-API) ``webob.descriptors.upath_property``.
 
+* ``request.path_info_pop`` and ``request.path_info_peek`` now return
+  bytestrings on Python 3.  Previously, they would return text.  Rationale:
+  the raw value of ``PATH_INFO`` on Python 3 is "WSGI-encoded", and doesn't
+  make sense to work against at all.  We normalize behavior on Python 2 and
+  Python 3 by explicitly working against and returning bytes on both
+  platforms.
+
+* ``Request.blank(<url_with_nonascii_url_encoded_value>)`` and
+  ``Request.blank(base_url=<url_with_nonascii_url_encoded_value>)`` now
+  produce a request environment with correct ``PATH_INFO`` and
+  ``SCRIPT_NAME`` environment variables.
+
+* ``Request.from_file`` now produces a request environment with correct
+  ``PATH_INFO`` and ``SCRIPT_NAME`` environment variables when the header
+  line of the request contains nonascii characters in the URI.
+
+* Response ``location`` header value mutation (converting relative paths to
+  absolute) when a response is generated will now create the correct URL when
+  SCRIPT_NAME or PATH_INFO exists in the environment.  Also fixed the same
+  code so it no longer fails when nonascii characters are in the location.
+
+* BaseRequest now accepts a ``url_encoding`` argument, which defaults to
+  'utf-8'.  It represents the presumed encoding of the SCRIPT_NAME and
+  PATH_INFO environment variables.
+
+* Four new descriptor APIs have been added to BaseRequest: ``pathinfo``,
+  ``pathinfo_bytes``, ``scriptname``, and ``scriptname_bytes``.  These
+  supersede the existing ``path_info`` and ``script_name`` descriptors and
+  should be used going forward instead. The ``path_info`` and ``script_name``
+  descriptors have been deprecated.
+
+  This was done to address Python 3-related PEP 3333 issues.  The older
+  ``path_info`` and ``script_name`` descriptors have historically operated
+  against the raw ``PATH_INFO`` and ``SCRIPT_NAME`` values in the WSGI
+  environment, treating them as if they were bytestring values.  However,
+  because PEP 3333 specifies that PATH_INFO and SCRIPT_NAME are *text* (raw
+  bytes decoded from Latin-1) on Python 3, operating on the raw environ
+  values as if they are bytes is not sane on that platform, and will not
+  work.
+
+  In the meantime, we can't just make ``path_info`` and ``script_name``
+  return decoded (Unicode) values on Python 2 without breaking existing
+  consumer code, and having these return bytestrings on Python 3 is not what
+  anyone expects.  It's generally much saner to operate against text (decoded)
+  values on both platforms, and we'd prefer to make this the default
+  name-wise going forward. So we've chosen to deprecate both ``path_info``
+  and ``script_name``.  They'll continue operating like they always have,
+  which is to say, they'll work fine on Python 2, and they'll return nonsense
+  on Python 3.  We'll leave both in place for a long time, but they'll emit a
+  warning when used.
+
+  ``pathinfo`` is a replacement for the older deprecated ``path_info``
+  descriptor; it returns the PATH_INFO as unicode/text (decoded using the
+  request's url_encoding).  It's a descriptor, so you can also set it using a
+  text value.  It will raise an exception if you try to set it using a
+  non-text value.  ``pathinfo_bytes`` is the bytes-oriented version, which
+  you can use to get and set PATH_INFO using a bytes value.
+
+  ``scriptname`` is a replacement for the older deprecated ``script_name``
+  descriptor; it returns the SCRIPT_NAME as unicode/text (decoded using the
+  request's url_encoding).  It's a descriptor, so you can also set it using a
+  text value.  It will raise an exception if you try to set it using a
+  non-text value.  ``scriptname_bytes`` is the bytes-oriented version, which
+  you can use to get and set SCRIPT_NAME using a bytes value.
+
+  The pre-existing ``upath_info`` descriptor is now aliased to the new
+  ``pathinfo`` descriptor.  The pre-existing ``uscript_name`` descriptor is
+  now aliased to the new ``scriptname`` descriptor.
+
 1.2b2
 ------
 
diff --git a/tests/test_request.py b/tests/test_request.py
index 016bd0e..462dc0f 100644
--- a/tests/test_request.py
+++ b/tests/test_request.py
@@ -797,7 +797,7 @@ class BaseRequestTests(unittest.TestCase):
 
     def test_path_info_pop_non_empty_w_pattern_miss(self):
         import re
-        PATTERN = re.compile('miss')
+        PATTERN = re.compile(b'miss')
         environ = {'wsgi.url_scheme': 'http',
                    'SERVER_NAME': 'example.com',
                    'SERVER_PORT': '80',
@@ -812,7 +812,7 @@ class BaseRequestTests(unittest.TestCase):
 
     def test_path_info_pop_non_empty_w_pattern_hit(self):
         import re
-        PATTERN = re.compile('path')
+        PATTERN = re.compile(b'path')
         environ = {'wsgi.url_scheme': 'http',
                    'SERVER_NAME': 'example.com',
                    'SERVER_PORT': '80',
diff --git a/tests/test_response.py b/tests/test_response.py
index 792d77d..8cef916 100644
--- a/tests/test_response.py
+++ b/tests/test_response.py
@@ -997,11 +997,30 @@ def test_decode_content_gzip():
 
 def test__abs_headerlist_location_with_scheme():
     res = Response()
-    res.content_encoding = 'gzip'
     res.headerlist = [('Location', 'http:')]
     result = res._abs_headerlist({})
     eq_(result, [('Location', 'http:')])
 
+def test__abs_headerlist_location_with_relative():
+    encoded_path_info = b'/\xe6\xb5\x81'
+    encoded_script_name = b'/\xe6\xb5\x82'
+    if PY3:
+        wsgiencoded_path_info = encoded_path_info.decode('latin-1')
+        wsgiencoded_script_name = encoded_script_name.decode('latin-1')
+    else:
+        wsgiencoded_path_info = encoded_path_info
+        wsgiencoded_script_name = encoded_script_name
+    environ = {
+        'wsgi.url_scheme': 'http',
+        'HTTP_HOST': 'test.com',
+        'SCRIPT_NAME': wsgiencoded_script_name,
+        'PATH_INFO': wsgiencoded_path_info,
+    }
+    res = Response()
+    res.headerlist = [('Location', 'foo')]
+    result = res._abs_headerlist(environ)
+    eq_(result, [('Location', 'http://test.com/%E6%B5%82/%E6%B5%81/foo')])
+
 def test_response_set_body_file1():
      data  = b'abc'
      file = io.BytesIO(data)
diff --git a/webob/request.py b/webob/request.py
index f76b754..50bb407 100644
--- a/webob/request.py
+++ b/webob/request.py
@@ -281,8 +281,16 @@ class BaseRequest(object):
         parse_int, serialize_int, 'int')
 
     # raw wsgi values (bytes on py2, bytes-tunneled-via-text on py3)
-    script_name = environ_getter('SCRIPT_NAME', '')
-    path_info = environ_getter('PATH_INFO')
+    script_name = deprecated_property(
+        environ_getter('SCRIPT_NAME', ''),
+        'script_name',
+        'deprecated in WebOb 1.2, use scriptname or scriptname_bytes instead',
+        '1.4')
+    path_info = deprecated_property(
+        environ_getter('PATH_INFO'),
+        'path_info',
+        'deprecated in WebOb 1.2, use pathinfo or pathinfo_bytes instead',
+        '1.4')
 
     if PY3: # pragma: no cover
         def _bytes_to_wsgi(self, val):
@@ -671,7 +679,8 @@ class BaseRequest(object):
 
         Optional ``pattern`` argument is a regexp to match the return value
         before returning. If there is no match, no changes are made to the
-        request and None is returned.
+        request and None is returned.  The pattern must always match against
+        a bytes object (not unicode).
         """
         path = self.pathinfo_bytes
         if not path:
@@ -684,7 +693,7 @@ class BaseRequest(object):
         if idx == -1:
             idx = len(path)
         r = path[:idx]
-        if pattern is None or re.match(pattern, r.decode(self.url_encoding)):
+        if pattern is None or re.match(pattern, r):
             self.scriptname_bytes += slashes + r
             self.pathinfo_bytes = path[idx:]
             return r
diff --git a/webob/response.py b/webob/response.py
index 83931f8..20b0e47 100644
--- a/webob/response.py
+++ b/webob/response.py
@@ -934,8 +934,10 @@ class Response(object):
             if name.lower() == 'location':
                 if SCHEME_RE.search(value):
                     break
-                new_location = urlparse.urljoin(
-                    _request_uri(environ), value)
+                uri = _request_uri(environ)
+                if not uri.endswith('/'):
+                    uri += '/'
+                new_location = urlparse.urljoin(uri, value)
                 headerlist = list(headerlist)
                 idx = headerlist.index((name, value))
                 headerlist[idx] = (name, new_location)
@@ -1152,16 +1154,17 @@ def _request_uri(environ):
 
     script_name = environ.get('SCRIPT_NAME') or '/'
     path_info = environ.get('PATH_INFO','')
+
     if PY3: # pragma: no cover
         script_name = script_name.encode('latin-1').decode('utf-8')
         path_info = path_info.encode('latin-1').decode('utf-8')
 
     url += url_quote(script_name)
     path_info = url_quote(path_info)
-    if not environ.get('SCRIPT_NAME'):
-        url += path_info[1:]
-    else:
+    if environ.get('SCRIPT_NAME'):
         url += path_info
+    else:
+        url += path_info[1:]
     return url
 
 
diff --git a/webob/util.py b/webob/util.py
index b740088..6b838b4 100644
--- a/webob/util.py
+++ b/webob/util.py
@@ -49,7 +49,7 @@ def warn_deprecation(text, version, stacklevel): # pragma: no cover
     # version specifies when to start raising exceptions instead of warnings
     if version == '1.2':
         raise DeprecationWarning(text)
-    elif version == '1.3':
+    elif version in ('1.3', '1.4'):
         cls = DeprecationWarning
     else:
         cls = DeprecationWarning
author	Chris McDonough <chrism@plope.com>	2012-01-08 16:24:58 -0500
committer	Chris McDonough <chrism@plope.com>	2012-01-08 16:24:58 -0500
commit	2fd0945e7fac522e76ca180fe53067f46d728278 (patch)
tree	e6daaed503e66c2f33cb9edbf759d38ad1f9a2ef
parent	63302b67d06d519c569f325528c98bae5bcbf9d5 (diff)
download	webob-feature.pathinfogeddon.tar.gz