summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRatnadeep Debnath <rtnpro@gmail.com>2017-02-25 14:30:28 +0530
committerNick Coghlan <ncoghlan@gmail.com>2017-02-25 19:00:28 +1000
commit21024f06622c4c55b666adb130797a4ee205d005 (patch)
tree8b5f5381deb999d248430f3b2b8e351936e72fe8
parent140792bd514ee4ba739fda899785bea3ce746f05 (diff)
downloadcpython-git-21024f06622c4c55b666adb130797a4ee205d005.tar.gz
bpo-16285: Update urllib quoting to RFC 3986 (#173)
* bpo-16285: Update urllib quoting to RFC 3986 urllib.parse.quote is now based on RFC 3986, and hence includes `'~'` in the set of characters that is not escaped by default. Patch by Christian Theune and Ratnadeep Debnath.
-rw-r--r--Doc/library/urllib.parse.rst6
-rw-r--r--Doc/whatsnew/3.7.rst7
-rw-r--r--Lib/test/test_urllib.py4
-rw-r--r--Lib/urllib/parse.py9
-rw-r--r--Misc/ACKS4
-rw-r--r--Misc/NEWS4
6 files changed, 27 insertions, 7 deletions
diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
index 676321b46a..7a5b56f5da 100644
--- a/Doc/library/urllib.parse.rst
+++ b/Doc/library/urllib.parse.rst
@@ -451,13 +451,17 @@ task isn't already covered by the URL parsing functions above.
.. function:: quote(string, safe='/', encoding=None, errors=None)
Replace special characters in *string* using the ``%xx`` escape. Letters,
- digits, and the characters ``'_.-'`` are never quoted. By default, this
+ digits, and the characters ``'_.-~'`` are never quoted. By default, this
function is intended for quoting the path section of a URL. The optional *safe*
parameter specifies additional ASCII characters that should not be quoted
--- its default value is ``'/'``.
*string* may be either a :class:`str` or a :class:`bytes`.
+ .. versionchanged:: 3.7
+ Moved from RFC 2396 to RFC 3986 for quoting URL strings. "~" is now
+ included in the set of unreserved characters.
+
The optional *encoding* and *errors* parameters specify how to deal with
non-ASCII characters, as accepted by the :meth:`str.encode` method.
*encoding* defaults to ``'utf-8'``.
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst
index 861c53728e..35eea84e0a 100644
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@@ -103,6 +103,13 @@ The :const:`~unittest.mock.sentinel` attributes now preserve their identity
when they are :mod:`copied <copy>` or :mod:`pickled <pickle>`.
(Contributed by Serhiy Storchaka in :issue:`20804`.)
+urllib.parse
+------------
+
+:func:`urllib.parse.quote` has been updated from RFC 2396 to RFC 3986,
+adding ``~`` to the set of characters that is never quoted by default.
+(Contributed by Christian Theune and Ratnadeep Debnath in :issue:`16285`.)
+
Optimizations
=============
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index 5084486e5a..bffbb0a8d1 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -733,7 +733,7 @@ FF
class QuotingTests(unittest.TestCase):
r"""Tests for urllib.quote() and urllib.quote_plus()
- According to RFC 2396 (Uniform Resource Identifiers), to escape a
+ According to RFC 3986 (Uniform Resource Identifiers), to escape a
character you write it as '%' + <2 character US-ASCII hex value>.
The Python code of ``'%' + hex(ord(<character>))[2:]`` escapes a
character properly. Case does not matter on the hex letters.
@@ -761,7 +761,7 @@ class QuotingTests(unittest.TestCase):
do_not_quote = '' .join(["ABCDEFGHIJKLMNOPQRSTUVWXYZ",
"abcdefghijklmnopqrstuvwxyz",
"0123456789",
- "_.-"])
+ "_.-~"])
result = urllib.parse.quote(do_not_quote)
self.assertEqual(do_not_quote, result,
"using quote(): %r != %r" % (do_not_quote, result))
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 1d08730a89..f3a309aacc 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -704,7 +704,7 @@ def unquote_plus(string, encoding='utf-8', errors='replace'):
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
b'abcdefghijklmnopqrstuvwxyz'
b'0123456789'
- b'_.-')
+ b'_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}
@@ -736,15 +736,18 @@ def quote(string, safe='/', encoding=None, errors=None):
Each part of a URL, e.g. the path info, the query, etc., has a
different set of reserved characters that must be quoted.
- RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
+ RFC 3986 Uniform Resource Identifiers (URI): Generic Syntax lists
the following reserved characters.
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
- "$" | ","
+ "$" | "," | "~"
Each of these characters is reserved in some component of a URL,
but not necessarily in all of them.
+ Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
+ Now, "~" is included in the set of unreserved characters.
+
By default, the quote function is intended for quoting the path
section of a URL. Thus, it will not encode '/'. This character
is reserved, but in typical usage the quote function is being
diff --git a/Misc/ACKS b/Misc/ACKS
index e63a061098..255318e572 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -344,6 +344,7 @@ Kushal Das
Jonathan Dasteel
Pierre-Yves David
A. Jesse Jiryu Davis
+Ratnadeep Debnath
Merlijn van Deen
John DeGood
Ned Deily
@@ -1518,6 +1519,7 @@ Mikhail Terekhov
Victor Terrón
Richard M. Tew
Tobias Thelen
+Christian Theune
Févry Thibault
Lowe Thiderman
Nicolas M. Thiéry
@@ -1528,7 +1530,7 @@ Stephen Thorne
Jeremy Thurgood
Eric Tiedemann
July Tikhonov
-Tracy Tims
+
Oren Tirosh
Tim Tisdall
Jason Tishler
diff --git a/Misc/NEWS b/Misc/NEWS
index e7ab3df8d7..74ec8c3bdf 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -249,6 +249,10 @@ Extension Modules
Library
-------
+- Issue #16285: urllib.parse.quote is now based on RFC 3986 and hence includes
+ '~' in the set of characters that is not quoted by default. Patch by
+ Christian Theune and Ratnadeep Debnath.
+
- bpo-29532: Altering a kwarg dictionary passed to functools.partial()
no longer affects a partial object after creation.