summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Lib/test/test_urllib.py2
-rw-r--r--Lib/urllib/parse.py30
-rw-r--r--Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst2
3 files changed, 23 insertions, 11 deletions
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index f067560ca6..2df74f5e6f 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -1104,6 +1104,8 @@ class UnquotingTests(unittest.TestCase):
self.assertEqual(result.count('%'), 1,
"using unquote(): not all characters escaped: "
"%s" % result)
+
+ def test_unquote_rejects_none_and_tuple(self):
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, None)
self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, ())
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 4f6867accb..5f95c5ff7f 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -600,6 +600,9 @@ _hextobyte = None
def unquote_to_bytes(string):
"""unquote_to_bytes('abc%20def') -> b'abc def'."""
+ return bytes(_unquote_impl(string))
+
+def _unquote_impl(string: bytes | bytearray | str) -> bytes | bytearray:
# Note: strings are encoded as UTF-8. This is only an issue if it contains
# unescaped non-ASCII characters, which URIs should not.
if not string:
@@ -611,8 +614,8 @@ def unquote_to_bytes(string):
bits = string.split(b'%')
if len(bits) == 1:
return string
- res = [bits[0]]
- append = res.append
+ res = bytearray(bits[0])
+ append = res.extend
# Delay the initialization of the table to not waste memory
# if the function is never called
global _hextobyte
@@ -626,10 +629,20 @@ def unquote_to_bytes(string):
except KeyError:
append(b'%')
append(item)
- return b''.join(res)
+ return res
_asciire = re.compile('([\x00-\x7f]+)')
+def _generate_unquoted_parts(string, encoding, errors):
+ previous_match_end = 0
+ for ascii_match in _asciire.finditer(string):
+ start, end = ascii_match.span()
+ yield string[previous_match_end:start] # Non-ASCII
+ # The ascii_match[1] group == string[start:end].
+ yield _unquote_impl(ascii_match[1]).decode(encoding, errors)
+ previous_match_end = end
+ yield string[previous_match_end:] # Non-ASCII tail
+
def unquote(string, encoding='utf-8', errors='replace'):
"""Replace %xx escapes by their single-character equivalent. The optional
encoding and errors parameters specify how to decode percent-encoded
@@ -641,21 +654,16 @@ def unquote(string, encoding='utf-8', errors='replace'):
unquote('abc%20def') -> 'abc def'.
"""
if isinstance(string, bytes):
- return unquote_to_bytes(string).decode(encoding, errors)
+ return _unquote_impl(string).decode(encoding, errors)
if '%' not in string:
+ # Is it a string-like object?
string.split
return string
if encoding is None:
encoding = 'utf-8'
if errors is None:
errors = 'replace'
- bits = _asciire.split(string)
- res = [bits[0]]
- append = res.append
- for i in range(1, len(bits), 2):
- append(unquote_to_bytes(bits[i]).decode(encoding, errors))
- append(bits[i + 1])
- return ''.join(res)
+ return ''.join(_generate_unquoted_parts(string, encoding, errors))
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
diff --git a/Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst b/Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst
new file mode 100644
index 0000000000..ad01f5e16b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst
@@ -0,0 +1,2 @@
+Reduced the memory usage of :func:`urllib.parse.unquote` and
+:func:`urllib.parse.unquote_to_bytes` on large values.