diff options
author | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2023-05-17 16:06:06 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-17 16:06:06 -0700 |
commit | f48a96a28012d28ae37a2f4587a780a5eb779946 (patch) | |
tree | c9d5e9271a27e75b4f394ba441da49a8df4bd176 /Lib/urllib/parse.py | |
parent | 425065bb002b9cbf9c12f61a6f3102f2ce2b8d14 (diff) | |
download | cpython-git-3.10.tar.gz |
[3.10] [3.11] gh-102153: Start stripping C0 control and space chars in `urlsplit` (GH-102508) (GH-104575) (#104592)3.10
gh-102153: Start stripping C0 control and space chars in `urlsplit` (GH-102508)
`urllib.parse.urlsplit` has already been partially respecting the WHATWG spec (see GH-25595).
This adds more sanitizing to respect the "Remove any leading C0 control or space from input" [rule](https://url.spec.whatwg.org/#url-parsing:~:text=Remove%20any%20leading%20and%20trailing%20C0%20control%20or%20space%20from%20input.) in response to [CVE-2023-24329](https://nvd.nist.gov/vuln/detail/CVE-2023-24329).
I simplified the docs by eliding the state-of-the-world explanatory
paragraph in this security-release-only backport (people will see
that paragraph in the mainline /3/ docs).
---------
(cherry picked from commit 2f630e1ce18ad2e07428296532a68b11dc66ad10)
(cherry picked from commit 610cc0ab1b760b2abaac92bd256b96191c46b941)
Co-authored-by: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Co-authored-by: Illia Volochii <illia.volochii@gmail.com>
Co-authored-by: Gregory P. Smith [Google] <greg@krypto.org>
Diffstat (limited to 'Lib/urllib/parse.py')
-rw-r--r-- | Lib/urllib/parse.py | 12 |
1 files changed, 12 insertions, 0 deletions
```diff
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 26ddf30748..0ab2023843 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -25,6 +25,10 @@ currently not entirely compliant with this RFC due to defacto
 scenarios for parsing, and for backward compatibility purposes, some parsing
 quirks from older RFCs are retained. The testcases in
 test_urlparse.py provides a good indicator of parsing behavior.
+
+The WHATWG URL Parser spec should also be considered. We are not compliant with
+it either due to existing user code API behavior expectations (Hyrum's Law).
+It serves as a useful guide when making changes.
 """

 import re
@@ -78,6 +82,10 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '+-.')

+# Leading and trailing C0 control and space to be stripped per WHATWG spec.
+# == "".join([chr(i) for i in range(0, 0x20 + 1)])
+_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '
+
 # Unsafe bytes to be removed per WHATWG spec
 _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

@@ -455,6 +463,10 @@ def urlsplit(url, scheme='', allow_fragments=True):
     """

     url, scheme, _coerce_result = _coerce_args(url, scheme)
+    # Only lstrip url as some applications rely on preserving trailing space.
+    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
+    url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
+    scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)

     for b in _UNSAFE_URL_BYTES_TO_REMOVE:
         url = url.replace(b, "")
```