diff options
Diffstat (limited to 'Lib/urllib/parse.py')
-rw-r--r-- | Lib/urllib/parse.py | 12 |
1 files changed, 12 insertions, 0 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index da00739..b73b344 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -25,6 +25,10 @@ currently not entirely compliant with this RFC due to defacto scenarios for parsing, and for backward compatibility purposes, some parsing quirks from older RFCs are retained. The testcases in test_urlparse.py provides a good indicator of parsing behavior. + +The WHATWG URL Parser spec should also be considered. We are not compliant with +it either due to existing user code API behavior expectations (Hyrum's Law). +It serves as a useful guide when making changes. """ from collections import namedtuple @@ -80,6 +84,10 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz' '0123456789' '+-.') +# Leading and trailing C0 control and space to be stripped per WHATWG spec. +# == "".join([chr(i) for i in range(0, 0x20 + 1)]) +_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ' + # Unsafe bytes to be removed per WHATWG spec _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] @@ -464,6 +472,10 @@ def urlsplit(url, scheme='', allow_fragments=True): """ url, scheme, _coerce_result = _coerce_args(url, scheme) + # Only lstrip url as some applications rely on preserving trailing space. + # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both) + url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE) + scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) for b in _UNSAFE_URL_BYTES_TO_REMOVE: url = url.replace(b, "") |