summaryrefslogtreecommitdiffstats
path: root/Lib/urllib/parse.py
diff options
context:
space:
mode:
authorMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>2021-05-05 17:25:29 (GMT)
committerGitHub <noreply@github.com>2021-05-05 17:25:29 (GMT)
commit515a7bc4e13645d0945b46a8e1d9102b918cd407 (patch)
treebca9cf860500a30cc5d42806342f9c4420c17ffe /Lib/urllib/parse.py
parent44f6b9aa49d562ab7c67952442b8348346b24141 (diff)
downloadcpython-515a7bc4e13645d0945b46a8e1d9102b918cd407.zip
cpython-515a7bc4e13645d0945b46a8e1d9102b918cd407.tar.gz
cpython-515a7bc4e13645d0945b46a8e1d9102b918cd407.tar.bz2
[3.8] bpo-43882 - urllib.parse should sanitize urls containing ASCII newline and tabs. (GH-25595) (#25726)
Co-authored-by: Gregory P. Smith <greg@krypto.org> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> (cherry picked from commit 76cd81d60310d65d01f9d7b48a8985d8ab89c8b4) Co-authored-by: Senthil Kumaran <senthil@uthcode.com> Co-authored-by: Senthil Kumaran <skumaran@gatech.edu>
Diffstat (limited to 'Lib/urllib/parse.py')
-rw-r--r--Lib/urllib/parse.py10
1 files changed, 10 insertions, 0 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 36fd8fe..f0d9d4d 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -77,6 +77,9 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
'0123456789'
'+-.')
+# Unsafe bytes to be removed per WHATWG spec
+_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
+
# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}
@@ -414,6 +417,11 @@ def _checknetloc(netloc):
raise ValueError("netloc '" + netloc + "' contains invalid " +
"characters under NFKC normalization")
+def _remove_unsafe_bytes_from_url(url):
+ for b in _UNSAFE_URL_BYTES_TO_REMOVE:
+ url = url.replace(b, "")
+ return url
+
def urlsplit(url, scheme='', allow_fragments=True):
"""Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -421,6 +429,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
Note that we don't break the components up in smaller bits
(e.g. netloc is a single string) and we don't expand % escapes."""
url, scheme, _coerce_result = _coerce_args(url, scheme)
+ url = _remove_unsafe_bytes_from_url(url)
+ scheme = _remove_unsafe_bytes_from_url(scheme)
allow_fragments = bool(allow_fragments)
key = url, scheme, allow_fragments, type(url), type(scheme)
cached = _parse_cache.get(key, None)