[3.8] bpo-43882 - urllib.parse should sanitize urls containing ASCII newline and tabs. (GH-25595) (#25726)

Co-authored-by: Gregory P. Smith <greg@krypto.org>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
(cherry picked from commit 76cd81d60310d65d01f9d7b48a8985d8ab89c8b4)
Co-authored-by: Senthil Kumaran <senthil@uthcode.com>
Co-authored-by: Senthil Kumaran <skumaran@gatech.edu>
author: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> 2021-05-05 17:25:29 (GMT)
committer: GitHub <noreply@github.com> 2021-05-05 17:25:29 (GMT)
commit: 515a7bc4e13645d0945b46a8e1d9102b918cd407 (patch)
tree: bca9cf860500a30cc5d42806342f9c4420c17ffe /Lib/urllib/parse.py
parent: 44f6b9aa49d562ab7c67952442b8348346b24141 (diff)
download: cpython-515a7bc4e13645d0945b46a8e1d9102b918cd407.zip
cpython-515a7bc4e13645d0945b46a8e1d9102b918cd407.tar.gz
cpython-515a7bc4e13645d0945b46a8e1d9102b918cd407.tar.bz2
1 files changed, 10 insertions, 0 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 36fd8fe..f0d9d4d 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -77,6 +77,9 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '+-.')
 
+# Unsafe bytes to be removed per WHATWG spec
+_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
+
 # XXX: Consider replacing with functools.lru_cache
 MAX_CACHE_SIZE = 20
 _parse_cache = {}
@@ -414,6 +417,11 @@ def _checknetloc(netloc):
             raise ValueError("netloc '" + netloc + "' contains invalid " +
                              "characters under NFKC normalization")
 
+def _remove_unsafe_bytes_from_url(url):
+    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
+        url = url.replace(b, "")
+    return url
+
 def urlsplit(url, scheme='', allow_fragments=True):
     """Parse a URL into 5 components:
     <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -421,6 +429,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
     Note that we don't break the components up in smaller bits
     (e.g. netloc is a single string) and we don't expand % escapes."""
     url, scheme, _coerce_result = _coerce_args(url, scheme)
+    url = _remove_unsafe_bytes_from_url(url)
+    scheme = _remove_unsafe_bytes_from_url(scheme)
     allow_fragments = bool(allow_fragments)
     key = url, scheme, allow_fragments, type(url), type(scheme)
     cached = _parse_cache.get(key, None)
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>	2021-05-05 17:25:29 (GMT)
committer	GitHub <noreply@github.com>	2021-05-05 17:25:29 (GMT)
commit	515a7bc4e13645d0945b46a8e1d9102b918cd407 (patch)
tree	bca9cf860500a30cc5d42806342f9c4420c17ffe /Lib/urllib/parse.py
parent	44f6b9aa49d562ab7c67952442b8348346b24141 (diff)
download	cpython-515a7bc4e13645d0945b46a8e1d9102b918cd407.zip cpython-515a7bc4e13645d0945b46a8e1d9102b918cd407.tar.gz cpython-515a7bc4e13645d0945b46a8e1d9102b918cd407.tar.bz2