summaryrefslogtreecommitdiffstats
path: root/Lib/urllib
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2024-08-31 09:42:08 (GMT)
committerGitHub <noreply@github.com>2024-08-31 09:42:08 (GMT)
commitfc897fcc01964649f023e0baa4c95d142e4e8a10 (patch)
tree225e7086273594100c09c34c09d03f8291911024 /Lib/urllib
parente5a567b0a721c26c79530249d9aa159afbd11955 (diff)
downloadcpython-fc897fcc01964649f023e0baa4c95d142e4e8a10.zip
cpython-fc897fcc01964649f023e0baa4c95d142e4e8a10.tar.gz
cpython-fc897fcc01964649f023e0baa4c95d142e4e8a10.tar.bz2
gh-76960: Fix urljoin() and urldefrag() for URIs with empty components (GH-123273)
* urljoin() with relative reference "?" sets empty query and removes fragment. * Preserve empty components (authority, params, query, fragment) in urljoin(). * Preserve empty components (authority, params, query) in urldefrag(). Also refactor the code and get rid of double _coerce_args() and _coerce_result() calls in urljoin(), urldefrag(), urlparse() and urlunparse().
Diffstat (limited to 'Lib/urllib')
-rw-r--r--Lib/urllib/parse.py100
1 files changed, 62 insertions, 38 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 3316530..5b00ab2 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -392,20 +392,23 @@ def urlparse(url, scheme='', allow_fragments=True):
Note that % escapes are not expanded.
"""
url, scheme, _coerce_result = _coerce_args(url, scheme)
- splitresult = urlsplit(url, scheme, allow_fragments)
- scheme, netloc, url, query, fragment = splitresult
- if scheme in uses_params and ';' in url:
- url, params = _splitparams(url)
- else:
- params = ''
- result = ParseResult(scheme, netloc, url, params, query, fragment)
+ scheme, netloc, url, params, query, fragment = _urlparse(url, scheme, allow_fragments)
+ result = ParseResult(scheme or '', netloc or '', url, params or '', query or '', fragment or '')
return _coerce_result(result)
-def _splitparams(url):
+def _urlparse(url, scheme=None, allow_fragments=True):
+ scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
+ if (scheme or '') in uses_params and ';' in url:
+ url, params = _splitparams(url, allow_none=True)
+ else:
+ params = None
+ return (scheme, netloc, url, params, query, fragment)
+
+def _splitparams(url, allow_none=False):
if '/' in url:
i = url.find(';', url.rfind('/'))
if i < 0:
- return url, ''
+ return url, None if allow_none else ''
else:
i = url.find(';')
return url[:i], url[i+1:]
@@ -472,17 +475,23 @@ def urlsplit(url, scheme='', allow_fragments=True):
"""
url, scheme, _coerce_result = _coerce_args(url, scheme)
+ scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
+ v = SplitResult(scheme or '', netloc or '', url, query or '', fragment or '')
+ return _coerce_result(v)
+
+def _urlsplit(url, scheme=None, allow_fragments=True):
# Only lstrip url as some applications rely on preserving trailing space.
# (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
- scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
-
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
url = url.replace(b, "")
- scheme = scheme.replace(b, "")
+ if scheme is not None:
+ scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
+ for b in _UNSAFE_URL_BYTES_TO_REMOVE:
+ scheme = scheme.replace(b, "")
allow_fragments = bool(allow_fragments)
- netloc = query = fragment = ''
+ netloc = query = fragment = None
i = url.find(':')
if i > 0 and url[0].isascii() and url[0].isalpha():
for c in url[:i]:
@@ -503,8 +512,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
if '?' in url:
url, query = url.split('?', 1)
_checknetloc(netloc)
- v = SplitResult(scheme, netloc, url, query, fragment)
- return _coerce_result(v)
+ return (scheme, netloc, url, query, fragment)
def urlunparse(components):
"""Put a parsed URL back together again. This may result in a
@@ -513,9 +521,15 @@ def urlunparse(components):
(the draft states that these are equivalent)."""
scheme, netloc, url, params, query, fragment, _coerce_result = (
_coerce_args(*components))
+ if not netloc:
+ if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
+ netloc = ''
+ else:
+ netloc = None
if params:
url = "%s;%s" % (url, params)
- return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
+ return _coerce_result(_urlunsplit(scheme or None, netloc, url,
+ query or None, fragment or None))
def urlunsplit(components):
"""Combine the elements of a tuple as returned by urlsplit() into a
@@ -525,20 +539,27 @@ def urlunsplit(components):
empty query; the RFC states that these are equivalent)."""
scheme, netloc, url, query, fragment, _coerce_result = (
_coerce_args(*components))
- if netloc:
+ if not netloc:
+ if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
+ netloc = ''
+ else:
+ netloc = None
+ return _coerce_result(_urlunsplit(scheme or None, netloc, url,
+ query or None, fragment or None))
+
+def _urlunsplit(scheme, netloc, url, query, fragment):
+ if netloc is not None:
if url and url[:1] != '/': url = '/' + url
url = '//' + netloc + url
elif url[:2] == '//':
url = '//' + url
- elif scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
- url = '//' + url
if scheme:
url = scheme + ':' + url
- if query:
+ if query is not None:
url = url + '?' + query
- if fragment:
+ if fragment is not None:
url = url + '#' + fragment
- return _coerce_result(url)
+ return url
def urljoin(base, url, allow_fragments=True):
"""Join a base URL and a possibly relative URL to form an absolute
@@ -549,26 +570,29 @@ def urljoin(base, url, allow_fragments=True):
return base
base, url, _coerce_result = _coerce_args(base, url)
- bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
- urlparse(base, '', allow_fragments)
- scheme, netloc, path, params, query, fragment = \
- urlparse(url, bscheme, allow_fragments)
+ bscheme, bnetloc, bpath, bquery, bfragment = \
+ _urlsplit(base, None, allow_fragments)
+ scheme, netloc, path, query, fragment = \
+ _urlsplit(url, None, allow_fragments)
+ if scheme is None:
+ scheme = bscheme
if scheme != bscheme or scheme not in uses_relative:
return _coerce_result(url)
if scheme in uses_netloc:
if netloc:
- return _coerce_result(urlunparse((scheme, netloc, path,
- params, query, fragment)))
+ return _coerce_result(_urlunsplit(scheme, netloc, path,
+ query, fragment))
netloc = bnetloc
- if not path and not params:
+ if not path:
path = bpath
- params = bparams
- if not query:
+ if query is None:
query = bquery
- return _coerce_result(urlunparse((scheme, netloc, path,
- params, query, fragment)))
+ if fragment is None:
+ fragment = bfragment
+ return _coerce_result(_urlunsplit(scheme, netloc, path,
+ query, fragment))
base_parts = bpath.split('/')
if base_parts[-1] != '':
@@ -605,8 +629,8 @@ def urljoin(base, url, allow_fragments=True):
# then we need to append the trailing '/'
resolved_path.append('')
- return _coerce_result(urlunparse((scheme, netloc, '/'.join(
- resolved_path) or '/', params, query, fragment)))
+ return _coerce_result(_urlunsplit(scheme, netloc, '/'.join(
+ resolved_path) or '/', query, fragment))
def urldefrag(url):
@@ -618,12 +642,12 @@ def urldefrag(url):
"""
url, _coerce_result = _coerce_args(url)
if '#' in url:
- s, n, p, a, q, frag = urlparse(url)
- defrag = urlunparse((s, n, p, a, q, ''))
+ s, n, p, q, frag = _urlsplit(url)
+ defrag = _urlunsplit(s, n, p, q, None)
else:
frag = ''
defrag = url
- return _coerce_result(DefragResult(defrag, frag))
+ return _coerce_result(DefragResult(defrag, frag or ''))
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None