gh-76960: Fix urljoin() and urldefrag() for URIs with empty components (GH-123273)

* urljoin() with relative reference "?" sets empty query and removes fragment. * Preserve empty components (authority, params, query, fragment) in urljoin(). * Preserve empty components (authority, params, query) in urldefrag(). Also refactor the code and get rid of double _coerce_args() and _coerce_result() calls in urljoin(), urldefrag(), urlparse() and urlunparse().
author: Serhiy Storchaka <storchaka@gmail.com> 2024-08-31 09:42:08 (GMT)
committer: GitHub <noreply@github.com> 2024-08-31 09:42:08 (GMT)
commit: fc897fcc01964649f023e0baa4c95d142e4e8a10 (patch)
tree: 225e7086273594100c09c34c09d03f8291911024 /Lib/urllib
parent: e5a567b0a721c26c79530249d9aa159afbd11955 (diff)
download: cpython-fc897fcc01964649f023e0baa4c95d142e4e8a10.zip
cpython-fc897fcc01964649f023e0baa4c95d142e4e8a10.tar.gz
cpython-fc897fcc01964649f023e0baa4c95d142e4e8a10.tar.bz2
1 files changed, 62 insertions, 38 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 3316530..5b00ab2 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -392,20 +392,23 @@ def urlparse(url, scheme='', allow_fragments=True):
     Note that % escapes are not expanded.
     """
     url, scheme, _coerce_result = _coerce_args(url, scheme)
-    splitresult = urlsplit(url, scheme, allow_fragments)
-    scheme, netloc, url, query, fragment = splitresult
-    if scheme in uses_params and ';' in url:
-        url, params = _splitparams(url)
-    else:
-        params = ''
-    result = ParseResult(scheme, netloc, url, params, query, fragment)
+    scheme, netloc, url, params, query, fragment = _urlparse(url, scheme, allow_fragments)
+    result = ParseResult(scheme or '', netloc or '', url, params or '', query or '', fragment or '')
     return _coerce_result(result)
 
-def _splitparams(url):
+def _urlparse(url, scheme=None, allow_fragments=True):
+    scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
+    if (scheme or '') in uses_params and ';' in url:
+        url, params = _splitparams(url, allow_none=True)
+    else:
+        params = None
+    return (scheme, netloc, url, params, query, fragment)
+
+def _splitparams(url, allow_none=False):
     if '/'  in url:
         i = url.find(';', url.rfind('/'))
         if i < 0:
-            return url, ''
+            return url, None if allow_none else ''
     else:
         i = url.find(';')
     return url[:i], url[i+1:]
@@ -472,17 +475,23 @@ def urlsplit(url, scheme='', allow_fragments=True):
     """
 
     url, scheme, _coerce_result = _coerce_args(url, scheme)
+    scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
+    v = SplitResult(scheme or '', netloc or '', url, query or '', fragment or '')
+    return _coerce_result(v)
+
+def _urlsplit(url, scheme=None, allow_fragments=True):
     # Only lstrip url as some applications rely on preserving trailing space.
     # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
     url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
-    scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
-
     for b in _UNSAFE_URL_BYTES_TO_REMOVE:
         url = url.replace(b, "")
-        scheme = scheme.replace(b, "")
+    if scheme is not None:
+        scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
+        for b in _UNSAFE_URL_BYTES_TO_REMOVE:
+            scheme = scheme.replace(b, "")
 
     allow_fragments = bool(allow_fragments)
-    netloc = query = fragment = ''
+    netloc = query = fragment = None
     i = url.find(':')
     if i > 0 and url[0].isascii() and url[0].isalpha():
         for c in url[:i]:
@@ -503,8 +512,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
     if '?' in url:
         url, query = url.split('?', 1)
     _checknetloc(netloc)
-    v = SplitResult(scheme, netloc, url, query, fragment)
-    return _coerce_result(v)
+    return (scheme, netloc, url, query, fragment)
 
 def urlunparse(components):
     """Put a parsed URL back together again.  This may result in a
@@ -513,9 +521,15 @@ def urlunparse(components):
     (the draft states that these are equivalent)."""
     scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                   _coerce_args(*components))
+    if not netloc:
+        if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
+            netloc = ''
+        else:
+            netloc = None
     if params:
         url = "%s;%s" % (url, params)
-    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
+    return _coerce_result(_urlunsplit(scheme or None, netloc, url,
+                                      query or None, fragment or None))
 
 def urlunsplit(components):
     """Combine the elements of a tuple as returned by urlsplit() into a
@@ -525,20 +539,27 @@ def urlunsplit(components):
     empty query; the RFC states that these are equivalent)."""
     scheme, netloc, url, query, fragment, _coerce_result = (
                                           _coerce_args(*components))
-    if netloc:
+    if not netloc:
+        if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
+            netloc = ''
+        else:
+            netloc = None
+    return _coerce_result(_urlunsplit(scheme or None, netloc, url,
+                                      query or None, fragment or None))
+
+def _urlunsplit(scheme, netloc, url, query, fragment):
+    if netloc is not None:
         if url and url[:1] != '/': url = '/' + url
         url = '//' + netloc + url
     elif url[:2] == '//':
         url = '//' + url
-    elif scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
-        url = '//' + url
     if scheme:
         url = scheme + ':' + url
-    if query:
+    if query is not None:
         url = url + '?' + query
-    if fragment:
+    if fragment is not None:
         url = url + '#' + fragment
-    return _coerce_result(url)
+    return url
 
 def urljoin(base, url, allow_fragments=True):
     """Join a base URL and a possibly relative URL to form an absolute
@@ -549,26 +570,29 @@ def urljoin(base, url, allow_fragments=True):
         return base
 
     base, url, _coerce_result = _coerce_args(base, url)
-    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
-            urlparse(base, '', allow_fragments)
-    scheme, netloc, path, params, query, fragment = \
-            urlparse(url, bscheme, allow_fragments)
+    bscheme, bnetloc, bpath, bquery, bfragment = \
+            _urlsplit(base, None, allow_fragments)
+    scheme, netloc, path, query, fragment = \
+            _urlsplit(url, None, allow_fragments)
 
+    if scheme is None:
+        scheme = bscheme
     if scheme != bscheme or scheme not in uses_relative:
         return _coerce_result(url)
     if scheme in uses_netloc:
         if netloc:
-            return _coerce_result(urlunparse((scheme, netloc, path,
-                                              params, query, fragment)))
+            return _coerce_result(_urlunsplit(scheme, netloc, path,
+                                              query, fragment))
         netloc = bnetloc
 
-    if not path and not params:
+    if not path:
         path = bpath
-        params = bparams
-        if not query:
+        if query is None:
             query = bquery
-        return _coerce_result(urlunparse((scheme, netloc, path,
-                                          params, query, fragment)))
+            if fragment is None:
+                fragment = bfragment
+        return _coerce_result(_urlunsplit(scheme, netloc, path,
+                                          query, fragment))
 
     base_parts = bpath.split('/')
     if base_parts[-1] != '':
@@ -605,8 +629,8 @@ def urljoin(base, url, allow_fragments=True):
         # then we need to append the trailing '/'
         resolved_path.append('')
 
-    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
-        resolved_path) or '/', params, query, fragment)))
+    return _coerce_result(_urlunsplit(scheme, netloc, '/'.join(
+        resolved_path) or '/', query, fragment))
 
 
 def urldefrag(url):
@@ -618,12 +642,12 @@ def urldefrag(url):
     """
     url, _coerce_result = _coerce_args(url)
     if '#' in url:
-        s, n, p, a, q, frag = urlparse(url)
-        defrag = urlunparse((s, n, p, a, q, ''))
+        s, n, p, q, frag = _urlsplit(url)
+        defrag = _urlunsplit(s, n, p, q, None)
     else:
         frag = ''
         defrag = url
-    return _coerce_result(DefragResult(defrag, frag))
+    return _coerce_result(DefragResult(defrag, frag or ''))
 
 _hexdig = '0123456789ABCDEFabcdef'
 _hextobyte = None
author	Serhiy Storchaka <storchaka@gmail.com>	2024-08-31 09:42:08 (GMT)
committer	GitHub <noreply@github.com>	2024-08-31 09:42:08 (GMT)
commit	fc897fcc01964649f023e0baa4c95d142e4e8a10 (patch)
tree	225e7086273594100c09c34c09d03f8291911024 /Lib/urllib
parent	e5a567b0a721c26c79530249d9aa159afbd11955 (diff)
download	cpython-fc897fcc01964649f023e0baa4c95d142e4e8a10.zip cpython-fc897fcc01964649f023e0baa4c95d142e4e8a10.tar.gz cpython-fc897fcc01964649f023e0baa4c95d142e4e8a10.tar.bz2