diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2014-08-21 23:16:17 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2014-08-21 23:16:17 (GMT) |
commit | 55ac5b3f7b67bf7c721f00bddb8456f7886c449f (patch) | |
tree | 29a015336e9dd1e89aa15014b3ac1615f2ccc723 /Lib/urllib | |
parent | a7eb74627826e389dd78bd5a7a3b0878f4e3dde6 (diff) | |
download | cpython-55ac5b3f7b67bf7c721f00bddb8456f7886c449f.zip cpython-55ac5b3f7b67bf7c721f00bddb8456f7886c449f.tar.gz cpython-55ac5b3f7b67bf7c721f00bddb8456f7886c449f.tar.bz2 |
Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the resolution of relative URLs, rather than RFCs 1808 and 2396.
Patch by Demian Brecht.
Diffstat (limited to 'Lib/urllib')
-rw-r--r-- | Lib/urllib/parse.py | 63 |
1 files changed, 38 insertions, 25 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index dfb947c..b6ac414 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -409,11 +409,13 @@ def urljoin(base, url, allow_fragments=True): return url if not url: return base + base, url, _coerce_result = _coerce_args(base, url) bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ urlparse(base, '', allow_fragments) scheme, netloc, path, params, query, fragment = \ urlparse(url, bscheme, allow_fragments) + if scheme != bscheme or scheme not in uses_relative: return _coerce_result(url) if scheme in uses_netloc: @@ -421,9 +423,7 @@ def urljoin(base, url, allow_fragments=True): return _coerce_result(urlunparse((scheme, netloc, path, params, query, fragment))) netloc = bnetloc - if path[:1] == '/': - return _coerce_result(urlunparse((scheme, netloc, path, - params, query, fragment))) + if not path and not params: path = bpath params = bparams @@ -431,29 +431,42 @@ def urljoin(base, url, allow_fragments=True): query = bquery return _coerce_result(urlunparse((scheme, netloc, path, params, query, fragment))) - segments = bpath.split('/')[:-1] + path.split('/') - # XXX The stuff below is bogus in various ways... - if segments[-1] == '.': - segments[-1] = '' - while '.' in segments: - segments.remove('.') - while 1: - i = 1 - n = len(segments) - 1 - while i < n: - if (segments[i] == '..' - and segments[i-1] not in ('', '..')): - del segments[i-1:i+1] - break - i = i+1 + + base_parts = bpath.split('/') + if base_parts[-1] != '': + # the last item is not a directory, so will not be taken into account + # in resolving the relative path + del base_parts[-1] + + # for rfc3986, ignore all base path should the first character be root. + if path[:1] == '/': + segments = path.split('/') + else: + segments = base_parts + path.split('/') + + resolved_path = [] + + for seg in segments: + if seg == '..': + try: + resolved_path.pop() + except IndexError: + # ignore any .. segments that would otherwise cause an IndexError + # when popped from resolved_path if resolving for rfc3986 + pass + elif seg == '.': + continue else: - break - if segments == ['', '..']: - segments[-1] = '' - elif len(segments) >= 2 and segments[-1] == '..': - segments[-2:] = [''] - return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments), - params, query, fragment))) + resolved_path.append(seg) + + if segments[-1] in ('.', '..'): + # do some post-processing here. if the last segment was a relative dir, + # then we need to append the trailing '/' + resolved_path.append('') + + return _coerce_result(urlunparse((scheme, netloc, '/'.join( + resolved_path), params, query, fragment))) + def urldefrag(url): """Removes any existing fragment from URL. |