diff options
author | Guido van Rossum <guido@python.org> | 1996-05-28 23:54:24 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1996-05-28 23:54:24 (GMT) |
commit | 3fd32ecd9232fcb041b9f1f7a19a1e7e65cf11a0 (patch) | |
tree | 60ae2ea840561ae39ae51ac3e875501ff5461c09 /Lib | |
parent | 1acbffe2e0783f1c3f3c71d250dcfe55096d361e (diff) | |
download | cpython-3fd32ecd9232fcb041b9f1f7a19a1e7e65cf11a0.zip cpython-3fd32ecd9232fcb041b9f1f7a19a1e7e65cf11a0.tar.gz cpython-3fd32ecd9232fcb041b9f1f7a19a1e7e65cf11a0.tar.bz2 |
optimizations due to Fred Drake; added urldefrag() function
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/urlparse.py | 53 |
1 file changed, 35 insertions, 18 deletions
diff --git a/Lib/urlparse.py b/Lib/urlparse.py index 571de46..571ef0e 100644 --- a/Lib/urlparse.py +++ b/Lib/urlparse.py @@ -3,6 +3,7 @@ # Standard/builtin Python modules import string +from string import joinfields, splitfields, find, rfind # A classification of schemes ('' means apply by default) uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file', @@ -18,17 +19,23 @@ uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', # Characters valid in scheme names scheme_chars = string.letters + string.digits + '+-.' +_parse_cache = {} + +def clear_cache(): + global _parse_cache + _parse_cache = {} + + # Parse a URL into 6 components: # <scheme>://<netloc>/<path>;<params>?<query>#<fragment> # Return a 6-tuple: (scheme, netloc, path, params, query, fragment). # Note that we don't break the components up in smaller bits # (e.g. netloc is a single string) and we don't expand % escapes. def urlparse(url, scheme = '', allow_framents = 1): - netloc = '' - path = '' - params = '' - query = '' - fragment = '' + key = url, scheme, allow_framents + if _parse_cache.has_key(key): + return _parse_cache[key] + netloc = path = params = query = fragment = '' i = string.find(url, ':') if i > 0: for c in url[:i]: @@ -54,7 +61,9 @@ def urlparse(url, scheme = '', allow_framents = 1): i = string.find(url, ';') if i >= 0: url, params = url[:i], url[i+1:] - return scheme, netloc, url, params, query, fragment + tuple = scheme, netloc, url, params, query, fragment + _parse_cache[key] = tuple + return tuple # Put a parsed URL back together again. 
This may result in a slightly # different, but equivalent URL, if the URL that was parsed originally @@ -80,7 +89,7 @@ def urljoin(base, url, allow_framents = 1): if not base: return url bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ - urlparse(base, '', allow_framents) + urlparse(base, '', allow_framents) scheme, netloc, path, params, query, fragment = \ urlparse(url, bscheme, allow_framents) # XXX Unofficial hack: default netloc to bnetloc even if @@ -90,9 +99,9 @@ def urljoin(base, url, allow_framents = 1): scheme in uses_netloc and bscheme in uses_netloc: netloc = bnetloc # Strip the port number - i = string.find(netloc, '@') + i = find(netloc, '@') if i < 0: i = 0 - i = string.find(netloc, ':', i) + i = find(netloc, ':', i) if i >= 0: netloc = netloc[:i] if scheme != bscheme or scheme not in uses_relative: @@ -107,15 +116,12 @@ def urljoin(base, url, allow_framents = 1): return urlunparse((scheme, netloc, path, params, query, fragment)) if not path: - path = bpath - if not query: - query = bquery - return urlunparse((scheme, netloc, path, - params, query, fragment)) - i = string.rfind(bpath, '/') + return urlunparse((scheme, netloc, bpath, + params, query or bquery, fragment)) + i = rfind(bpath, '/') if i >= 0: path = bpath[:i] + '/' + path - segments = string.splitfields(path, '/') + segments = splitfields(path, '/') if segments[-1] == '.': segments[-1] = '' while '.' in segments: @@ -132,10 +138,21 @@ def urljoin(base, url, allow_framents = 1): break if len(segments) >= 2 and segments[-1] == '..': segments[-2:] = [''] - path = string.joinfields(segments, '/') - return urlunparse((scheme, netloc, path, + return urlunparse((scheme, netloc, joinfields(segments, '/'), params, query, fragment)) +def urldefrag(url): + """Removes any existing fragment from URL. + + Returns a tuple of the defragmented URL and the fragment. If + the URL contained no fragments, the second element is the + empty string. 
+ """ + s, n, p, a, q, frag = urlparse(url) + defrag = urlunparse((s, n, p, a, q, '')) + return defrag, frag + + test_input = """ http://a/b/c/d |