Diffstat (limited to 'Lib/urlparse.py')
-rw-r--r-- | Lib/urlparse.py | 370
1 files changed, 185 insertions, 185 deletions
diff --git a/Lib/urlparse.py b/Lib/urlparse.py
index b747bc6..b9ecee1 100644
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -6,25 +6,25 @@ UC Irvine, June 1995.
 
 # A classification of schemes ('' means apply by default)
 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
-	'https', 'shttp',
-	'prospero', 'rtsp', 'rtspu', '']
+                 'https', 'shttp',
+                 'prospero', 'rtsp', 'rtspu', '']
 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
-	'file',
-	'https', 'shttp', 'snews',
-	'prospero', 'rtsp', 'rtspu', '']
+               'file',
+               'https', 'shttp', 'snews',
+               'prospero', 'rtsp', 'rtspu', '']
 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
-	'snews', 'sip',
-	]
+                    'snews', 'sip',
+                    ]
 uses_params = ['ftp', 'hdl', 'prospero', 'http',
-	'https', 'shttp', 'rtsp', 'rtspu', 'sip',
-	'']
+               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
+               '']
 uses_query = ['http', 'wais',
-	'https', 'shttp',
-	'gopher', 'rtsp', 'rtspu', 'sip',
-	'']
+              'https', 'shttp',
+              'gopher', 'rtsp', 'rtspu', 'sip',
+              '']
 uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
-	'https', 'shttp', 'snews',
-	'file', 'prospero', '']
+                 'https', 'shttp', 'snews',
+                 'file', 'prospero', '']
 
 # Characters valid in scheme names
 scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
@@ -36,158 +36,158 @@ MAX_CACHE_SIZE = 20
 _parse_cache = {}
 
 def clear_cache():
-	"""Clear the parse cache."""
-	global _parse_cache
-	_parse_cache = {}
+    """Clear the parse cache."""
+    global _parse_cache
+    _parse_cache = {}
 
 def urlparse(url, scheme = '', allow_fragments = 1):
-	"""Parse a URL into 6 components:
-	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
-	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
-	Note that we don't break the components up in smaller bits
-	(e.g. netloc is a single string) and we don't expand % escapes."""
-	key = url, scheme, allow_fragments
-	cached = _parse_cache.get(key, None)
-	if cached:
-		return cached
-	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
-		clear_cache()
-	netloc = path = params = query = fragment = ''
-	i = url.find(':')
-	if i > 0:
-		if url[:i] == 'http': # optimize the common case
-			scheme = url[:i].lower()
-			url = url[i+1:]
-			if url[:2] == '//':
-				i = url.find('/', 2)
-				if i < 0:
-					i = len(url)
-				netloc = url[2:i]
-				url = url[i:]
-			if allow_fragments:
-				i = url.rfind('#')
-				if i >= 0:
-					fragment = url[i+1:]
-					url = url[:i]
-			i = url.find('?')
-			if i >= 0:
-				query = url[i+1:]
-				url = url[:i]
-			i = url.find(';')
-			if i >= 0:
-				params = url[i+1:]
-				url = url[:i]
-			tuple = scheme, netloc, url, params, query, fragment
-			_parse_cache[key] = tuple
-			return tuple
-		for c in url[:i]:
-			if c not in scheme_chars:
-				break
-		else:
-			scheme, url = url[:i].lower(), url[i+1:]
-	if scheme in uses_netloc:
-		if url[:2] == '//':
-			i = url.find('/', 2)
-			if i < 0:
-				i = len(url)
-			netloc, url = url[2:i], url[i:]
-	if allow_fragments and scheme in uses_fragment:
-		i = url.rfind('#')
-		if i >= 0:
-			url, fragment = url[:i], url[i+1:]
-	if scheme in uses_query:
-		i = url.find('?')
-		if i >= 0:
-			url, query = url[:i], url[i+1:]
-	if scheme in uses_params:
-		i = url.find(';')
-		if i >= 0:
-			url, params = url[:i], url[i+1:]
-	tuple = scheme, netloc, url, params, query, fragment
-	_parse_cache[key] = tuple
-	return tuple
+    """Parse a URL into 6 components:
+    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
+    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
+    Note that we don't break the components up in smaller bits
+    (e.g. netloc is a single string) and we don't expand % escapes."""
+    key = url, scheme, allow_fragments
+    cached = _parse_cache.get(key, None)
+    if cached:
+        return cached
+    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
+        clear_cache()
+    netloc = path = params = query = fragment = ''
+    i = url.find(':')
+    if i > 0:
+        if url[:i] == 'http': # optimize the common case
+            scheme = url[:i].lower()
+            url = url[i+1:]
+            if url[:2] == '//':
+                i = url.find('/', 2)
+                if i < 0:
+                    i = len(url)
+                netloc = url[2:i]
+                url = url[i:]
+            if allow_fragments:
+                i = url.rfind('#')
+                if i >= 0:
+                    fragment = url[i+1:]
+                    url = url[:i]
+            i = url.find('?')
+            if i >= 0:
+                query = url[i+1:]
+                url = url[:i]
+            i = url.find(';')
+            if i >= 0:
+                params = url[i+1:]
+                url = url[:i]
+            tuple = scheme, netloc, url, params, query, fragment
+            _parse_cache[key] = tuple
+            return tuple
+        for c in url[:i]:
+            if c not in scheme_chars:
+                break
+        else:
+            scheme, url = url[:i].lower(), url[i+1:]
+    if scheme in uses_netloc:
+        if url[:2] == '//':
+            i = url.find('/', 2)
+            if i < 0:
+                i = len(url)
+            netloc, url = url[2:i], url[i:]
+    if allow_fragments and scheme in uses_fragment:
+        i = url.rfind('#')
+        if i >= 0:
+            url, fragment = url[:i], url[i+1:]
+    if scheme in uses_query:
+        i = url.find('?')
+        if i >= 0:
+            url, query = url[:i], url[i+1:]
+    if scheme in uses_params:
+        i = url.find(';')
+        if i >= 0:
+            url, params = url[:i], url[i+1:]
+    tuple = scheme, netloc, url, params, query, fragment
+    _parse_cache[key] = tuple
+    return tuple
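
For orientation, a minimal sketch of what the reindented urlparse() returns; the URL is an illustrative value, not one of the module's test cases (Python 2):

    import urlparse

    # 'http' hits the special-cased fast path above; the 6-tuple is also
    # memoized in _parse_cache, so an identical second call is a dict hit.
    parts = urlparse.urlparse('http://example.com/a/b;type=d?x=1#frag')
    print parts
    # ('http', 'example.com', '/a/b', 'type=d', 'x=1', 'frag')
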
 
 def urlunparse((scheme, netloc, url, params, query, fragment)):
-	"""Put a parsed URL back together again. This may result in a
-	slightly different, but equivalent URL, if the URL that was parsed
-	originally had redundant delimiters, e.g. a ? with an empty query
-	(the draft states that these are equivalent)."""
-	if netloc or (scheme in uses_netloc and url[:2] == '//'):
-		if url and url[:1] != '/': url = '/' + url
-		url = '//' + (netloc or '') + url
-	if scheme:
-		url = scheme + ':' + url
-	if params:
-		url = url + ';' + params
-	if query:
-		url = url + '?' + query
-	if fragment:
-		url = url + '#' + fragment
-	return url
+    """Put a parsed URL back together again. This may result in a
+    slightly different, but equivalent URL, if the URL that was parsed
+    originally had redundant delimiters, e.g. a ? with an empty query
+    (the draft states that these are equivalent)."""
+    if netloc or (scheme in uses_netloc and url[:2] == '//'):
+        if url and url[:1] != '/': url = '/' + url
+        url = '//' + (netloc or '') + url
+    if scheme:
+        url = scheme + ':' + url
+    if params:
+        url = url + ';' + params
+    if query:
+        url = url + '?' + query
+    if fragment:
+        url = url + '#' + fragment
+    return url
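
Note the Python 2 tuple-unpacking signature: urlunparse() takes the 6-tuple itself rather than six positional arguments, so it composes directly with urlparse(). A small round-trip sketch with the same illustrative values:

    import urlparse

    t = ('http', 'example.com', '/a/b', 'type=d', 'x=1', 'frag')
    print urlparse.urlunparse(t)
    # http://example.com/a/b;type=d?x=1#frag

An empty query or fragment simply drops its delimiter on the way back out, which is why the docstring promises only an equivalent URL, not a byte-identical one.
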
 
 def urljoin(base, url, allow_fragments = 1):
-	"""Join a base URL and a possibly relative URL to form an absolute
-	interpretation of the latter."""
-	if not base:
-		return url
-	if not url:
-		return base
-	bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
-		urlparse(base, '', allow_fragments)
-	scheme, netloc, path, params, query, fragment = \
-		urlparse(url, bscheme, allow_fragments)
-	if scheme != bscheme or scheme not in uses_relative:
-		return url
-	if scheme in uses_netloc:
-		if netloc:
-			return urlunparse((scheme, netloc, path,
-				params, query, fragment))
-		netloc = bnetloc
-	if path[:1] == '/':
-		return urlunparse((scheme, netloc, path,
-			params, query, fragment))
-	if not path:
-		if not params:
-			params = bparams
-		if not query:
-			query = bquery
-		return urlunparse((scheme, netloc, bpath,
-			params, query, fragment))
-	segments = bpath.split('/')[:-1] + path.split('/')
-	# XXX The stuff below is bogus in various ways...
-	if segments[-1] == '.':
-		segments[-1] = ''
-	while '.' in segments:
-		segments.remove('.')
-	while 1:
-		i = 1
-		n = len(segments) - 1
-		while i < n:
-			if (segments[i] == '..'
-			    and segments[i-1] not in ('', '..')):
-				del segments[i-1:i+1]
-				break
-			i = i+1
-		else:
-			break
-	if segments == ['', '..']:
-		segments[-1] = ''
-	elif len(segments) >= 2 and segments[-1] == '..':
-		segments[-2:] = ['']
-	return urlunparse((scheme, netloc, '/'.join(segments),
-		params, query, fragment))
+    """Join a base URL and a possibly relative URL to form an absolute
+    interpretation of the latter."""
+    if not base:
+        return url
+    if not url:
+        return base
+    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
+            urlparse(base, '', allow_fragments)
+    scheme, netloc, path, params, query, fragment = \
+            urlparse(url, bscheme, allow_fragments)
+    if scheme != bscheme or scheme not in uses_relative:
+        return url
+    if scheme in uses_netloc:
+        if netloc:
+            return urlunparse((scheme, netloc, path,
+                               params, query, fragment))
+        netloc = bnetloc
+    if path[:1] == '/':
+        return urlunparse((scheme, netloc, path,
+                           params, query, fragment))
+    if not path:
+        if not params:
+            params = bparams
+        if not query:
+            query = bquery
+        return urlunparse((scheme, netloc, bpath,
+                           params, query, fragment))
+    segments = bpath.split('/')[:-1] + path.split('/')
+    # XXX The stuff below is bogus in various ways...
+    if segments[-1] == '.':
+        segments[-1] = ''
+    while '.' in segments:
+        segments.remove('.')
+    while 1:
+        i = 1
+        n = len(segments) - 1
+        while i < n:
+            if (segments[i] == '..'
+                and segments[i-1] not in ('', '..')):
+                del segments[i-1:i+1]
+                break
+            i = i+1
+        else:
+            break
+    if segments == ['', '..']:
+        segments[-1] = ''
+    elif len(segments) >= 2 and segments[-1] == '..':
+        segments[-2:] = ['']
+    return urlunparse((scheme, netloc, '/'.join(segments),
+                       params, query, fragment))
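
The segment juggling above resolves relative references in the style of RFC 1808, the self-acknowledged "bogus" corner cases notwithstanding. Two illustrative calls, traced against the code as reindented here:

    import urlparse

    base = 'http://a/b/c/d;p?q'
    print urlparse.urljoin(base, 'g')     # http://a/b/c/g
    print urlparse.urljoin(base, '../g')  # http://a/b/g
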
+ """ + s, n, p, a, q, frag = urlparse(url) + defrag = urlunparse((s, n, p, a, q, '')) + return defrag, frag test_input = """ @@ -226,34 +226,34 @@ test_input = """ # XXX The result for //g is actually http://g/; is this a problem? def test(): - import sys - base = '' - if sys.argv[1:]: - fn = sys.argv[1] - if fn == '-': - fp = sys.stdin - else: - fp = open(fn) - else: - import StringIO - fp = StringIO.StringIO(test_input) - while 1: - line = fp.readline() - if not line: break - words = line.split() - if not words: - continue - url = words[0] - parts = urlparse(url) - print '%-10s : %s' % (url, parts) - abs = urljoin(base, url) - if not base: - base = abs - wrapped = '<URL:%s>' % abs - print '%-10s = %s' % (url, wrapped) - if len(words) == 3 and words[1] == '=': - if wrapped != words[2]: - print 'EXPECTED', words[2], '!!!!!!!!!!' + import sys + base = '' + if sys.argv[1:]: + fn = sys.argv[1] + if fn == '-': + fp = sys.stdin + else: + fp = open(fn) + else: + import StringIO + fp = StringIO.StringIO(test_input) + while 1: + line = fp.readline() + if not line: break + words = line.split() + if not words: + continue + url = words[0] + parts = urlparse(url) + print '%-10s : %s' % (url, parts) + abs = urljoin(base, url) + if not base: + base = abs + wrapped = '<URL:%s>' % abs + print '%-10s = %s' % (url, wrapped) + if len(words) == 3 and words[1] == '=': + if wrapped != words[2]: + print 'EXPECTED', words[2], '!!!!!!!!!!' if __name__ == '__main__': - test() + test() |