Diffstat (limited to 'Lib/urlparse.py')
-rw-r--r--  Lib/urlparse.py  272
1 file changed, 140 insertions(+), 132 deletions(-)
diff --git a/Lib/urlparse.py b/Lib/urlparse.py
index 4bc577c..294545c 100644
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -1,11 +1,32 @@
 """Parse (absolute and relative) URLs.
 
-See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
-UC Irvine, June 1995.
+The urlparse module is based upon the following RFC specifications.
+
+RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee,
+R. Fielding, and L. Masinter, January 2005.
+
+RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
+Berners-Lee, R. Fielding, and L. Masinter, August 1998.
+
+RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, and J. Zawinski, July 1998.
+
+RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
+1995.
+
+RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter,
+and M. McCahill, December 1994.
+
+RFC 3986 is considered the current standard, and any future changes to
+the urlparse module should conform to it.  The urlparse module is
+currently not entirely compliant with this RFC: to cover de facto
+parsing scenarios, and for backward compatibility, some parsing
+quirks from older RFCs are retained.  The test cases in
+test_urlparse.py provide a good indicator of parsing behavior.
+
 """
 
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
-           "urlsplit", "urlunsplit"]
+           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
 
 # A classification of schemes ('' means apply by default)
 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
@@ -14,7 +35,7 @@ uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap',
                'wais', 'file', 'mms', 'https', 'shttp',
                'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
-               'svn', 'svn+ssh', 'sftp']
+               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                     'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
@@ -37,52 +58,17 @@ _parse_cache = {}
 
 def clear_cache():
     """Clear the parse cache."""
-    global _parse_cache
-    _parse_cache = {}
-
-
-class BaseResult(tuple):
-    """Base class for the parsed result objects.
-
-    This provides the attributes shared by the two derived result
-    objects as read-only properties.  The derived classes are
-    responsible for checking the right number of arguments were
-    supplied to the constructor.
-
-    """
+    _parse_cache.clear()
 
-    __slots__ = ()
-
-    # Attributes that access the basic components of the URL:
-
-    @property
-    def scheme(self):
-        return self[0]
-
-    @property
-    def netloc(self):
-        return self[1]
-
-    @property
-    def path(self):
-        return self[2]
-
-    @property
-    def query(self):
-        return self[-2]
-
-    @property
-    def fragment(self):
-        return self[-1]
 
-    # Additional attributes that provide access to parsed-out portions
-    # of the netloc:
+class ResultMixin(object):
+    """Shared methods for the parsed result objects."""
 
     @property
     def username(self):
         netloc = self.netloc
         if "@" in netloc:
-            userinfo = netloc.split("@", 1)[0]
+            userinfo = netloc.rsplit("@", 1)[0]
             if ":" in userinfo:
                 userinfo = userinfo.split(":", 1)[0]
             return userinfo
@@ -92,7 +78,7 @@ class BaseResult(tuple):
     def password(self):
         netloc = self.netloc
         if "@" in netloc:
-            userinfo = netloc.split("@", 1)[0]
+            userinfo = netloc.rsplit("@", 1)[0]
             if ":" in userinfo:
                 return userinfo.split(":", 1)[1]
         return None
@@ -101,7 +87,7 @@
     def hostname(self):
         netloc = self.netloc
         if "@" in netloc:
-            netloc = netloc.split("@", 1)[1]
+            netloc = netloc.rsplit("@", 1)[1]
         if ":" in netloc:
             netloc = netloc.split(":", 1)[0]
         return netloc.lower() or None
@@ -110,37 +96,26 @@
     def port(self):
         netloc = self.netloc
         if "@" in netloc:
-            netloc = netloc.split("@", 1)[1]
+            netloc = netloc.rsplit("@", 1)[1]
         if ":" in netloc:
             port = netloc.split(":", 1)[1]
             return int(port, 10)
         return None
 
+from collections import namedtuple
 
-class SplitResult(BaseResult):
+class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
 
     __slots__ = ()
 
-    def __new__(cls, scheme, netloc, path, query, fragment):
-        return BaseResult.__new__(
-            cls, (scheme, netloc, path, query, fragment))
-
     def geturl(self):
         return urlunsplit(self)
 
 
-class ParseResult(BaseResult):
+class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
 
     __slots__ = ()
 
-    def __new__(cls, scheme, netloc, path, params, query, fragment):
-        return BaseResult.__new__(
-            cls, (scheme, netloc, path, params, query, fragment))
-
-    @property
-    def params(self):
-        return self[3]
-
     def geturl(self):
         return urlunparse(self)
 
@@ -209,7 +184,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
                 break
         else:
             scheme, url = url[:i].lower(), url[i+1:]
-    if scheme in uses_netloc and url[:2] == '//':
+
+    if url[:2] == '//':
         netloc, url = _splitnetloc(url, 2)
     if allow_fragments and scheme in uses_fragment and '#' in url:
         url, fragment = url.split('#', 1)
@@ -219,16 +195,23 @@
     _parse_cache[key] = v
     return v
 
-def urlunparse((scheme, netloc, url, params, query, fragment)):
+def urlunparse(data):
     """Put a parsed URL back together again.  This may result in a
     slightly different, but equivalent URL, if the URL that was parsed
     originally had redundant delimiters, e.g. a ? with an empty query
     (the draft states that these are equivalent)."""
+    scheme, netloc, url, params, query, fragment = data
     if params:
         url = "%s;%s" % (url, params)
     return urlunsplit((scheme, netloc, url, query, fragment))
 
-def urlunsplit((scheme, netloc, url, query, fragment)):
+def urlunsplit(data):
+    """Combine the elements of a tuple as returned by urlsplit() into a
+    complete URL as a string.  The data argument can be any five-item iterable.
+    This may result in a slightly different, but equivalent URL, if the URL that
+    was parsed originally had unnecessary delimiters (for example, a ? with an
+    empty query; the RFC states that these are equivalent)."""
+    scheme, netloc, url, query, fragment = data
     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
         if url and url[:1] != '/': url = '/' + url
         url = '//' + (netloc or '') + url
@@ -261,9 +244,18 @@ def urljoin(base, url, allow_fragments=True):
     if path[:1] == '/':
         return urlunparse((scheme, netloc, path,
                            params, query, fragment))
-    if not (path or params or query):
-        return urlunparse((scheme, netloc, bpath,
-                           bparams, bquery, fragment))
+    if not path:
+        path = bpath
+        if not params:
+            params = bparams
+        else:
+            path = path[:-1]
+            return urlunparse((scheme, netloc, path,
+                                params, query, fragment))
+        if not query:
+            query = bquery
+        return urlunparse((scheme, netloc, path,
+                           params, query, fragment))
     segments = bpath.split('/')[:-1] + path.split('/')
     # XXX The stuff below is bogus in various ways...
     if segments[-1] == '.':
@@ -302,73 +294,89 @@ def urldefrag(url):
     else:
         return url, ''
 
+# unquote helper for parse_qs and parse_qsl.
+# It cannot be imported from urllib, as that would create a circular
+# reference: urllib itself uses urlparse methods (e.g. urljoin).
 
-test_input = """
-      http://a/b/c/d
-
-      g:h        = <URL:g:h>
-      http:g     = <URL:http://a/b/c/g>
-      http:      = <URL:http://a/b/c/d>
-      g          = <URL:http://a/b/c/g>
-      ./g        = <URL:http://a/b/c/g>
-      g/         = <URL:http://a/b/c/g/>
-      /g         = <URL:http://a/g>
-      //g        = <URL:http://g>
-      ?y         = <URL:http://a/b/c/d?y>
-      g?y        = <URL:http://a/b/c/g?y>
-      g?y/./x    = <URL:http://a/b/c/g?y/./x>
-      .          = <URL:http://a/b/c/>
-      ./         = <URL:http://a/b/c/>
-      ..         = <URL:http://a/b/>
-      ../        = <URL:http://a/b/>
-      ../g       = <URL:http://a/b/g>
-      ../..      = <URL:http://a/>
-      ../../g    = <URL:http://a/g>
-      ../../../g = <URL:http://a/../g>
-      ./../g     = <URL:http://a/b/g>
-      ./g/.      = <URL:http://a/b/c/g/>
-      /./g       = <URL:http://a/./g>
-      g/./h      = <URL:http://a/b/c/g/h>
-      g/../h     = <URL:http://a/b/c/h>
-      http:g     = <URL:http://a/b/c/g>
-      http:      = <URL:http://a/b/c/d>
-      http:?y         = <URL:http://a/b/c/d?y>
-      http:g?y        = <URL:http://a/b/c/g?y>
-      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
-"""
 
-def test():
-    import sys
-    base = ''
-    if sys.argv[1:]:
-        fn = sys.argv[1]
-        if fn == '-':
-            fp = sys.stdin
-        else:
-            fp = open(fn)
-    else:
+_hexdig = '0123456789ABCDEFabcdef'
+_hextochr = dict((a+b, chr(int(a+b,16))) for a in _hexdig for b in _hexdig)
+
+def unquote(s):
+    """unquote('abc%20def') -> 'abc def'."""
+    res = s.split('%')
+    for i in xrange(1, len(res)):
+        item = res[i]
         try:
-            from cStringIO import StringIO
-        except ImportError:
-            from StringIO import StringIO
-        fp = StringIO(test_input)
-    while 1:
-        line = fp.readline()
-        if not line: break
-        words = line.split()
-        if not words:
+            res[i] = _hextochr[item[:2]] + item[2:]
+        except KeyError:
+            res[i] = '%' + item
+        except UnicodeDecodeError:
+            res[i] = unichr(int(item[:2], 16)) + item[2:]
+    return "".join(res)
+
+def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
+    """Parse a query given as a string argument.
+
+    Arguments:
+
+    qs: URL-encoded query string to be parsed
+
+    keep_blank_values: flag indicating whether blank values in
+        URL encoded queries should be treated as blank strings.
+        A true value indicates that blanks should be retained as
+        blank strings.  The default false value indicates that
+        blank values are to be ignored and treated as if they were
+        not included.
+
+    strict_parsing: flag indicating what to do with parsing errors.
+        If false (the default), errors are silently ignored.
+        If true, errors raise a ValueError exception.
+    """
+    dict = {}
+    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
+        if name in dict:
+            dict[name].append(value)
+        else:
+            dict[name] = [value]
+    return dict
+
+def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
+    """Parse a query given as a string argument.
+
+    Arguments:
+
+    qs: URL-encoded query string to be parsed
+
+    keep_blank_values: flag indicating whether blank values in
+        URL encoded queries should be treated as blank strings.  A
+        true value indicates that blanks should be retained as blank
+        strings.  The default false value indicates that blank values
+        are to be ignored and treated as if they were not included.
+
+    strict_parsing: flag indicating what to do with parsing errors.  If
+        false (the default), errors are silently ignored.  If true,
+        errors raise a ValueError exception.
+
+    Returns a list, as G-d intended.
+    """
+    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+    r = []
+    for name_value in pairs:
+        if not name_value and not strict_parsing:
             continue
-        url = words[0]
-        parts = urlparse(url)
-        print '%-10s : %s' % (url, parts)
-        abs = urljoin(base, url)
-        if not base:
-            base = abs
-        wrapped = '<URL:%s>' % abs
-        print '%-10s = %s' % (url, wrapped)
-        if len(words) == 3 and words[1] == '=':
-            if wrapped != words[2]:
-                print 'EXPECTED', words[2], '!!!!!!!!!!'
-
-if __name__ == '__main__':
-    test()
+        nv = name_value.split('=', 1)
+        if len(nv) != 2:
+            if strict_parsing:
+                raise ValueError, "bad query field: %r" % (name_value,)
+            # Handle case of a control-name with no equal sign
+            if keep_blank_values:
+                nv.append('')
+            else:
+                continue
+        if len(nv[1]) or keep_blank_values:
+            name = unquote(nv[0].replace('+', ' '))
+            value = unquote(nv[1].replace('+', ' '))
+            r.append((name, value))
+
+    return r
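The hunks above replace BaseResult with namedtuple subclasses that mix in ResultMixin, and switch the userinfo split to rsplit("@", 1) so that only the last "@" separates user info from the host. A minimal Python 2 sketch of the resulting behavior, assuming the patched module is importable as urlparse; the URL is an invented example:

    # Exercising the namedtuple-based ParseResult and the ResultMixin
    # properties patched above.
    from urlparse import urlparse

    p = urlparse('http://user:pa@ss@example.com:8042/over/there;type=a?name=ferret#nose')
    print p.scheme, p.netloc        # http user:pa@ss@example.com:8042
    print p[0] == p.scheme          # True: results still behave as tuples
    # rsplit("@", 1) keys on the *last* "@", so an "@" inside the password
    # no longer breaks the host/port properties:
    print p.username, p.password    # user pa@ss
    print p.hostname, p.port        # example.com 8042
    print p.geturl()                # round-trips through urlunparse()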
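Because urlunparse() and urlunsplit() now take a single data argument and unpack it in the body rather than in the signature, any five- or six-item iterable is accepted. A small illustration, again a sketch against the patched module:

    from urlparse import urlsplit, urlunsplit

    parts = ['https', 'example.com', '/path', 'q=1', 'frag']  # a plain list
    print urlunsplit(parts)          # https://example.com/path?q=1#frag
    print urlunsplit(iter(parts))    # an iterator of the right length works too
    print urlunsplit(urlsplit('https://example.com/path?q=1#frag'))  # round trip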
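The module-private unquote() decodes %xx escapes through the precomputed _hextochr table and leaves malformed escapes untouched instead of raising. For illustration (unquote is module-level but not listed in __all__; the inputs are made up):

    from urlparse import unquote

    print unquote('abc%20def')    # abc def
    print unquote('100%25')       # 100%
    print unquote('50%x olive')   # 50%x olive  (bad escape is left as-is)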
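Finally, a short usage sketch of the parse_qs()/parse_qsl() pair added by the patch: parse_qsl() preserves duplicates and pair order, while parse_qs() folds values for a repeated name into a list (the query string below is an invented example, and dict display order may vary):

    from urlparse import parse_qs, parse_qsl

    qs = 'name=ferret&color=red&color=blue&empty='
    print parse_qsl(qs)   # [('name', 'ferret'), ('color', 'red'), ('color', 'blue')]
    print parse_qs(qs)    # {'color': ['red', 'blue'], 'name': ['ferret']}
    print parse_qs(qs, keep_blank_values=1)   # blank value kept: 'empty': ['']
    print parse_qsl('a=1;b=2')    # ';' also separates pairs: [('a', '1'), ('b', '2')]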
