Diffstat (limited to 'Lib/urlparse.py')
-rw-r--r--  Lib/urlparse.py  272
1 file changed, 140 insertions(+), 132 deletions(-)
diff --git a/Lib/urlparse.py b/Lib/urlparse.py
index 4bc577c..294545c 100644
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -1,11 +1,32 @@
 """Parse (absolute and relative) URLs.
 
-See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
-UC Irvine, June 1995.
+The urlparse module is based upon the following RFC specifications.
+
+RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee,
+R. Fielding, and L. Masinter, January 2005.
+
+RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
+Berners-Lee, R. Fielding, and L. Masinter, August 1998.
+
+RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, and J. Zawinski, July 1998.
+
+RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
+1995.
+
+RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter,
+and M. McCahill, December 1994.
+
+RFC 3986 is considered the current standard, and any future changes to
+the urlparse module should conform to it.  The urlparse module is
+currently not entirely compliant with this RFC: to cover de facto
+parsing scenarios, and for backward compatibility, some parsing
+quirks from older RFCs are retained.  The test cases in
+test_urlparse.py provide a good indicator of parsing behavior.
+
 """
 
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
-           "urlsplit", "urlunsplit"]
+           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
 
 # A classification of schemes ('' means apply by default)
 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
@@ -14,7 +35,7 @@ uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap',
                'wais', 'file', 'mms', 'https', 'shttp',
                'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
-               'svn', 'svn+ssh', 'sftp']
+               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                     'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
@@ -37,52 +58,17 @@ _parse_cache = {}
 
 def clear_cache():
     """Clear the parse cache."""
-    global _parse_cache
-    _parse_cache = {}
-
-
-class BaseResult(tuple):
-    """Base class for the parsed result objects.
-
-    This provides the attributes shared by the two derived result
-    objects as read-only properties.  The derived classes are
-    responsible for checking the right number of arguments were
-    supplied to the constructor.
-
-    """
+    _parse_cache.clear()
 
-    __slots__ = ()
-
-    # Attributes that access the basic components of the URL:
-
-    @property
-    def scheme(self):
-        return self[0]
-
-    @property
-    def netloc(self):
-        return self[1]
-
-    @property
-    def path(self):
-        return self[2]
-
-    @property
-    def query(self):
-        return self[-2]
-
-    @property
-    def fragment(self):
-        return self[-1]
 
-    # Additional attributes that provide access to parsed-out portions
-    # of the netloc:
+class ResultMixin(object):
+    """Shared methods for the parsed result objects."""
 
     @property
     def username(self):
         netloc = self.netloc
         if "@" in netloc:
-            userinfo = netloc.split("@", 1)[0]
+            userinfo = netloc.rsplit("@", 1)[0]
             if ":" in userinfo:
                 userinfo = userinfo.split(":", 1)[0]
             return userinfo
@@ -92,7 +78,7 @@ class BaseResult(tuple):
     def password(self):
         netloc = self.netloc
         if "@" in netloc:
-            userinfo = netloc.split("@", 1)[0]
+            userinfo = netloc.rsplit("@", 1)[0]
             if ":" in userinfo:
                 return userinfo.split(":", 1)[1]
         return None
@@ -101,7 +87,7 @@
     def hostname(self):
         netloc = self.netloc
         if "@" in netloc:
-            netloc = netloc.split("@", 1)[1]
+            netloc = netloc.rsplit("@", 1)[1]
         if ":" in netloc:
             netloc = netloc.split(":", 1)[0]
         return netloc.lower() or None
@@ -110,37 +96,26 @@
     def port(self):
         netloc = self.netloc
         if "@" in netloc:
-            netloc = netloc.split("@", 1)[1]
+            netloc = netloc.rsplit("@", 1)[1]
         if ":" in netloc:
             port = netloc.split(":", 1)[1]
             return int(port, 10)
         return None
 
+from collections import namedtuple
 
-class SplitResult(BaseResult):
+class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
 
     __slots__ = ()
 
-    def __new__(cls, scheme, netloc, path, query, fragment):
-        return BaseResult.__new__(
-            cls, (scheme, netloc, path, query, fragment))
-
     def geturl(self):
         return urlunsplit(self)
 
 
-class ParseResult(BaseResult):
+class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
 
     __slots__ = ()
 
-    def __new__(cls, scheme, netloc, path, params, query, fragment):
-        return BaseResult.__new__(
-            cls, (scheme, netloc, path, params, query, fragment))
-
-    @property
-    def params(self):
-        return self[3]
-
     def geturl(self):
         return urlunparse(self)
 
@@ -209,7 +184,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
                 break
         else:
             scheme, url = url[:i].lower(), url[i+1:]
-    if scheme in uses_netloc and url[:2] == '//':
+
+    if url[:2] == '//':
         netloc, url = _splitnetloc(url, 2)
     if allow_fragments and scheme in uses_fragment and '#' in url:
         url, fragment = url.split('#', 1)
@@ -219,16 +195,23 @@
     _parse_cache[key] = v
     return v
 
-def urlunparse((scheme, netloc, url, params, query, fragment)):
+def urlunparse(data):
     """Put a parsed URL back together again.  This may result in a
     slightly different, but equivalent URL, if the URL that was parsed
     originally had redundant delimiters, e.g. a ? with an empty query
     (the draft states that these are equivalent)."""
+    scheme, netloc, url, params, query, fragment = data
     if params:
         url = "%s;%s" % (url, params)
     return urlunsplit((scheme, netloc, url, query, fragment))
 
-def urlunsplit((scheme, netloc, url, query, fragment)):
+def urlunsplit(data):
+    """Combine the elements of a tuple as returned by urlsplit() into a
+    complete URL as a string.  The data argument can be any five-item iterable.
+    This may result in a slightly different, but equivalent URL, if the URL that
+    was parsed originally had unnecessary delimiters (for example, a ? with an
+    empty query; the RFC states that these are equivalent)."""
+    scheme, netloc, url, query, fragment = data
     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
         if url and url[:1] != '/': url = '/' + url
         url = '//' + (netloc or '') + url
@@ -261,9 +244,18 @@ def urljoin(base, url, allow_fragments=True):
     if path[:1] == '/':
         return urlunparse((scheme, netloc, path,
                            params, query, fragment))
-    if not (path or params or query):
-        return urlunparse((scheme, netloc, bpath,
-                           bparams, bquery, fragment))
+    if not path:
+        path = bpath
+        if not params:
+            params = bparams
+        else:
+            path = path[:-1]
+            return urlunparse((scheme, netloc, path,
+                                params, query, fragment))
+        if not query:
+            query = bquery
+        return urlunparse((scheme, netloc, path,
+                           params, query, fragment))
     segments = bpath.split('/')[:-1] + path.split('/')
     # XXX The stuff below is bogus in various ways...
     if segments[-1] == '.':
@@ -302,73 +294,89 @@ def urldefrag(url):
     else:
         return url, ''
 
+# unquote helper for parse_qs and parse_qsl.
+# It cannot be imported from urllib, as that would create a circular
+# reference: urllib itself uses urlparse methods (e.g. urljoin).
 
-test_input = """
-      http://a/b/c/d
-
-      g:h        = <URL:g:h>
-      http:g     = <URL:http://a/b/c/g>
-      http:      = <URL:http://a/b/c/d>
-      g          = <URL:http://a/b/c/g>
-      ./g        = <URL:http://a/b/c/g>
-      g/         = <URL:http://a/b/c/g/>
-      /g         = <URL:http://a/g>
-      //g        = <URL:http://g>
-      ?y         = <URL:http://a/b/c/d?y>
-      g?y        = <URL:http://a/b/c/g?y>
-      g?y/./x    = <URL:http://a/b/c/g?y/./x>
-      .          = <URL:http://a/b/c/>
-      ./         = <URL:http://a/b/c/>
-      ..         = <URL:http://a/b/>
-      ../        = <URL:http://a/b/>
-      ../g       = <URL:http://a/b/g>
-      ../..      = <URL:http://a/>
-      ../../g    = <URL:http://a/g>
-      ../../../g = <URL:http://a/../g>
-      ./../g     = <URL:http://a/b/g>
-      ./g/.      = <URL:http://a/b/c/g/>
-      /./g       = <URL:http://a/./g>
-      g/./h      = <URL:http://a/b/c/g/h>
-      g/../h     = <URL:http://a/b/c/h>
-      http:g     = <URL:http://a/b/c/g>
-      http:      = <URL:http://a/b/c/d>
-      http:?y         = <URL:http://a/b/c/d?y>
-      http:g?y        = <URL:http://a/b/c/g?y>
-      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
-"""
 
-def test():
-    import sys
-    base = ''
-    if sys.argv[1:]:
-        fn = sys.argv[1]
-        if fn == '-':
-            fp = sys.stdin
-        else:
-            fp = open(fn)
-    else:
+_hexdig = '0123456789ABCDEFabcdef'
+_hextochr = dict((a+b, chr(int(a+b,16))) for a in _hexdig for b in _hexdig)
+
+def unquote(s):
+    """unquote('abc%20def') -> 'abc def'."""
+    res = s.split('%')
+    for i in xrange(1, len(res)):
+        item = res[i]
         try:
-            from cStringIO import StringIO
-        except ImportError:
-            from StringIO import StringIO
-        fp = StringIO(test_input)
-    while 1:
-        line = fp.readline()
-        if not line: break
-        words = line.split()
-        if not words:
+            res[i] = _hextochr[item[:2]] + item[2:]
+        except KeyError:
+            res[i] = '%' + item
+        except UnicodeDecodeError:
+            res[i] = unichr(int(item[:2], 16)) + item[2:]
+    return "".join(res)
+
+def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
+    """Parse a query given as a string argument.
+
+    Arguments:
+
+    qs: URL-encoded query string to be parsed
+
+    keep_blank_values: flag indicating whether blank values in
+        URL encoded queries should be treated as blank strings.
+        A true value indicates that blanks should be retained as
+        blank strings.  The default false value indicates that
+        blank values are to be ignored and treated as if they were
+        not included.
+
+    strict_parsing: flag indicating what to do with parsing errors.
+        If false (the default), errors are silently ignored.
+        If true, errors raise a ValueError exception.
+    """
+    dict = {}
+    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
+        if name in dict:
+            dict[name].append(value)
+        else:
+            dict[name] = [value]
+    return dict
+
+def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
+    """Parse a query given as a string argument.
+
+    Arguments:
+
+    qs: URL-encoded query string to be parsed
+
+    keep_blank_values: flag indicating whether blank values in
+        URL encoded queries should be treated as blank strings.  A
+        true value indicates that blanks should be retained as blank
+        strings.  The default false value indicates that blank values
+        are to be ignored and treated as if they were not included.
+
+    strict_parsing: flag indicating what to do with parsing errors.  If
+        false (the default), errors are silently ignored.  If true,
+        errors raise a ValueError exception.
+
+    Returns a list, as G-d intended.
+    """
+    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+    r = []
+    for name_value in pairs:
+        if not name_value and not strict_parsing:
             continue
-        url = words[0]
-        parts = urlparse(url)
-        print '%-10s : %s' % (url, parts)
-        abs = urljoin(base, url)
-        if not base:
-            base = abs
-        wrapped = '<URL:%s>' % abs
-        print '%-10s = %s' % (url, wrapped)
-        if len(words) == 3 and words[1] == '=':
-            if wrapped != words[2]:
-                print 'EXPECTED', words[2], '!!!!!!!!!!'
-
-if __name__ == '__main__':
-    test()
+        nv = name_value.split('=', 1)
+        if len(nv) != 2:
+            if strict_parsing:
+                raise ValueError, "bad query field: %r" % (name_value,)
+            # Handle case of a control-name with no equal sign
+            if keep_blank_values:
+                nv.append('')
+            else:
+                continue
+        if len(nv[1]) or keep_blank_values:
+            name = unquote(nv[0].replace('+', ' '))
+            value = unquote(nv[1].replace('+', ' '))
+            r.append((name, value))
+
+    return r
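The hunks above replace BaseResult with namedtuple subclasses that mix in ResultMixin, and switch the userinfo split to rsplit("@", 1) so that only the last "@" separates user info from the host. A minimal Python 2 sketch of the resulting behavior, assuming the patched module is importable as urlparse; the URL is an invented example:

    # Exercising the namedtuple-based ParseResult and the ResultMixin
    # properties patched above.
    from urlparse import urlparse

    p = urlparse('http://user:pa@ss@example.com:8042/over/there;type=a?name=ferret#nose')
    print p.scheme, p.netloc        # http user:pa@ss@example.com:8042
    print p[0] == p.scheme          # True: results still behave as tuples
    # rsplit("@", 1) keys on the *last* "@", so an "@" inside the password
    # no longer breaks the host/port properties:
    print p.username, p.password    # user pa@ss
    print p.hostname, p.port        # example.com 8042
    print p.geturl()                # round-trips through urlunparse()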
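Because urlunparse() and urlunsplit() now take a single data argument and unpack it in the body rather than in the signature, any five- or six-item iterable is accepted. A small illustration, again a sketch against the patched module:

    from urlparse import urlsplit, urlunsplit

    parts = ['https', 'example.com', '/path', 'q=1', 'frag']  # a plain list
    print urlunsplit(parts)          # https://example.com/path?q=1#frag
    print urlunsplit(iter(parts))    # an iterator of the right length works too
    print urlunsplit(urlsplit('https://example.com/path?q=1#frag'))  # round trip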
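The module-private unquote() decodes %xx escapes through the precomputed _hextochr table and leaves malformed escapes untouched instead of raising. For illustration (unquote is module-level but not listed in __all__; the inputs are made up):

    from urlparse import unquote

    print unquote('abc%20def')    # abc def
    print unquote('100%25')       # 100%
    print unquote('50%x olive')   # 50%x olive  (bad escape is left as-is)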
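Finally, a short usage sketch of the parse_qs()/parse_qsl() pair added by the patch: parse_qsl() preserves duplicates and pair order, while parse_qs() folds values for a repeated name into a list (the query string below is an invented example, and dict display order may vary):

    from urlparse import parse_qs, parse_qsl

    qs = 'name=ferret&color=red&color=blue&empty='
    print parse_qsl(qs)   # [('name', 'ferret'), ('color', 'red'), ('color', 'blue')]
    print parse_qs(qs)    # {'color': ['red', 'blue'], 'name': ['ferret']}
    print parse_qs(qs, keep_blank_values=1)   # blank value kept: 'empty': ['']
    print parse_qsl('a=1;b=2')    # ';' also separates pairs: [('a', '1'), ('b', '2')]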
