From 9fc443cf590c76d4b979c46dc954d3956cee0319 Mon Sep 17 00:00:00 2001
From: Nick Coghlan
Date: Tue, 30 Nov 2010 15:48:08 +0000
Subject: Issue 9873: the URL parsing functions now accept ASCII encoded byte sequences in addition to character strings
---
 Doc/library/urllib.parse.rst | 231 +++++++++++++++++++++++++++----------
 Doc/whatsnew/3.2.rst         |   8 ++
 Lib/test/test_urlparse.py    | 235 ++++++++++++++++++++++++++++++++++----
 Lib/urllib/parse.py          | 263 ++++++++++++++++++++++++++++++++++---------
 Misc/NEWS                    |   3 +
 5 files changed, 603 insertions(+), 137 deletions(-)

diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
index a15ff47..eab218e 100644
--- a/Doc/library/urllib.parse.rst
+++ b/Doc/library/urllib.parse.rst
@@ -24,7 +24,15 @@ following URL schemes: ``file``, ``ftp``, ``gopher``, ``hdl``, ``http``,
 ``rsync``, ``rtsp``, ``rtspu``, ``sftp``, ``shttp``, ``sip``, ``sips``,
 ``snews``, ``svn``, ``svn+ssh``, ``telnet``, ``wais``.
 
-The :mod:`urllib.parse` module defines the following functions:
+The :mod:`urllib.parse` module defines functions that fall into two broad
+categories: URL parsing and URL quoting. These are covered in detail in
+the following sections.
+
+URL Parsing
+-----------
+
+The URL parsing functions focus on splitting a URL string into its components,
+or on combining URL components into a URL string.
 
 .. function:: urlparse(urlstring, scheme='', allow_fragments=True)
@@ -242,6 +250,161 @@ The :mod:`urllib.parse` module defines the following functions:
    string. If there is no fragment identifier in *url*, return *url*
    unmodified and an empty string.
 
+   The return value is actually an instance of a subclass of :class:`tuple`. This
+   class has the following additional read-only convenience attributes:
+
+   +------------------+-------+-------------------------+----------------------+
+   | Attribute        | Index | Value                   | Value if not present |
+   +==================+=======+=========================+======================+
+   | :attr:`url`      | 0     | URL with no fragment    | empty string         |
+   +------------------+-------+-------------------------+----------------------+
+   | :attr:`fragment` | 1     | Fragment identifier     | empty string         |
+   +------------------+-------+-------------------------+----------------------+
+
+   See section :ref:`urlparse-result-object` for more information on the result
+   object.
+
+   .. versionchanged:: 3.2
+      Result is a structured object rather than a simple 2-tuple
+
+
+Parsing ASCII Encoded Bytes
+---------------------------
+
+The URL parsing functions were originally designed to operate on character
+strings only. In practice, it is useful to be able to manipulate properly
+quoted and encoded URLs as sequences of ASCII bytes. Accordingly, the
+URL parsing functions in this module all operate on :class:`bytes` and
+:class:`bytearray` objects in addition to :class:`str` objects.
+
+If :class:`str` data is passed in, the result will also contain only
+:class:`str` data. If :class:`bytes` or :class:`bytearray` data is
+passed in, the result will contain only :class:`bytes` data.
+
+Attempting to mix :class:`str` data with :class:`bytes` or
+:class:`bytearray` in a single function call will result in a
+:exc:`TypeError` being raised, while attempting to pass in non-ASCII
+byte values will trigger :exc:`UnicodeDecodeError`.
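+
+For example, here is a short doctest-style sketch of these rules (purely
+illustrative; the exception message shown is the one used by this module)::
+
+   >>> from urllib.parse import urlsplit
+   >>> r = urlsplit('http://www.python.org/doc/#frag')
+   >>> r.netloc, r.fragment
+   ('www.python.org', 'frag')
+   >>> rb = urlsplit(b'http://www.python.org/doc/#frag')
+   >>> rb.netloc, rb.fragment
+   (b'www.python.org', b'frag')
+   >>> urlsplit('http://www.python.org/doc/', b'http')
+   Traceback (most recent call last):
+     ...
+   TypeError: Cannot mix str and non-str arguments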
+
+To support easier conversion of result objects between :class:`str` and
+:class:`bytes`, all return values from URL parsing functions provide
+either an :meth:`encode` method (when the result contains :class:`str`
+data) or a :meth:`decode` method (when the result contains :class:`bytes`
+data). The signatures of these methods match those of the corresponding
+:class:`str` and :class:`bytes` methods (except that the default encoding
+is ``'ascii'`` rather than ``'utf-8'``). Each produces a value of a
+corresponding type that contains either :class:`bytes` data (for
+:meth:`encode` methods) or :class:`str` data (for
+:meth:`decode` methods).
+
+Applications that need to operate on potentially improperly quoted URLs
+that may contain non-ASCII data will need to do their own decoding from
+bytes to characters before invoking the URL parsing methods.
+
+The behaviour described in this section applies only to the URL parsing
+functions. The URL quoting functions use their own rules when producing
+or consuming byte sequences as detailed in the documentation of the
+individual URL quoting functions.
+
+.. versionchanged:: 3.2
+   URL parsing functions now accept ASCII encoded byte sequences
+
+
+.. _urlparse-result-object:
+
+Structured Parse Results
+------------------------
+
+The result objects from the :func:`urlparse`, :func:`urlsplit` and
+:func:`urldefrag` functions are subclasses of the :class:`tuple` type.
+These subclasses add the attributes listed in the documentation for
+those functions, the encoding and decoding support described in the
+previous section, as well as an additional method:
+
+.. method:: urllib.parse.SplitResult.geturl()
+
+   Return the re-combined version of the original URL as a string. This may
+   differ from the original URL in that the scheme may be normalized to lower
+   case and empty components may be dropped. Specifically, empty parameters,
+   queries, and fragment identifiers will be removed.
+
+   For :func:`urldefrag` results, only empty fragment identifiers will be removed.
+   For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be
+   made to the URL returned by this method.
+
+   The result of this method remains unchanged if passed back through the original
+   parsing function:
+
+      >>> from urllib.parse import urlsplit
+      >>> url = 'HTTP://www.Python.org/doc/#'
+      >>> r1 = urlsplit(url)
+      >>> r1.geturl()
+      'http://www.Python.org/doc/'
+      >>> r2 = urlsplit(r1.geturl())
+      >>> r2.geturl()
+      'http://www.Python.org/doc/'
+
+
+The following classes provide the implementations of the structured parse
+results when operating on :class:`str` objects:
+
+.. class:: DefragResult(url, fragment)
+
+   Concrete class for :func:`urldefrag` results containing :class:`str`
+   data. The :meth:`encode` method returns a :class:`DefragResultBytes`
+   instance.
+
+   .. versionadded:: 3.2
+
+.. class:: ParseResult(scheme, netloc, path, params, query, fragment)
+
+   Concrete class for :func:`urlparse` results containing :class:`str`
+   data. The :meth:`encode` method returns a :class:`ParseResultBytes`
+   instance.
+
+.. class:: SplitResult(scheme, netloc, path, query, fragment)
+
+   Concrete class for :func:`urlsplit` results containing :class:`str`
+   data. The :meth:`encode` method returns a :class:`SplitResultBytes`
+   instance.
+
+
+The following classes provide the implementations of the parse results when
+operating on :class:`bytes` or :class:`bytearray` objects:
+
+.. 
class:: DefragResultBytes(url, fragment) + + Concrete class for :func:`urldefrag` results containing :class:`bytes` + data. The :meth:`decode` method returns a :class:`DefragResult` + instance. + + .. versionadded:: 3.2 + +.. class:: ParseResultBytes(scheme, netloc, path, params, query, fragment) + + Concrete class for :func:`urlparse` results containing :class:`bytes` + data. The :meth:`decode` method returns a :class:`ParseResult` + instance. + + .. versionadded:: 3.2 + +.. class:: SplitResultBytes(scheme, netloc, path, query, fragment) + + Concrete class for :func:`urlsplit` results containing :class:`bytes` + data. The :meth:`decode` method returns a :class:`SplitResult` + instance. + + .. versionadded:: 3.2 + + +URL Quoting +----------- + +The URL quoting functions focus on taking program data and making it safe +for use as URL components by quoting special characters and appropriately +encoding non-ASCII text. They also support reversing these operations to +recreate the original data from the contents of a URL component if that +task isn't already covered by the URL parsing functions above. .. function:: quote(string, safe='/', encoding=None, errors=None) @@ -322,8 +485,7 @@ The :mod:`urllib.parse` module defines the following functions: If it is a :class:`str`, unescaped non-ASCII characters in *string* are encoded into UTF-8 bytes. - Example: ``unquote_to_bytes('a%26%EF')`` yields - ``b'a&\xef'``. + Example: ``unquote_to_bytes('a%26%EF')`` yields ``b'a&\xef'``. .. function:: urlencode(query, doseq=False, safe='', encoding=None, errors=None) @@ -340,12 +502,13 @@ The :mod:`urllib.parse` module defines the following functions: the optional parameter *doseq* is evaluates to *True*, individual ``key=value`` pairs separated by ``'&'`` are generated for each element of the value sequence for the key. The order of parameters in the encoded - string will match the order of parameter tuples in the sequence. This module - provides the functions :func:`parse_qs` and :func:`parse_qsl` which are used - to parse query strings into Python data structures. + string will match the order of parameter tuples in the sequence. When *query* parameter is a :class:`str`, the *safe*, *encoding* and *error* - parameters are sent the :func:`quote_plus` for encoding. + parameters are passed down to :func:`quote_plus` for encoding. + + To reverse this encoding process, :func:`parse_qs` and :func:`parse_qsl` are + provided in this module to parse query strings into Python data structures. .. versionchanged:: 3.2 Query parameter supports bytes and string objects. @@ -376,57 +539,3 @@ The :mod:`urllib.parse` module defines the following functions: :rfc:`1738` - Uniform Resource Locators (URL) This specifies the formal syntax and semantics of absolute URLs. - - -.. _urlparse-result-object: - -Results of :func:`urlparse` and :func:`urlsplit` ------------------------------------------------- - -The result objects from the :func:`urlparse` and :func:`urlsplit` functions are -subclasses of the :class:`tuple` type. These subclasses add the attributes -described in those functions, as well as provide an additional method: - -.. method:: ParseResult.geturl() - - Return the re-combined version of the original URL as a string. This may differ - from the original URL in that the scheme will always be normalized to lower case - and empty components may be dropped. Specifically, empty parameters, queries, - and fragment identifiers will be removed. 
- - The result of this method is a fixpoint if passed back through the original - parsing function: - - >>> import urllib.parse - >>> url = 'HTTP://www.Python.org/doc/#' - - >>> r1 = urllib.parse.urlsplit(url) - >>> r1.geturl() - 'http://www.Python.org/doc/' - - >>> r2 = urllib.parse.urlsplit(r1.geturl()) - >>> r2.geturl() - 'http://www.Python.org/doc/' - - -The following classes provide the implementations of the parse results: - -.. class:: BaseResult - - Base class for the concrete result classes. This provides most of the - attribute definitions. It does not provide a :meth:`geturl` method. It is - derived from :class:`tuple`, but does not override the :meth:`__init__` or - :meth:`__new__` methods. - - -.. class:: ParseResult(scheme, netloc, path, params, query, fragment) - - Concrete class for :func:`urlparse` results. The :meth:`__new__` method is - overridden to support checking that the right number of arguments are passed. - - -.. class:: SplitResult(scheme, netloc, path, query, fragment) - - Concrete class for :func:`urlsplit` results. The :meth:`__new__` method is - overridden to support checking that the right number of arguments are passed. - diff --git a/Doc/whatsnew/3.2.rst b/Doc/whatsnew/3.2.rst index 2d1f0ef..056e2fa 100644 --- a/Doc/whatsnew/3.2.rst +++ b/Doc/whatsnew/3.2.rst @@ -573,6 +573,14 @@ New, Improved, and Deprecated Modules (Contributed by Rodolpho Eckhardt and Nick Coghlan, :issue:`10220`.) .. XXX: Mention inspect.getattr_static (Michael Foord) +.. XXX: Mention urllib.parse changes + Issue 9873 (Nick Coghlan): + - ASCII byte sequence support in URL parsing + - named tuple for urldefrag return value + Issue 5468 (Dan Mahn) for urlencode: + - bytes input support + - non-UTF8 percent encoding of non-ASCII characters + Issue 2987 for IPv6 (RFC2732) support in urlparse Multi-threading =============== diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index e559142..e67c2b6 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -24,6 +24,17 @@ parse_qsl_test_cases = [ ("&a=b", [('a', 'b')]), ("a=a+b&b=b+c", [('a', 'a b'), ('b', 'b c')]), ("a=1&a=2", [('a', '1'), ('a', '2')]), + (b"", []), + (b"&", []), + (b"&&", []), + (b"=", [(b'', b'')]), + (b"=a", [(b'', b'a')]), + (b"a", [(b'a', b'')]), + (b"a=", [(b'a', b'')]), + (b"a=", [(b'a', b'')]), + (b"&a=b", [(b'a', b'b')]), + (b"a=a+b&b=b+c", [(b'a', b'a b'), (b'b', b'b c')]), + (b"a=1&a=2", [(b'a', b'1'), (b'a', b'2')]), ] class UrlParseTestCase(unittest.TestCase): @@ -86,7 +97,7 @@ class UrlParseTestCase(unittest.TestCase): def test_roundtrips(self): - testcases = [ + str_cases = [ ('file:///tmp/junk.txt', ('file', '', '/tmp/junk.txt', '', '', ''), ('file', '', '/tmp/junk.txt', '', '')), @@ -110,16 +121,21 @@ class UrlParseTestCase(unittest.TestCase): ('git+ssh', 'git@github.com','/user/project.git', '','',''), ('git+ssh', 'git@github.com','/user/project.git', - '', '')) + '', '')), ] - for url, parsed, split in testcases: + def _encode(t): + return (t[0].encode('ascii'), + tuple(x.encode('ascii') for x in t[1]), + tuple(x.encode('ascii') for x in t[2])) + bytes_cases = [_encode(x) for x in str_cases] + for url, parsed, split in str_cases + bytes_cases: self.checkRoundtrips(url, parsed, split) def test_http_roundtrips(self): # urllib.parse.urlsplit treats 'http:' as an optimized special case, # so we test both 'http:' and 'https:' in all the following. # Three cheers for white box knowledge! 
- testcases = [ + str_cases = [ ('://www.python.org', ('www.python.org', '', '', '', ''), ('www.python.org', '', '', '')), @@ -136,19 +152,34 @@ class UrlParseTestCase(unittest.TestCase): ('a', '/b/c/d', 'p', 'q', 'f'), ('a', '/b/c/d;p', 'q', 'f')), ] - for scheme in ('http', 'https'): - for url, parsed, split in testcases: - url = scheme + url - parsed = (scheme,) + parsed - split = (scheme,) + split - self.checkRoundtrips(url, parsed, split) + def _encode(t): + return (t[0].encode('ascii'), + tuple(x.encode('ascii') for x in t[1]), + tuple(x.encode('ascii') for x in t[2])) + bytes_cases = [_encode(x) for x in str_cases] + str_schemes = ('http', 'https') + bytes_schemes = (b'http', b'https') + str_tests = str_schemes, str_cases + bytes_tests = bytes_schemes, bytes_cases + for schemes, test_cases in (str_tests, bytes_tests): + for scheme in schemes: + for url, parsed, split in test_cases: + url = scheme + url + parsed = (scheme,) + parsed + split = (scheme,) + split + self.checkRoundtrips(url, parsed, split) def checkJoin(self, base, relurl, expected): - self.assertEqual(urllib.parse.urljoin(base, relurl), expected, - (base, relurl, expected)) + str_components = (base, relurl, expected) + self.assertEqual(urllib.parse.urljoin(base, relurl), expected) + bytes_components = baseb, relurlb, expectedb = [ + x.encode('ascii') for x in str_components] + self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) def test_unparse_parse(self): - for u in ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',]: + str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',] + bytes_cases = [x.encode('ascii') for x in str_cases] + for u in str_cases + bytes_cases: self.assertEqual(urllib.parse.urlunsplit(urllib.parse.urlsplit(u)), u) self.assertEqual(urllib.parse.urlunparse(urllib.parse.urlparse(u)), u) @@ -328,7 +359,7 @@ class UrlParseTestCase(unittest.TestCase): self.checkJoin(SIMPLE_BASE, 'http:g?y/./x','http://a/b/c/g?y/./x') def test_RFC2732(self): - for url, hostname, port in [ + str_cases = [ ('http://Test.python.org:5432/foo/', 'test.python.org', 5432), ('http://12.34.56.78:5432/foo/', '12.34.56.78', 5432), ('http://[::1]:5432/foo/', '::1', 5432), @@ -349,20 +380,26 @@ class UrlParseTestCase(unittest.TestCase): ('http://[::12.34.56.78]/foo/', '::12.34.56.78', None), ('http://[::ffff:12.34.56.78]/foo/', '::ffff:12.34.56.78', None), - ]: + ] + def _encode(t): + return t[0].encode('ascii'), t[1].encode('ascii'), t[2] + bytes_cases = [_encode(x) for x in str_cases] + for url, hostname, port in str_cases + bytes_cases: urlparsed = urllib.parse.urlparse(url) self.assertEqual((urlparsed.hostname, urlparsed.port) , (hostname, port)) - for invalid_url in [ + str_cases = [ 'http://::12.34.56.78]/', 'http://[::1/foo/', 'ftp://[::1/foo/bad]/bad', 'http://[::1/foo/bad]/bad', - 'http://[::ffff:12.34.56.78']: + 'http://[::ffff:12.34.56.78'] + bytes_cases = [x.encode('ascii') for x in str_cases] + for invalid_url in str_cases + bytes_cases: self.assertRaises(ValueError, urllib.parse.urlparse, invalid_url) def test_urldefrag(self): - for url, defrag, frag in [ + str_cases = [ ('http://python.org#frag', 'http://python.org', 'frag'), ('http://python.org', 'http://python.org', ''), ('http://python.org/#frag', 'http://python.org/', 'frag'), @@ -373,8 +410,16 @@ class UrlParseTestCase(unittest.TestCase): ('http://python.org/p?q', 'http://python.org/p?q', ''), (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'), (RFC2396_BASE, 'http://a/b/c/d;p?q', ''), - ]: - 
self.assertEqual(urllib.parse.urldefrag(url), (defrag, frag)) + ] + def _encode(t): + return type(t)(x.encode('ascii') for x in t) + bytes_cases = [_encode(x) for x in str_cases] + for url, defrag, frag in str_cases + bytes_cases: + result = urllib.parse.urldefrag(url) + self.assertEqual(result.geturl(), url) + self.assertEqual(result, (defrag, frag)) + self.assertEqual(result.url, defrag) + self.assertEqual(result.fragment, frag) def test_urlsplit_attributes(self): url = "HTTP://WWW.PYTHON.ORG/doc/#frag" @@ -390,7 +435,8 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(p.port, None) # geturl() won't return exactly the original URL in this case # since the scheme is always case-normalized - #self.assertEqual(p.geturl(), url) + # We handle this by ignoring the first 4 characters of the URL + self.assertEqual(p.geturl()[4:], url[4:]) url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" p = urllib.parse.urlsplit(url) @@ -422,6 +468,45 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(p.port, 80) self.assertEqual(p.geturl(), url) + # And check them all again, only with bytes this time + url = b"HTTP://WWW.PYTHON.ORG/doc/#frag" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"WWW.PYTHON.ORG") + self.assertEqual(p.path, b"/doc/") + self.assertEqual(p.query, b"") + self.assertEqual(p.fragment, b"frag") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, None) + self.assertEqual(p.geturl()[4:], url[4:]) + + url = b"http://User:Pass@www.python.org:080/doc/?query=yes#frag" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"User:Pass@www.python.org:080") + self.assertEqual(p.path, b"/doc/") + self.assertEqual(p.query, b"query=yes") + self.assertEqual(p.fragment, b"frag") + self.assertEqual(p.username, b"User") + self.assertEqual(p.password, b"Pass") + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), url) + + url = b"http://User@example.com:Pass@www.python.org:080/doc/?query=yes#frag" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"User@example.com:Pass@www.python.org:080") + self.assertEqual(p.path, b"/doc/") + self.assertEqual(p.query, b"query=yes") + self.assertEqual(p.fragment, b"frag") + self.assertEqual(p.username, b"User@example.com") + self.assertEqual(p.password, b"Pass") + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), url) def test_attributes_bad_port(self): """Check handling of non-integer ports.""" @@ -433,6 +518,15 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(p.netloc, "www.example.net:foo") self.assertRaises(ValueError, lambda: p.port) + # Once again, repeat ourselves to test bytes + p = urllib.parse.urlsplit(b"http://www.example.net:foo") + self.assertEqual(p.netloc, b"www.example.net:foo") + self.assertRaises(ValueError, lambda: p.port) + + p = urllib.parse.urlparse(b"http://www.example.net:foo") + self.assertEqual(p.netloc, b"www.example.net:foo") + self.assertRaises(ValueError, lambda: p.port) + def test_attributes_without_netloc(self): # This example is straight from RFC 3261. 
It looks like it # should allow the username, hostname, and port to be filled @@ -456,10 +550,30 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) + # You guessed it, repeating the test with bytes input + uri = b"sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15" + p = urllib.parse.urlsplit(uri) + self.assertEqual(p.netloc, b"") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, None) + self.assertEqual(p.port, None) + self.assertEqual(p.geturl(), uri) + + p = urllib.parse.urlparse(uri) + self.assertEqual(p.netloc, b"") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, None) + self.assertEqual(p.port, None) + self.assertEqual(p.geturl(), uri) + def test_noslash(self): # Issue 1637: http://foo.com?query is legal self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"), ('http', 'example.com', '', '', 'blahblah=/foo', '')) + self.assertEqual(urllib.parse.urlparse(b"http://example.com?blahblah=/foo"), + (b'http', b'example.com', b'', b'', b'blahblah=/foo', b'')) def test_withoutscheme(self): # Test urlparse without scheme @@ -472,6 +586,13 @@ class UrlParseTestCase(unittest.TestCase): ('','www.python.org:80','','','','')) self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"), ('http','www.python.org:80','','','','')) + # Repeat for bytes input + self.assertEqual(urllib.parse.urlparse(b"path"), + (b'',b'',b'path',b'',b'',b'')) + self.assertEqual(urllib.parse.urlparse(b"//www.python.org:80"), + (b'',b'www.python.org:80',b'',b'',b'',b'')) + self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"), + (b'http',b'www.python.org:80',b'',b'',b'',b'')) def test_portseparator(self): # Issue 754016 makes changes for port separator ':' from scheme separator @@ -481,6 +602,13 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','','')) self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"), ('http','www.python.org:80','','','','')) + # As usual, need to check bytes input as well + self.assertEqual(urllib.parse.urlparse(b"path:80"), + (b'',b'',b'path:80',b'',b'',b'')) + self.assertEqual(urllib.parse.urlparse(b"http:"),(b'http',b'',b'',b'',b'',b'')) + self.assertEqual(urllib.parse.urlparse(b"https:"),(b'https',b'',b'',b'',b'',b'')) + self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"), + (b'http',b'www.python.org:80',b'',b'',b'',b'')) def test_usingsys(self): # Issue 3314: sys module is used in the error @@ -492,6 +620,71 @@ class UrlParseTestCase(unittest.TestCase): ('s3', 'foo.com', '/stuff', '', '', '')) self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff"), ('x-newscheme', 'foo.com', '/stuff', '', '', '')) + # And for bytes... + self.assertEqual(urllib.parse.urlparse(b"s3://foo.com/stuff"), + (b's3', b'foo.com', b'/stuff', b'', b'', b'')) + self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff"), + (b'x-newscheme', b'foo.com', b'/stuff', b'', b'', b'')) + + def test_mixed_types_rejected(self): + # Several functions that process either strings or ASCII encoded bytes + # accept multiple arguments. 
Check they reject mixed type input + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urlparse("www.python.org", b"http") + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urlparse(b"www.python.org", "http") + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urlsplit("www.python.org", b"http") + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urlsplit(b"www.python.org", "http") + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urlunparse(( b"http", "www.python.org","","","","")) + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urlunparse(("http", b"www.python.org","","","","")) + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urlunsplit((b"http", "www.python.org","","","")) + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urlunsplit(("http", b"www.python.org","","","")) + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urljoin("http://python.org", b"http://python.org") + with self.assertRaisesRegexp(TypeError, "Cannot mix str"): + urllib.parse.urljoin(b"http://python.org", "http://python.org") + + def _check_result_type(self, str_type): + num_args = len(str_type._fields) + bytes_type = str_type._encoded_counterpart + self.assertIs(bytes_type._decoded_counterpart, str_type) + str_args = ('',) * num_args + bytes_args = (b'',) * num_args + str_result = str_type(*str_args) + bytes_result = bytes_type(*bytes_args) + encoding = 'ascii' + errors = 'strict' + self.assertEqual(str_result, str_args) + self.assertEqual(bytes_result.decode(), str_args) + self.assertEqual(bytes_result.decode(), str_result) + self.assertEqual(bytes_result.decode(encoding), str_args) + self.assertEqual(bytes_result.decode(encoding), str_result) + self.assertEqual(bytes_result.decode(encoding, errors), str_args) + self.assertEqual(bytes_result.decode(encoding, errors), str_result) + self.assertEqual(bytes_result, bytes_args) + self.assertEqual(str_result.encode(), bytes_args) + self.assertEqual(str_result.encode(), bytes_result) + self.assertEqual(str_result.encode(encoding), bytes_args) + self.assertEqual(str_result.encode(encoding), bytes_result) + self.assertEqual(str_result.encode(encoding, errors), bytes_args) + self.assertEqual(str_result.encode(encoding, errors), bytes_result) + + def test_result_pairs(self): + # Check encoding and decoding between result pairs + result_types = [ + urllib.parse.DefragResult, + urllib.parse.SplitResult, + urllib.parse.ParseResult, + ] + for result_type in result_types: + self._check_result_type(result_type) + def test_main(): support.run_unittest(UrlParseTestCase) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 78f3084..ab5b356 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -60,6 +60,7 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz' '0123456789' '+-.') +# XXX: Consider replacing with functools.lru_cache MAX_CACHE_SIZE = 20 _parse_cache = {} @@ -69,66 +70,210 @@ def clear_cache(): _safe_quoters.clear() -class ResultMixin(object): - """Shared methods for the parsed result objects.""" +# Helpers for bytes handling +# For 3.2, we deliberately require applications that +# handle improperly quoted URLs to do their own +# decoding and encoding. 
If valid use cases are +# presented, we may relax this by using latin-1 +# decoding internally for 3.3 +_implicit_encoding = 'ascii' +_implicit_errors = 'strict' + +def _noop(obj): + return obj + +def _encode_result(obj, encoding=_implicit_encoding, + errors=_implicit_errors): + return obj.encode(encoding, errors) + +def _decode_args(args, encoding=_implicit_encoding, + errors=_implicit_errors): + return tuple(x.decode(encoding, errors) if x else '' for x in args) + +def _coerce_args(*args): + # Invokes decode if necessary to create str args + # and returns the coerced inputs along with + # an appropriate result coercion function + # - noop for str inputs + # - encoding function otherwise + str_input = isinstance(args[0], str) + for arg in args[1:]: + # We special-case the empty string to support the + # "scheme=''" default argument to some functions + if arg and isinstance(arg, str) != str_input: + raise TypeError("Cannot mix str and non-str arguments") + if str_input: + return args + (_noop,) + return _decode_args(args) + (_encode_result,) + +# Result objects are more helpful than simple tuples +class _ResultMixinStr(object): + """Standard approach to encoding parsed results from str to bytes""" + __slots__ = () + + def encode(self, encoding='ascii', errors='strict'): + return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self)) + + +class _ResultMixinBytes(object): + """Standard approach to decoding parsed results from bytes to str""" + __slots__ = () + + def decode(self, encoding='ascii', errors='strict'): + return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self)) + + +class _NetlocResultMixinBase(object): + """Shared methods for the parsed result objects containing a netloc element""" + __slots__ = () @property def username(self): - netloc = self.netloc - if "@" in netloc: - userinfo = netloc.rsplit("@", 1)[0] - if ":" in userinfo: - userinfo = userinfo.split(":", 1)[0] - return userinfo - return None + return self._userinfo[0] @property def password(self): - netloc = self.netloc - if "@" in netloc: - userinfo = netloc.rsplit("@", 1)[0] - if ":" in userinfo: - return userinfo.split(":", 1)[1] - return None + return self._userinfo[1] @property def hostname(self): - netloc = self.netloc.split('@')[-1] - if '[' in netloc and ']' in netloc: - return netloc.split(']')[0][1:].lower() - elif ':' in netloc: - return netloc.split(':')[0].lower() - elif netloc == '': - return None - else: - return netloc.lower() + hostname = self._hostinfo[0] + if not hostname: + hostname = None + elif hostname is not None: + hostname = hostname.lower() + return hostname @property def port(self): - netloc = self.netloc.split('@')[-1].split(']')[-1] - if ':' in netloc: - port = netloc.split(':')[1] - return int(port, 10) + port = self._hostinfo[1] + if port is not None: + port = int(port, 10) + return port + + +class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr): + __slots__ = () + + @property + def _userinfo(self): + netloc = self.netloc + userinfo, have_info, hostinfo = netloc.rpartition('@') + if have_info: + username, have_password, password = userinfo.partition(':') + if not have_password: + password = None else: - return None + username = password = None + return username, password + + @property + def _hostinfo(self): + netloc = self.netloc + _, _, hostinfo = netloc.rpartition('@') + _, have_open_br, bracketed = hostinfo.partition('[') + if have_open_br: + hostname, _, port = bracketed.partition(']') + _, have_port, port = port.partition(':') + else: 
+ hostname, have_port, port = hostinfo.partition(':') + if not have_port: + port = None + return hostname, port + + +class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): + __slots__ = () + + @property + def _userinfo(self): + netloc = self.netloc + userinfo, have_info, hostinfo = netloc.rpartition(b'@') + if have_info: + username, have_password, password = userinfo.partition(b':') + if not have_password: + password = None + else: + username = password = None + return username, password + + @property + def _hostinfo(self): + netloc = self.netloc + _, _, hostinfo = netloc.rpartition(b'@') + _, have_open_br, bracketed = hostinfo.partition(b'[') + if have_open_br: + hostname, _, port = bracketed.partition(b']') + _, have_port, port = port.partition(b':') + else: + hostname, have_port, port = hostinfo.partition(b':') + if not have_port: + port = None + return hostname, port + from collections import namedtuple -class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin): +_DefragResultBase = namedtuple('DefragResult', 'url fragment') +_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment') +_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment') + +# For backwards compatibility, alias _NetlocResultMixinStr +# ResultBase is no longer part of the documented API, but it is +# retained since deprecating it isn't worth the hassle +ResultBase = _NetlocResultMixinStr +# Structured result objects for string data +class DefragResult(_DefragResultBase, _ResultMixinStr): __slots__ = () + def geturl(self): + if self.fragment: + return self.url + '#' + self.fragment + else: + return self.url +class SplitResult(_SplitResultBase, _NetlocResultMixinStr): + __slots__ = () def geturl(self): return urlunsplit(self) +class ParseResult(_ParseResultBase, _NetlocResultMixinStr): + __slots__ = () + def geturl(self): + return urlunparse(self) -class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin): +# Structured result objects for bytes data +class DefragResultBytes(_DefragResultBase, _ResultMixinBytes): + __slots__ = () + def geturl(self): + if self.fragment: + return self.url + b'#' + self.fragment + else: + return self.url +class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): __slots__ = () + def geturl(self): + return urlunsplit(self) +class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): + __slots__ = () def geturl(self): return urlunparse(self) +# Set up the encode/decode result pairs +def _fix_result_transcoding(): + _result_pairs = ( + (DefragResult, DefragResultBytes), + (SplitResult, SplitResultBytes), + (ParseResult, ParseResultBytes), + ) + for _decoded, _encoded in _result_pairs: + _decoded._encoded_counterpart = _encoded + _encoded._decoded_counterpart = _decoded + +_fix_result_transcoding() +del _fix_result_transcoding def urlparse(url, scheme='', allow_fragments=True): """Parse a URL into 6 components: @@ -136,13 +281,15 @@ def urlparse(url, scheme='', allow_fragments=True): Return a 6-tuple: (scheme, netloc, path, params, query, fragment). Note that we don't break the components up in smaller bits (e.g. 
netloc is a single string) and we don't expand % escapes.""" + url, scheme, _coerce_result = _coerce_args(url, scheme) tuple = urlsplit(url, scheme, allow_fragments) scheme, netloc, url, query, fragment = tuple if scheme in uses_params and ';' in url: url, params = _splitparams(url) else: params = '' - return ParseResult(scheme, netloc, url, params, query, fragment) + result = ParseResult(scheme, netloc, url, params, query, fragment) + return _coerce_result(result) def _splitparams(url): if '/' in url: @@ -167,11 +314,12 @@ def urlsplit(url, scheme='', allow_fragments=True): Return a 5-tuple: (scheme, netloc, path, query, fragment). Note that we don't break the components up in smaller bits (e.g. netloc is a single string) and we don't expand % escapes.""" + url, scheme, _coerce_result = _coerce_args(url, scheme) allow_fragments = bool(allow_fragments) key = url, scheme, allow_fragments, type(url), type(scheme) cached = _parse_cache.get(key, None) if cached: - return cached + return _coerce_result(cached) if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth clear_cache() netloc = query = fragment = '' @@ -191,7 +339,7 @@ def urlsplit(url, scheme='', allow_fragments=True): url, query = url.split('?', 1) v = SplitResult(scheme, netloc, url, query, fragment) _parse_cache[key] = v - return v + return _coerce_result(v) if url.endswith(':') or not url[i+1].isdigit(): for c in url[:i]: if c not in scheme_chars: @@ -209,17 +357,18 @@ def urlsplit(url, scheme='', allow_fragments=True): url, query = url.split('?', 1) v = SplitResult(scheme, netloc, url, query, fragment) _parse_cache[key] = v - return v + return _coerce_result(v) def urlunparse(components): """Put a parsed URL back together again. This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had redundant delimiters, e.g. a ? with an empty query (the draft states that these are equivalent).""" - scheme, netloc, url, params, query, fragment = components + scheme, netloc, url, params, query, fragment, _coerce_result = ( + _coerce_args(*components)) if params: url = "%s;%s" % (url, params) - return urlunsplit((scheme, netloc, url, query, fragment)) + return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment))) def urlunsplit(components): """Combine the elements of a tuple as returned by urlsplit() into a @@ -227,7 +376,8 @@ def urlunsplit(components): This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had unnecessary delimiters (for example, a ? with an empty query; the RFC states that these are equivalent).""" - scheme, netloc, url, query, fragment = components + scheme, netloc, url, query, fragment, _coerce_result = ( + _coerce_args(*components)) if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): if url and url[:1] != '/': url = '/' + url url = '//' + (netloc or '') + url @@ -237,7 +387,7 @@ def urlunsplit(components): url = url + '?' 
+ query
     if fragment:
         url = url + '#' + fragment
-    return url
+    return _coerce_result(url)
 
 def urljoin(base, url, allow_fragments=True):
     """Join a base URL and a possibly relative URL to form an absolute
@@ -246,32 +396,33 @@
         return url
     if not url:
         return base
+    base, url, _coerce_result = _coerce_args(base, url)
     bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
             urlparse(base, '', allow_fragments)
     scheme, netloc, path, params, query, fragment = \
             urlparse(url, bscheme, allow_fragments)
     if scheme != bscheme or scheme not in uses_relative:
-        return url
+        return _coerce_result(url)
     if scheme in uses_netloc:
         if netloc:
-            return urlunparse((scheme, netloc, path,
-                               params, query, fragment))
+            return _coerce_result(urlunparse((scheme, netloc, path,
+                                              params, query, fragment)))
         netloc = bnetloc
     if path[:1] == '/':
-        return urlunparse((scheme, netloc, path,
-                           params, query, fragment))
+        return _coerce_result(urlunparse((scheme, netloc, path,
+                                          params, query, fragment)))
     if not path:
         path = bpath
         if not params:
             params = bparams
         else:
             path = path[:-1]
-            return urlunparse((scheme, netloc, path,
-                               params, query, fragment))
+            return _coerce_result(urlunparse((scheme, netloc, path,
+                                              params, query, fragment)))
         if not query:
             query = bquery
-        return urlunparse((scheme, netloc, path,
-                           params, query, fragment))
+        return _coerce_result(urlunparse((scheme, netloc, path,
+                                          params, query, fragment)))
     segments = bpath.split('/')[:-1] + path.split('/')
     # XXX The stuff below is bogus in various ways...
     if segments[-1] == '.':
@@ -293,8 +444,8 @@
         segments[-1] = ''
     elif len(segments) >= 2 and segments[-1] == '..':
         segments[-2:] = ['']
-    return urlunparse((scheme, netloc, '/'.join(segments),
-                       params, query, fragment))
+    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
+                                      params, query, fragment)))
 
 def urldefrag(url):
     """Removes any existing fragment from URL.
@@ -303,12 +454,14 @@
     the URL contained no fragments, the second element is the
     empty string.
     """
+    url, _coerce_result = _coerce_args(url)
     if '#' in url:
         s, n, p, a, q, frag = urlparse(url)
         defrag = urlunparse((s, n, p, a, q, ''))
-        return defrag, frag
     else:
-        return url, ''
+        frag = ''
+        defrag = url
+    return _coerce_result(DefragResult(defrag, frag))
 
 def unquote_to_bytes(string):
     """unquote_to_bytes('abc%20def') -> b'abc def'."""
@@ -420,6 +573,7 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False):
 
     Returns a list, as G-d intended.
     """
+    qs, _coerce_result = _coerce_args(qs)
     pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
     r = []
     for name_value in pairs:
@@ -435,10 +589,9 @@
         else:
             continue
         if len(nv[1]) or keep_blank_values:
-            name = unquote(nv[0].replace('+', ' '))
-            value = unquote(nv[1].replace('+', ' '))
+            name = _coerce_result(unquote(nv[0].replace('+', ' ')))
+            value = _coerce_result(unquote(nv[1].replace('+', ' ')))
             r.append((name, value))
-
     return r
 
 def unquote_plus(string, encoding='utf-8', errors='replace'):
diff --git a/Misc/NEWS b/Misc/NEWS
index 2eb41df..470fcff 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -43,6 +43,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #9873: The URL parsing functions in urllib.parse now accept
+  ASCII byte sequences as input in addition to character strings.
+ - Issue #10586: The statistics API for the new functools.lru_cache has been changed to a single cache_info() method returning a named tuple. -- cgit v0.12
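
As a usage sketch of the behaviour this patch adds (illustrative only, not
part of the commit; 'latin-1' is just one possible application-level choice
for pre-decoding improperly quoted URLs, along the lines the comment in
parse.py above suggests as a possible 3.3 relaxation):

    from urllib.parse import urlsplit, urldefrag

    # ASCII bytes parse directly; the result contains bytes data.
    parts = urlsplit(b'http://example.com/doc/#frag')
    assert parts.path == b'/doc/'
    # decode() produces the equivalent str-based result object.
    assert parts.decode().path == '/doc/'

    # Non-ASCII bytes are rejected with UnicodeDecodeError, so the
    # application must pick an encoding and decode to str itself
    # before parsing.
    raw = b'http://example.com/caf\xe9'
    try:
        urlsplit(raw)
    except UnicodeDecodeError:
        parts = urlsplit(raw.decode('latin-1'))
    assert parts.path == '/caf\xe9'

    # urldefrag now returns a structured result object as well.
    result = urldefrag('http://example.com/doc/#frag')
    assert (result.url, result.fragment) == ('http://example.com/doc/', 'frag')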