- Issue #3300: make urllib.parse.[un]quote() default to UTF-8.

Code contributed by Matt Giuca. quote() now encodes the input before quoting, unquote() decodes after unquoting. There are new arguments to change the encoding and errors settings. There are also new APIs to skip the encode/decode steps. [un]quote_plus() are also affected.
author: Guido van Rossum <guido@python.org> 2008-08-18 21:44:30 (GMT)
committer: Guido van Rossum <guido@python.org> 2008-08-18 21:44:30 (GMT)
commit: 52dbbb906804f36067ecbc8c89a00cdab545bdb2 (patch)
tree: 1b923b821dc0547f6fa3e30401c7dac177a8f557 /Lib/test
parent: 4171da5c9d899dc64cb15f177f05b9de05563148 (diff)
download: cpython-52dbbb906804f36067ecbc8c89a00cdab545bdb2.zip
cpython-52dbbb906804f36067ecbc8c89a00cdab545bdb2.tar.gz
cpython-52dbbb906804f36067ecbc8c89a00cdab545bdb2.tar.bz2
4 files changed, 235 insertions, 23 deletions
diff --git a/Lib/test/test_cgi.py b/Lib/test/test_cgi.py
index 9381a3c..cc11acc 100644
--- a/Lib/test/test_cgi.py
+++ b/Lib/test/test_cgi.py
@@ -68,6 +68,8 @@ parse_qsl_test_cases = [
     ("&a=b", [('a', 'b')]),
     ("a=a+b&b=b+c", [('a', 'a b'), ('b', 'b c')]),
     ("a=1&a=2", [('a', '1'), ('a', '2')]),
+    ("a=%26&b=%3D", [('a', '&'), ('b', '=')]),
+    ("a=%C3%BC&b=%CA%83", [('a', '\xfc'), ('b', '\u0283')]),
 ]
 
 parse_strict_test_cases = [
diff --git a/Lib/test/test_http_cookiejar.py b/Lib/test/test_http_cookiejar.py
index 1627923..a97c6fa 100644
--- a/Lib/test/test_http_cookiejar.py
+++ b/Lib/test/test_http_cookiejar.py
@@ -539,6 +539,8 @@ class CookieTests(TestCase):
             # unquoted unsafe
             ("/foo\031/bar", "/foo%19/bar"),
             ("/\175foo/bar", "/%7Dfoo/bar"),
+            # unicode, latin-1 range
+            ("/foo/bar\u00fc", "/foo/bar%C3%BC"),     # UTF-8 encoded
             # unicode
             ("/foo/bar\uabcd", "/foo/bar%EA%AF%8D"),  # UTF-8 encoded
             ]
@@ -1444,7 +1446,8 @@ class LWPCookieTests(TestCase):
         # Try some URL encodings of the PATHs.
         # (the behaviour here has changed from libwww-perl)
         c = CookieJar(DefaultCookiePolicy(rfc2965=True))
-        interact_2965(c, "http://www.acme.com/foo%2f%25/%3c%3c%0Anew%E5/%E5",
+        interact_2965(c, "http://www.acme.com/foo%2f%25/"
+                         "%3c%3c%0Anew%C3%A5/%C3%A5",
                       "foo  =   bar; version    =   1")
 
         cookie = interact_2965(
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index f5a9d5d..a8a8111 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -336,10 +336,10 @@ class QuotingTests(unittest.TestCase):
                                  "_.-"])
         result = urllib.parse.quote(do_not_quote)
         self.assertEqual(do_not_quote, result,
-                         "using quote(): %s != %s" % (do_not_quote, result))
+                         "using quote(): %r != %r" % (do_not_quote, result))
         result = urllib.parse.quote_plus(do_not_quote)
         self.assertEqual(do_not_quote, result,
-                        "using quote_plus(): %s != %s" % (do_not_quote, result))
+                        "using quote_plus(): %r != %r" % (do_not_quote, result))
 
     def test_default_safe(self):
         # Test '/' is default value for 'safe' parameter
@@ -350,11 +350,28 @@ class QuotingTests(unittest.TestCase):
         quote_by_default = "<>"
         result = urllib.parse.quote(quote_by_default, safe=quote_by_default)
         self.assertEqual(quote_by_default, result,
-                         "using quote(): %s != %s" % (quote_by_default, result))
+                         "using quote(): %r != %r" % (quote_by_default, result))
         result = urllib.parse.quote_plus(quote_by_default, safe=quote_by_default)
         self.assertEqual(quote_by_default, result,
-                         "using quote_plus(): %s != %s" %
+                         "using quote_plus(): %r != %r" %
                          (quote_by_default, result))
+        # Safe expressed as bytes rather than str
+        result = urllib.parse.quote(quote_by_default, safe=b"<>")
+        self.assertEqual(quote_by_default, result,
+                         "using quote(): %r != %r" % (quote_by_default, result))
+        # "Safe" non-ASCII characters should have no effect
+        # (Since URIs are not allowed to have non-ASCII characters)
+        result = urllib.parse.quote("a\xfcb", encoding="latin-1", safe="\xfc")
+        expect = urllib.parse.quote("a\xfcb", encoding="latin-1", safe="")
+        self.assertEqual(expect, result,
+                         "using quote(): %r != %r" %
+                         (expect, result))
+        # Same as above, but using a bytes rather than str
+        result = urllib.parse.quote("a\xfcb", encoding="latin-1", safe=b"\xfc")
+        expect = urllib.parse.quote("a\xfcb", encoding="latin-1", safe="")
+        self.assertEqual(expect, result,
+                         "using quote(): %r != %r" %
+                         (expect, result))
 
     def test_default_quoting(self):
         # Make sure all characters that should be quoted are by default sans
@@ -378,34 +395,98 @@ class QuotingTests(unittest.TestCase):
         expected = "ab%5B%5Dcd"
         result = urllib.parse.quote(partial_quote)
         self.assertEqual(expected, result,
-                         "using quote(): %s != %s" % (expected, result))
+                         "using quote(): %r != %r" % (expected, result))
         self.assertEqual(expected, result,
-                         "using quote_plus(): %s != %s" % (expected, result))
+                         "using quote_plus(): %r != %r" % (expected, result))
 
     def test_quoting_space(self):
         # Make sure quote() and quote_plus() handle spaces as specified in
         # their unique way
         result = urllib.parse.quote(' ')
         self.assertEqual(result, hexescape(' '),
-                         "using quote(): %s != %s" % (result, hexescape(' ')))
+                         "using quote(): %r != %r" % (result, hexescape(' ')))
         result = urllib.parse.quote_plus(' ')
         self.assertEqual(result, '+',
-                         "using quote_plus(): %s != +" % result)
+                         "using quote_plus(): %r != +" % result)
         given = "a b cd e f"
         expect = given.replace(' ', hexescape(' '))
         result = urllib.parse.quote(given)
         self.assertEqual(expect, result,
-                         "using quote(): %s != %s" % (expect, result))
+                         "using quote(): %r != %r" % (expect, result))
         expect = given.replace(' ', '+')
         result = urllib.parse.quote_plus(given)
         self.assertEqual(expect, result,
-                         "using quote_plus(): %s != %s" % (expect, result))
+                         "using quote_plus(): %r != %r" % (expect, result))
 
     def test_quoting_plus(self):
         self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma'),
                          'alpha%2Bbeta+gamma')
         self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', '+'),
                          'alpha+beta+gamma')
+        # Test with bytes
+        self.assertEqual(urllib.parse.quote_plus(b'alpha+beta gamma'),
+                         'alpha%2Bbeta+gamma')
+        # Test with safe bytes
+        self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', b'+'),
+                         'alpha+beta+gamma')
+
+    def test_quote_bytes(self):
+        # Bytes should quote directly to percent-encoded values
+        given = b"\xa2\xd8ab\xff"
+        expect = "%A2%D8ab%FF"
+        result = urllib.parse.quote(given)
+        self.assertEqual(expect, result,
+                         "using quote(): %r != %r" % (expect, result))
+        # Encoding argument should raise type error on bytes input
+        self.assertRaises(TypeError, urllib.parse.quote, given,
+                            encoding="latin-1")
+        # quote_from_bytes should work the same
+        result = urllib.parse.quote_from_bytes(given)
+        self.assertEqual(expect, result,
+                         "using quote_from_bytes(): %r != %r"
+                         % (expect, result))
+
+    def test_quote_with_unicode(self):
+        # Characters in Latin-1 range, encoded by default in UTF-8
+        given = "\xa2\xd8ab\xff"
+        expect = "%C2%A2%C3%98ab%C3%BF"
+        result = urllib.parse.quote(given)
+        self.assertEqual(expect, result,
+                         "using quote(): %r != %r" % (expect, result))
+        # Characters in Latin-1 range, encoded by with None (default)
+        result = urllib.parse.quote(given, encoding=None, errors=None)
+        self.assertEqual(expect, result,
+                         "using quote(): %r != %r" % (expect, result))
+        # Characters in Latin-1 range, encoded with Latin-1
+        given = "\xa2\xd8ab\xff"
+        expect = "%A2%D8ab%FF"
+        result = urllib.parse.quote(given, encoding="latin-1")
+        self.assertEqual(expect, result,
+                         "using quote(): %r != %r" % (expect, result))
+        # Characters in BMP, encoded by default in UTF-8
+        given = "\u6f22\u5b57"              # "Kanji"
+        expect = "%E6%BC%A2%E5%AD%97"
+        result = urllib.parse.quote(given)
+        self.assertEqual(expect, result,
+                         "using quote(): %r != %r" % (expect, result))
+        # Characters in BMP, encoded with Latin-1
+        given = "\u6f22\u5b57"
+        self.assertRaises(UnicodeEncodeError, urllib.parse.quote, given,
+                                    encoding="latin-1")
+        # Characters in BMP, encoded with Latin-1, with replace error handling
+        given = "\u6f22\u5b57"
+        expect = "%3F%3F"                   # "??"
+        result = urllib.parse.quote(given, encoding="latin-1",
+                                    errors="replace")
+        self.assertEqual(expect, result,
+                         "using quote(): %r != %r" % (expect, result))
+        # Characters in BMP, Latin-1, with xmlcharref error handling
+        given = "\u6f22\u5b57"
+        expect = "%26%2328450%3B%26%2323383%3B"     # "&#28450;&#23383;"
+        result = urllib.parse.quote(given, encoding="latin-1",
+                                    errors="xmlcharrefreplace")
+        self.assertEqual(expect, result,
+                         "using quote(): %r != %r" % (expect, result))
 
 class UnquotingTests(unittest.TestCase):
     """Tests for unquote() and unquote_plus()
@@ -422,23 +503,62 @@ class UnquotingTests(unittest.TestCase):
             expect = chr(num)
             result = urllib.parse.unquote(given)
             self.assertEqual(expect, result,
-                             "using unquote(): %s != %s" % (expect, result))
+                             "using unquote(): %r != %r" % (expect, result))
             result = urllib.parse.unquote_plus(given)
             self.assertEqual(expect, result,
-                             "using unquote_plus(): %s != %s" %
+                             "using unquote_plus(): %r != %r" %
                              (expect, result))
             escape_list.append(given)
         escape_string = ''.join(escape_list)
         del escape_list
         result = urllib.parse.unquote(escape_string)
         self.assertEqual(result.count('%'), 1,
-                         "using quote(): not all characters escaped; %s" %
-                         result)
-        result = urllib.parse.unquote(escape_string)
-        self.assertEqual(result.count('%'), 1,
                          "using unquote(): not all characters escaped: "
                          "%s" % result)
 
+    def test_unquoting_badpercent(self):
+        # Test unquoting on bad percent-escapes
+        given = '%xab'
+        expect = given
+        result = urllib.parse.unquote(given)
+        self.assertEqual(expect, result, "using unquote(): %r != %r"
+                         % (expect, result))
+        given = '%x'
+        expect = given
+        result = urllib.parse.unquote(given)
+        self.assertEqual(expect, result, "using unquote(): %r != %r"
+                         % (expect, result))
+        given = '%'
+        expect = given
+        result = urllib.parse.unquote(given)
+        self.assertEqual(expect, result, "using unquote(): %r != %r"
+                         % (expect, result))
+        # unquote_to_bytes
+        given = '%xab'
+        expect = bytes(given, 'ascii')
+        result = urllib.parse.unquote_to_bytes(given)
+        self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r"
+                         % (expect, result))
+        given = '%x'
+        expect = bytes(given, 'ascii')
+        result = urllib.parse.unquote_to_bytes(given)
+        self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r"
+                         % (expect, result))
+        given = '%'
+        expect = bytes(given, 'ascii')
+        result = urllib.parse.unquote_to_bytes(given)
+        self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r"
+                         % (expect, result))
+
+    def test_unquoting_mixed_case(self):
+        # Test unquoting on mixed-case hex digits in the percent-escapes
+        given = '%Ab%eA'
+        expect = b'\xab\xea'
+        result = urllib.parse.unquote_to_bytes(given)
+        self.assertEqual(expect, result,
+                         "using unquote_to_bytes(): %r != %r"
+                         % (expect, result))
+
     def test_unquoting_parts(self):
         # Make sure unquoting works when have non-quoted characters
         # interspersed
@@ -446,10 +566,10 @@ class UnquotingTests(unittest.TestCase):
         expect = "abcd"
         result = urllib.parse.unquote(given)
         self.assertEqual(expect, result,
-                         "using quote(): %s != %s" % (expect, result))
+                         "using quote(): %r != %r" % (expect, result))
         result = urllib.parse.unquote_plus(given)
         self.assertEqual(expect, result,
-                         "using unquote_plus(): %s != %s" % (expect, result))
+                         "using unquote_plus(): %r != %r" % (expect, result))
 
     def test_unquoting_plus(self):
         # Test difference between unquote() and unquote_plus()
@@ -457,15 +577,100 @@ class UnquotingTests(unittest.TestCase):
         expect = given
         result = urllib.parse.unquote(given)
         self.assertEqual(expect, result,
-                         "using unquote(): %s != %s" % (expect, result))
+                         "using unquote(): %r != %r" % (expect, result))
         expect = given.replace('+', ' ')
         result = urllib.parse.unquote_plus(given)
         self.assertEqual(expect, result,
-                         "using unquote_plus(): %s != %s" % (expect, result))
+                         "using unquote_plus(): %r != %r" % (expect, result))
+
+    def test_unquote_to_bytes(self):
+        given = 'br%C3%BCckner_sapporo_20050930.doc'
+        expect = b'br\xc3\xbcckner_sapporo_20050930.doc'
+        result = urllib.parse.unquote_to_bytes(given)
+        self.assertEqual(expect, result,
+                         "using unquote_to_bytes(): %r != %r"
+                         % (expect, result))
+        # Test on a string with unescaped non-ASCII characters
+        # (Technically an invalid URI; expect those characters to be UTF-8
+        # encoded).
+        result = urllib.parse.unquote_to_bytes("\u6f22%C3%BC")
+        expect = b'\xe6\xbc\xa2\xc3\xbc'    # UTF-8 for "\u6f22\u00fc"
+        self.assertEqual(expect, result,
+                         "using unquote_to_bytes(): %r != %r"
+                         % (expect, result))
+        # Test with a bytes as input
+        given = b'%A2%D8ab%FF'
+        expect = b'\xa2\xd8ab\xff'
+        result = urllib.parse.unquote_to_bytes(given)
+        self.assertEqual(expect, result,
+                         "using unquote_to_bytes(): %r != %r"
+                         % (expect, result))
+        # Test with a bytes as input, with unescaped non-ASCII bytes
+        # (Technically an invalid URI; expect those bytes to be preserved)
+        given = b'%A2\xd8ab%FF'
+        expect = b'\xa2\xd8ab\xff'
+        result = urllib.parse.unquote_to_bytes(given)
+        self.assertEqual(expect, result,
+                         "using unquote_to_bytes(): %r != %r"
+                         % (expect, result))
 
     def test_unquote_with_unicode(self):
-        r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc')
-        self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc')
+        # Characters in the Latin-1 range, encoded with UTF-8
+        given = 'br%C3%BCckner_sapporo_20050930.doc'
+        expect = 'br\u00fcckner_sapporo_20050930.doc'
+        result = urllib.parse.unquote(given)
+        self.assertEqual(expect, result,
+                         "using unquote(): %r != %r" % (expect, result))
+        # Characters in the Latin-1 range, encoded with None (default)
+        result = urllib.parse.unquote(given, encoding=None, errors=None)
+        self.assertEqual(expect, result,
+                         "using unquote(): %r != %r" % (expect, result))
+
+        # Characters in the Latin-1 range, encoded with Latin-1
+        result = urllib.parse.unquote('br%FCckner_sapporo_20050930.doc',
+                                      encoding="latin-1")
+        expect = 'br\u00fcckner_sapporo_20050930.doc'
+        self.assertEqual(expect, result,
+                         "using unquote(): %r != %r" % (expect, result))
+
+        # Characters in BMP, encoded with UTF-8
+        given = "%E6%BC%A2%E5%AD%97"
+        expect = "\u6f22\u5b57"             # "Kanji"
+        result = urllib.parse.unquote(given)
+        self.assertEqual(expect, result,
+                         "using unquote(): %r != %r" % (expect, result))
+
+        # Decode with UTF-8, invalid sequence
+        given = "%F3%B1"
+        expect = "\ufffd"                   # Replacement character
+        result = urllib.parse.unquote(given)
+        self.assertEqual(expect, result,
+                         "using unquote(): %r != %r" % (expect, result))
+
+        # Decode with UTF-8, invalid sequence, replace errors
+        result = urllib.parse.unquote(given, errors="replace")
+        self.assertEqual(expect, result,
+                         "using unquote(): %r != %r" % (expect, result))
+
+        # Decode with UTF-8, invalid sequence, ignoring errors
+        given = "%F3%B1"
+        expect = ""
+        result = urllib.parse.unquote(given, errors="ignore")
+        self.assertEqual(expect, result,
+                         "using unquote(): %r != %r" % (expect, result))
+
+        # A mix of non-ASCII and percent-encoded characters, UTF-8
+        result = urllib.parse.unquote("\u6f22%C3%BC")
+        expect = '\u6f22\u00fc'
+        self.assertEqual(expect, result,
+                         "using unquote(): %r != %r" % (expect, result))
+
+        # A mix of non-ASCII and percent-encoded characters, Latin-1
+        # (Note, the string contains non-Latin-1-representable characters)
+        result = urllib.parse.unquote("\u6f22%FC", encoding="latin-1")
+        expect = '\u6f22\u00fc'
+        self.assertEqual(expect, result,
+                         "using unquote(): %r != %r" % (expect, result))
 
 class urlencode_Tests(unittest.TestCase):
     """Tests for urlencode()"""
diff --git a/Lib/test/test_wsgiref.py b/Lib/test/test_wsgiref.py
index ac5ada3..b98452d 100755
--- a/Lib/test/test_wsgiref.py
+++ b/Lib/test/test_wsgiref.py
@@ -291,6 +291,7 @@ class UtilityTests(TestCase):
     def testAppURIs(self):
         self.checkAppURI("http://127.0.0.1/")
         self.checkAppURI("http://127.0.0.1/spam", SCRIPT_NAME="/spam")
+        self.checkAppURI("http://127.0.0.1/sp%C3%A4m", SCRIPT_NAME="/späm")
         self.checkAppURI("http://spam.example.com:2071/",
             HTTP_HOST="spam.example.com:2071", SERVER_PORT="2071")
         self.checkAppURI("http://spam.example.com/",
@@ -304,6 +305,7 @@ class UtilityTests(TestCase):
     def testReqURIs(self):
         self.checkReqURI("http://127.0.0.1/")
         self.checkReqURI("http://127.0.0.1/spam", SCRIPT_NAME="/spam")
+        self.checkReqURI("http://127.0.0.1/sp%C3%A4m", SCRIPT_NAME="/späm")
         self.checkReqURI("http://127.0.0.1/spammity/spam",
             SCRIPT_NAME="/spammity", PATH_INFO="/spam")
         self.checkReqURI("http://127.0.0.1/spammity/spam?say=ni",
author	Guido van Rossum <guido@python.org>	2008-08-18 21:44:30 (GMT)
committer	Guido van Rossum <guido@python.org>	2008-08-18 21:44:30 (GMT)
commit	52dbbb906804f36067ecbc8c89a00cdab545bdb2 (patch)
tree	1b923b821dc0547f6fa3e30401c7dac177a8f557 /Lib/test
parent	4171da5c9d899dc64cb15f177f05b9de05563148 (diff)
download	cpython-52dbbb906804f36067ecbc8c89a00cdab545bdb2.zip cpython-52dbbb906804f36067ecbc8c89a00cdab545bdb2.tar.gz cpython-52dbbb906804f36067ecbc8c89a00cdab545bdb2.tar.bz2