Enhance the punycode decoder so that it can decode unicode objects.

Fix the idna codec and the tests.
author: Walter Dörwald <walter@livinglogic.de> 2007-05-11 10:32:57 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2007-05-11 10:32:57 (GMT)
commit: 0ac30f82fe1beb4e0255d06c693ccfba56e45a9f (patch)
tree: 1795d671685687ef172c7f4d57290292cdf06879
parent: 1f05a3b7fb754d6b30300e1e50aeb92aabe6afd6 (diff)
download: cpython-0ac30f82fe1beb4e0255d06c693ccfba56e45a9f.zip
cpython-0ac30f82fe1beb4e0255d06c693ccfba56e45a9f.tar.gz
cpython-0ac30f82fe1beb4e0255d06c693ccfba56e45a9f.tar.bz2
3 files changed, 136 insertions, 130 deletions
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index 5c3d056..55e1643 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@@ -7,7 +7,8 @@ from unicodedata import ucd_3_2_0 as unicodedata
 dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
 
 # IDNA section 5
-ace_prefix = "xn--"
+ace_prefix = b"xn--"
+sace_prefix = "xn--"
 
 # This assumes query strings, so AllowUnassigned is true
 def nameprep(label):
@@ -87,7 +88,7 @@ def ToASCII(label):
         raise UnicodeError("label empty or too long")
 
     # Step 5: Check ACE prefix
-    if label.startswith(ace_prefix):
+    if label.startswith(sace_prefix):
         raise UnicodeError("Label starts with ACE prefix")
 
     # Step 6: Encode with PUNYCODE
@@ -134,7 +135,7 @@ def ToUnicode(label):
 
     # Step 7: Compare the result of step 6 with the one of step 3
     # label2 will already be in lower case.
-    if label.lower() != label2:
+    if str(label, "ascii").lower() != str(label2, "ascii"):
         raise UnicodeError("IDNA does not round-trip", label, label2)
 
     # Step 8: return the result of step 5
@@ -143,7 +144,7 @@ def ToUnicode(label):
 ### Codec APIs
 
 class Codec(codecs.Codec):
-    def encode(self,input,errors='strict'):
+    def encode(self, input, errors='strict'):
 
         if errors != 'strict':
             # IDNA is quite clear that implementations must be strict
@@ -152,19 +153,21 @@ class Codec(codecs.Codec):
         if not input:
             return b"", 0
 
-        result = []
+        result = b""
         labels = dots.split(input)
-        if labels and len(labels[-1])==0:
+        if labels and not labels[-1]:
             trailing_dot = b'.'
             del labels[-1]
         else:
             trailing_dot = b''
         for label in labels:
-            result.append(ToASCII(label))
-        # Join with U+002E
-        return b".".join(result)+trailing_dot, len(input)
+            if result:
+                # Join with U+002E
+                result.extend(b'.')
+            result.extend(ToASCII(label))
+        return result+trailing_dot, len(input)
 
-    def decode(self,input,errors='strict'):
+    def decode(self, input, errors='strict'):
 
         if errors != 'strict':
             raise UnicodeError("Unsupported error handling "+errors)
@@ -199,30 +202,31 @@ class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
             raise UnicodeError("unsupported error handling "+errors)
 
         if not input:
-            return ("", 0)
+            return (b'', 0)
 
         labels = dots.split(input)
-        trailing_dot = ''
+        trailing_dot = b''
         if labels:
             if not labels[-1]:
-                trailing_dot = '.'
+                trailing_dot = b'.'
                 del labels[-1]
             elif not final:
                 # Keep potentially unfinished label until the next call
                 del labels[-1]
                 if labels:
-                    trailing_dot = '.'
+                    trailing_dot = b'.'
 
-        result = []
+        result = b""
         size = 0
         for label in labels:
-            result.append(ToASCII(label))
             if size:
+                # Join with U+002E
+                result.extend(b'.')
                 size += 1
+            result.extend(ToASCII(label))
             size += len(label)
 
-        # Join with U+002E
-        result = ".".join(result) + trailing_dot
+        result += trailing_dot
         size += len(trailing_dot)
         return (result, size)
 
@@ -239,8 +243,7 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
             labels = dots.split(input)
         else:
             # Must be ASCII string
-            input = str(input)
-            str(input, "ascii")
+            input = str(input, "ascii")
             labels = input.split(".")
 
         trailing_dot = ''
diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py
index 9d7df10..4c22fe5 100644
--- a/Lib/encodings/punycode.py
+++ b/Lib/encodings/punycode.py
@@ -181,6 +181,8 @@ def insertion_sort(base, extended, errors):
     return base
 
 def punycode_decode(text, errors):
+    if isinstance(text, str):
+        text = text.encode("ascii")
     pos = text.rfind(b"-")
     if pos == -1:
         base = ""
@@ -194,11 +196,11 @@ def punycode_decode(text, errors):
 
 class Codec(codecs.Codec):
 
-    def encode(self,input,errors='strict'):
+    def encode(self, input, errors='strict'):
         res = punycode_encode(input)
         return res, len(input)
 
-    def decode(self,input,errors='strict'):
+    def decode(self, input, errors='strict'):
         if errors not in ('strict', 'replace', 'ignore'):
             raise UnicodeError, "Unsupported error handling "+errors
         res = punycode_decode(input, errors)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index f8c22f8..f61cc33 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -624,6 +624,7 @@ class PunycodeTest(unittest.TestCase):
     def test_decode(self):
         for uni, puny in punycode_testcases:
             self.assertEquals(uni, puny.decode("punycode"))
+            self.assertEquals(uni, puny.decode("ascii").decode("punycode"))
 
 class UnicodeInternalTest(unittest.TestCase):
     def test_bug1251300(self):
@@ -676,154 +677,154 @@ class UnicodeInternalTest(unittest.TestCase):
 # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
 nameprep_tests = [
     # 3.1 Map to nothing.
-    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
-     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
-     '\xb8\x8f\xef\xbb\xbf',
-     'foobarbaz'),
+    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
+     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
+     b'\xb8\x8f\xef\xbb\xbf',
+     b'foobarbaz'),
     # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
-    ('CAFE',
-     'cafe'),
+    (b'CAFE',
+     b'cafe'),
     # 3.3 Case folding 8bit U+00DF (german sharp s).
     # The original test case is bogus; it says \xc3\xdf
-    ('\xc3\x9f',
-     'ss'),
+    (b'\xc3\x9f',
+     b'ss'),
     # 3.4 Case folding U+0130 (turkish capital I with dot).
-    ('\xc4\xb0',
-     'i\xcc\x87'),
+    (b'\xc4\xb0',
+     b'i\xcc\x87'),
     # 3.5 Case folding multibyte U+0143 U+037A.
-    ('\xc5\x83\xcd\xba',
-     '\xc5\x84 \xce\xb9'),
+    (b'\xc5\x83\xcd\xba',
+     b'\xc5\x84 \xce\xb9'),
     # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
     # XXX: skip this as it fails in UCS-2 mode
     #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
     # 'telc\xe2\x88\x95kg\xcf\x83'),
     (None, None),
     # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
-    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
-     '\xc7\xb0 a'),
+    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
+     b'\xc7\xb0 a'),
     # 3.8 Case folding U+1FB7 and normalization.
-    ('\xe1\xbe\xb7',
-     '\xe1\xbe\xb6\xce\xb9'),
+    (b'\xe1\xbe\xb7',
+     b'\xe1\xbe\xb6\xce\xb9'),
     # 3.9 Self-reverting case folding U+01F0 and normalization.
     # The original test case is bogus, it says `\xc7\xf0'
-    ('\xc7\xb0',
-     '\xc7\xb0'),
+    (b'\xc7\xb0',
+     b'\xc7\xb0'),
     # 3.10 Self-reverting case folding U+0390 and normalization.
-    ('\xce\x90',
-     '\xce\x90'),
+    (b'\xce\x90',
+     b'\xce\x90'),
     # 3.11 Self-reverting case folding U+03B0 and normalization.
-    ('\xce\xb0',
-     '\xce\xb0'),
+    (b'\xce\xb0',
+     b'\xce\xb0'),
     # 3.12 Self-reverting case folding U+1E96 and normalization.
-    ('\xe1\xba\x96',
-     '\xe1\xba\x96'),
+    (b'\xe1\xba\x96',
+     b'\xe1\xba\x96'),
     # 3.13 Self-reverting case folding U+1F56 and normalization.
-    ('\xe1\xbd\x96',
-     '\xe1\xbd\x96'),
+    (b'\xe1\xbd\x96',
+     b'\xe1\xbd\x96'),
     # 3.14 ASCII space character U+0020.
-    (' ',
-     ' '),
+    (b' ',
+     b' '),
     # 3.15 Non-ASCII 8bit space character U+00A0.
-    ('\xc2\xa0',
-     ' '),
+    (b'\xc2\xa0',
+     b' '),
     # 3.16 Non-ASCII multibyte space character U+1680.
-    ('\xe1\x9a\x80',
+    (b'\xe1\x9a\x80',
      None),
     # 3.17 Non-ASCII multibyte space character U+2000.
-    ('\xe2\x80\x80',
-     ' '),
+    (b'\xe2\x80\x80',
+     b' '),
     # 3.18 Zero Width Space U+200b.
-    ('\xe2\x80\x8b',
-     ''),
+    (b'\xe2\x80\x8b',
+     b''),
     # 3.19 Non-ASCII multibyte space character U+3000.
-    ('\xe3\x80\x80',
-     ' '),
+    (b'\xe3\x80\x80',
+     b' '),
     # 3.20 ASCII control characters U+0010 U+007F.
-    ('\x10\x7f',
-     '\x10\x7f'),
+    (b'\x10\x7f',
+     b'\x10\x7f'),
     # 3.21 Non-ASCII 8bit control character U+0085.
-    ('\xc2\x85',
+    (b'\xc2\x85',
      None),
     # 3.22 Non-ASCII multibyte control character U+180E.
-    ('\xe1\xa0\x8e',
+    (b'\xe1\xa0\x8e',
      None),
     # 3.23 Zero Width No-Break Space U+FEFF.
-    ('\xef\xbb\xbf',
-     ''),
+    (b'\xef\xbb\xbf',
+     b''),
     # 3.24 Non-ASCII control character U+1D175.
-    ('\xf0\x9d\x85\xb5',
+    (b'\xf0\x9d\x85\xb5',
      None),
     # 3.25 Plane 0 private use character U+F123.
-    ('\xef\x84\xa3',
+    (b'\xef\x84\xa3',
      None),
     # 3.26 Plane 15 private use character U+F1234.
-    ('\xf3\xb1\x88\xb4',
+    (b'\xf3\xb1\x88\xb4',
      None),
     # 3.27 Plane 16 private use character U+10F234.
-    ('\xf4\x8f\x88\xb4',
+    (b'\xf4\x8f\x88\xb4',
      None),
     # 3.28 Non-character code point U+8FFFE.
-    ('\xf2\x8f\xbf\xbe',
+    (b'\xf2\x8f\xbf\xbe',
      None),
     # 3.29 Non-character code point U+10FFFF.
-    ('\xf4\x8f\xbf\xbf',
+    (b'\xf4\x8f\xbf\xbf',
      None),
     # 3.30 Surrogate code U+DF42.
-    ('\xed\xbd\x82',
+    (b'\xed\xbd\x82',
      None),
     # 3.31 Non-plain text character U+FFFD.
-    ('\xef\xbf\xbd',
+    (b'\xef\xbf\xbd',
      None),
     # 3.32 Ideographic description character U+2FF5.
-    ('\xe2\xbf\xb5',
+    (b'\xe2\xbf\xb5',
      None),
     # 3.33 Display property character U+0341.
-    ('\xcd\x81',
-     '\xcc\x81'),
+    (b'\xcd\x81',
+     b'\xcc\x81'),
     # 3.34 Left-to-right mark U+200E.
-    ('\xe2\x80\x8e',
+    (b'\xe2\x80\x8e',
      None),
     # 3.35 Deprecated U+202A.
-    ('\xe2\x80\xaa',
+    (b'\xe2\x80\xaa',
      None),
     # 3.36 Language tagging character U+E0001.
-    ('\xf3\xa0\x80\x81',
+    (b'\xf3\xa0\x80\x81',
      None),
     # 3.37 Language tagging character U+E0042.
-    ('\xf3\xa0\x81\x82',
+    (b'\xf3\xa0\x81\x82',
      None),
     # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
-    ('foo\xd6\xbebar',
+    (b'foo\xd6\xbebar',
      None),
     # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
-    ('foo\xef\xb5\x90bar',
+    (b'foo\xef\xb5\x90bar',
      None),
     # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
-    ('foo\xef\xb9\xb6bar',
-     'foo \xd9\x8ebar'),
+    (b'foo\xef\xb9\xb6bar',
+     b'foo \xd9\x8ebar'),
     # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
-    ('\xd8\xa71',
+    (b'\xd8\xa71',
      None),
     # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
-    ('\xd8\xa71\xd8\xa8',
-     '\xd8\xa71\xd8\xa8'),
+    (b'\xd8\xa71\xd8\xa8',
+     b'\xd8\xa71\xd8\xa8'),
     # 3.43 Unassigned code point U+E0002.
     # Skip this test as we allow unassigned
-    #('\xf3\xa0\x80\x82',
+    #(b'\xf3\xa0\x80\x82',
     # None),
     (None, None),
     # 3.44 Larger test (shrinking).
     # Original test case reads \xc3\xdf
-    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
-     '\xaa\xce\xb0\xe2\x80\x80',
-     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
+    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
+     b'\xaa\xce\xb0\xe2\x80\x80',
+     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
     # 3.45 Larger test (expanding).
     # Original test case reads \xc3\x9f
-    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
-     '\x80',
-     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
-     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
-     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
+    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
+     b'\x80',
+     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
+     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
+     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
     ]
 
 
@@ -848,16 +849,16 @@ class NameprepTest(unittest.TestCase):
 
 class IDNACodecTest(unittest.TestCase):
     def test_builtin_decode(self):
-        self.assertEquals(str("python.org", "idna"), "python.org")
-        self.assertEquals(str("python.org.", "idna"), "python.org.")
-        self.assertEquals(str("xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
-        self.assertEquals(str("xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
+        self.assertEquals(str(b"python.org", "idna"), "python.org")
+        self.assertEquals(str(b"python.org.", "idna"), "python.org.")
+        self.assertEquals(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
+        self.assertEquals(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
 
     def test_builtin_encode(self):
-        self.assertEquals("python.org".encode("idna"), "python.org")
-        self.assertEquals("python.org.".encode("idna"), "python.org.")
-        self.assertEquals("pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
-        self.assertEquals("pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
+        self.assertEquals("python.org".encode("idna"), b"python.org")
+        self.assertEquals("python.org.".encode("idna"), b"python.org.")
+        self.assertEquals("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
+        self.assertEquals("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
 
     def test_stream(self):
         r = codecs.getreader("idna")(io.BytesIO(b"abc"))
@@ -866,61 +867,61 @@ class IDNACodecTest(unittest.TestCase):
 
     def test_incremental_decode(self):
         self.assertEquals(
-            "".join(codecs.iterdecode("python.org", "idna")),
+            "".join(codecs.iterdecode((bytes(chr(c)) for c in b"python.org"), "idna")),
             "python.org"
         )
         self.assertEquals(
-            "".join(codecs.iterdecode("python.org.", "idna")),
+            "".join(codecs.iterdecode((bytes(chr(c)) for c in b"python.org."), "idna")),
             "python.org."
         )
         self.assertEquals(
-            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
+            "".join(codecs.iterdecode((bytes(chr(c)) for c in b"xn--pythn-mua.org."), "idna")),
             "pyth\xf6n.org."
         )
         self.assertEquals(
-            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
+            "".join(codecs.iterdecode((bytes(chr(c)) for c in b"xn--pythn-mua.org."), "idna")),
             "pyth\xf6n.org."
         )
 
         decoder = codecs.getincrementaldecoder("idna")()
-        self.assertEquals(decoder.decode("xn--xam", ), "")
-        self.assertEquals(decoder.decode("ple-9ta.o", ), "\xe4xample.")
-        self.assertEquals(decoder.decode("rg"), "")
-        self.assertEquals(decoder.decode("", True), "org")
+        self.assertEquals(decoder.decode(b"xn--xam", ), "")
+        self.assertEquals(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
+        self.assertEquals(decoder.decode(b"rg"), "")
+        self.assertEquals(decoder.decode(b"", True), "org")
 
         decoder.reset()
-        self.assertEquals(decoder.decode("xn--xam", ), "")
-        self.assertEquals(decoder.decode("ple-9ta.o", ), "\xe4xample.")
-        self.assertEquals(decoder.decode("rg."), "org.")
-        self.assertEquals(decoder.decode("", True), "")
+        self.assertEquals(decoder.decode(b"xn--xam", ), "")
+        self.assertEquals(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
+        self.assertEquals(decoder.decode(b"rg."), "org.")
+        self.assertEquals(decoder.decode(b"", True), "")
 
     def test_incremental_encode(self):
         self.assertEquals(
-            "".join(codecs.iterencode("python.org", "idna")),
-            "python.org"
+            b"".join(codecs.iterencode("python.org", "idna")),
+            b"python.org"
         )
         self.assertEquals(
-            "".join(codecs.iterencode("python.org.", "idna")),
-            "python.org."
+            b"".join(codecs.iterencode("python.org.", "idna")),
+            b"python.org."
         )
         self.assertEquals(
-            "".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
-            "xn--pythn-mua.org."
+            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
+            b"xn--pythn-mua.org."
         )
         self.assertEquals(
-            "".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
-            "xn--pythn-mua.org."
+            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
+            b"xn--pythn-mua.org."
         )
 
         encoder = codecs.getincrementalencoder("idna")()
-        self.assertEquals(encoder.encode("\xe4x"), "")
-        self.assertEquals(encoder.encode("ample.org"), "xn--xample-9ta.")
-        self.assertEquals(encoder.encode("", True), "org")
+        self.assertEquals(encoder.encode("\xe4x"), b"")
+        self.assertEquals(encoder.encode("ample.org"), b"xn--xample-9ta.")
+        self.assertEquals(encoder.encode("", True), b"org")
 
         encoder.reset()
-        self.assertEquals(encoder.encode("\xe4x"), "")
-        self.assertEquals(encoder.encode("ample.org."), "xn--xample-9ta.org.")
-        self.assertEquals(encoder.encode("", True), "")
+        self.assertEquals(encoder.encode("\xe4x"), b"")
+        self.assertEquals(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
+        self.assertEquals(encoder.encode("", True), b"")
 
 class CodecsModuleTest(unittest.TestCase):
author	Walter Dörwald <walter@livinglogic.de>	2007-05-11 10:32:57 (GMT)
committer	Walter Dörwald <walter@livinglogic.de>	2007-05-11 10:32:57 (GMT)
commit	0ac30f82fe1beb4e0255d06c693ccfba56e45a9f (patch)
tree	1795d671685687ef172c7f4d57290292cdf06879
parent	1f05a3b7fb754d6b30300e1e50aeb92aabe6afd6 (diff)
download	cpython-0ac30f82fe1beb4e0255d06c693ccfba56e45a9f.zip cpython-0ac30f82fe1beb4e0255d06c693ccfba56e45a9f.tar.gz cpython-0ac30f82fe1beb4e0255d06c693ccfba56e45a9f.tar.bz2