From a4c612845aceed4a9f1ef25328b0cfa39d5038ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Walter=20D=C3=B6rwald?= Date: Thu, 10 May 2007 12:36:25 +0000 Subject: Fix punycode codec and tests. --- Lib/encodings/punycode.py | 33 ++++++++++++++--------------- Lib/test/test_codecs.py | 53 +++++++++++++++++++++++++---------------------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index 89906ae..9d7df10 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -10,15 +10,15 @@ import codecs def segregate(str): """3.1 Basic code point segregation""" - base = [] - extended = {} + base = b"" + extended = set() for c in str: if ord(c) < 128: - base.append(c) + base.append(ord(c)) else: - extended[c] = 1 - extended = sorted(extended.keys()) - return "".join(base).encode("ascii"),extended + extended.add(c) + extended = sorted(extended) + return (base, extended) def selective_len(str, max): """Return the length of str, considering only characters below max.""" @@ -75,10 +75,10 @@ def T(j, bias): if res > 26: return 26 return res -digits = "abcdefghijklmnopqrstuvwxyz0123456789" +digits = b"abcdefghijklmnopqrstuvwxyz0123456789" def generate_generalized_integer(N, bias): """3.3 Generalized variable-length integers""" - result = [] + result = b"" j = 0 while 1: t = T(j, bias) @@ -107,21 +107,20 @@ def adapt(delta, first, numchars): def generate_integers(baselen, deltas): """3.4 Bias adaptation""" # Punycode parameters: initial bias = 72, damp = 700, skew = 38 - result = [] + result = b"" bias = 72 for points, delta in enumerate(deltas): s = generate_generalized_integer(delta, bias) result.extend(s) bias = adapt(delta, points==0, baselen+points+1) - return "".join(result) + return result def punycode_encode(text): base, extended = segregate(text) - base = base.encode("ascii") deltas = insertion_unsort(text, extended) extended = generate_integers(len(base), deltas) if base: - return base + "-" + extended + return base + b"-" + extended return extended ##################### Decoding ##################################### @@ -182,15 +181,13 @@ def insertion_sort(base, extended, errors): return base def punycode_decode(text, errors): - pos = text.rfind("-") + pos = text.rfind(b"-") if pos == -1: base = "" - extended = text + extended = str(text, "ascii").upper() else: - base = text[:pos] - extended = text[pos+1:] - base = str(base, "ascii", errors) - extended = extended.upper() + base = str(text[:pos], "ascii", errors) + extended = str(text[pos+1:], "ascii").upper() return insertion_sort(base, extended, errors) ### Codec APIs diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 9634099..db63648 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -505,48 +505,48 @@ punycode_testcases = [ # A Arabic (Egyptian): ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644" "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F", - "egbpdaj6bu4bxfgehfvwxn"), + b"egbpdaj6bu4bxfgehfvwxn"), # B Chinese (simplified): ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587", - "ihqwcrb4cv8a8dqg056pqjye"), + b"ihqwcrb4cv8a8dqg056pqjye"), # C Chinese (traditional): ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587", - "ihqwctvzc91f659drss3x8bo0yb"), + b"ihqwctvzc91f659drss3x8bo0yb"), # D Czech: Proprostnemluvesky ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074" "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D" "\u0065\u0073\u006B\u0079", - "Proprostnemluvesky-uyb24dma41a"), + b"Proprostnemluvesky-uyb24dma41a"), # E Hebrew: ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8" "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2" "\u05D1\u05E8\u05D9\u05EA", - "4dbcagdahymbxekheh6e0a7fei0b"), + b"4dbcagdahymbxekheh6e0a7fei0b"), # F Hindi (Devanagari): ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D" - "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939" - "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947" - "\u0939\u0948\u0902", - "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"), + "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939" + "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947" + "\u0939\u0948\u0902", + b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"), #(G) Japanese (kanji and hiragana): ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092" - "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B", - "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"), + "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B", + b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"), # (H) Korean (Hangul syllables): ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774" "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74" "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C", - "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j" - "psd879ccm6fea98c"), + b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j" + b"psd879ccm6fea98c"), # (I) Russian (Cyrillic): ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E" "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440" "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A" "\u0438", - "b1abfaaepdrnnbgefbaDotcwatmq2g4l"), + b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"), # (J) Spanish: PorqunopuedensimplementehablarenEspaol ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070" @@ -554,7 +554,7 @@ punycode_testcases = [ "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061" "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070" "\u0061\u00F1\u006F\u006C", - "PorqunopuedensimplementehablarenEspaol-fmd56a"), + b"PorqunopuedensimplementehablarenEspaol-fmd56a"), # (K) Vietnamese: # Tisaohkhngthch\ @@ -563,45 +563,45 @@ punycode_testcases = [ "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068" "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067" "\u0056\u0069\u1EC7\u0074", - "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"), + b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"), #(L) 3B ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F", - "3B-ww4c5e180e575a65lsy2b"), + b"3B-ww4c5e180e575a65lsy2b"), # (M) -with-SUPER-MONKEYS ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074" "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D" "\u004F\u004E\u004B\u0045\u0059\u0053", - "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"), + b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"), # (N) Hello-Another-Way- ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F" "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D" "\u305D\u308C\u305E\u308C\u306E\u5834\u6240", - "Hello-Another-Way--fc4qua05auwb3674vfr0b"), + b"Hello-Another-Way--fc4qua05auwb3674vfr0b"), # (O) 2 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032", - "2-u9tlzr9756bt3uc0v"), + b"2-u9tlzr9756bt3uc0v"), # (P) MajiKoi5 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059" "\u308B\u0035\u79D2\u524D", - "MajiKoi5-783gue6qz075azm5e"), + b"MajiKoi5-783gue6qz075azm5e"), # (Q) de ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0", - "de-jg4avhby1noc0d"), + b"de-jg4avhby1noc0d"), # (R) ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067", - "d9juau41awczczp"), + b"d9juau41awczczp"), # (S) -> $1.00 <- ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020" "\u003C\u002D", - "-> $1.00 <--") + b"-> $1.00 <--") ] for i in punycode_testcases: @@ -616,7 +616,10 @@ class PunycodeTest(unittest.TestCase): # code produces only lower case. Converting just puny to # lower is also insufficient, since some of the input characters # are upper case. - self.assertEquals(uni.encode("punycode").lower(), puny.lower()) + self.assertEquals( + str(uni.encode("punycode"), "ascii").lower(), + str(puny, "ascii").lower() + ) def test_decode(self): for uni, puny in punycode_testcases: -- cgit v0.12