author | Martin v. Löwis <martin@v.loewis.de> | 2003-04-18 10:39:54 (GMT)
---|---|---
committer | Martin v. Löwis <martin@v.loewis.de> | 2003-04-18 10:39:54 (GMT)
commit | 2548c730c17d766ca04b2bf633552655f7f96cdf (patch) |
tree | b128f16abd8b4c3058d1be4093f30bfb5454b59e /Lib |
parent | 8d17a90b830ae9b9c672a504f01d4f93bac3d23d (diff) |
Implement IDNA (Internationalized Domain Names in Applications).
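The new Lib/encodings/idna.py provides a codec named "idna" (looked up through the encodings package), so converting a host name to its ASCII-compatible encoding (ACE) and back goes through the ordinary str/unicode codec machinery. A minimal sketch in the Python 2 idiom of this patch; the example domain is illustrative rather than taken from the patch, and the expected strings are what the RFC 3490/3492 algorithms produce:

```python
# -*- coding: utf-8 -*-
# Illustrative round trip through the new "idna" codec (Python 2 style).
host = u"www.pyth\u00f6n.org"            # u"www.pythön.org", not from the patch

ace = host.encode("idna")                # nameprep + punycode per label, "xn--" prefix
print ace                                # expected: www.xn--pythn-mua.org

print ace.decode("idna") == host         # ToUnicode undoes ToASCII -> True
```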
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/encodings/idna.py | 187
-rw-r--r-- | Lib/encodings/punycode.py | 222
-rw-r--r-- | Lib/httplib.py | 6
-rw-r--r-- | Lib/stringprep.py | 273
-rw-r--r-- | Lib/test/test_codecs.py | 296
5 files changed, 981 insertions(+), 3 deletions(-)
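Lib/encodings/punycode.py is also usable on its own as a "punycode" codec: it implements the RFC 3492 bootstring encoding without the IDNA nameprep step or the ACE prefix. A short hedged sketch; the sample word is illustrative, and the expected output is what the RFC algorithm yields:

```python
# -*- coding: utf-8 -*-
# Bare punycode: basic code points are copied in front of the "-" delimiter,
# the non-ASCII characters are encoded after it.
word = u"b\u00fccher"                          # u"bücher", illustrative sample
print word.encode("punycode")                  # expected: bcher-kva
print "bcher-kva".decode("punycode") == word   # True
```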
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py new file mode 100644 index 0000000..7e4d04e --- /dev/null +++ b/Lib/encodings/idna.py @@ -0,0 +1,187 @@ +# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) + +import stringprep, unicodedata, re, codecs + +# IDNA section 3.1 +dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]") + +# IDNA section 5 +ace_prefix = "xn--" +uace_prefix = unicode(ace_prefix, "ascii") + +# This assumes query strings, so AllowUnassigned is true +def nameprep(label): + # Map + newlabel = [] + for c in label: + if stringprep.in_table_b1(c): + # Map to nothing + continue + newlabel.append(stringprep.map_table_b2(c)) + label = u"".join(newlabel) + + # Normalize + label = unicodedata.normalize("NFKC", label) + + # Prohibit + for c in label: + if stringprep.in_table_c12(c) or \ + stringprep.in_table_c22(c) or \ + stringprep.in_table_c3(c) or \ + stringprep.in_table_c4(c) or \ + stringprep.in_table_c5(c) or \ + stringprep.in_table_c6(c) or \ + stringprep.in_table_c7(c) or \ + stringprep.in_table_c8(c) or \ + stringprep.in_table_c9(c): + raise UnicodeError, "Invalid character %s" % repr(c) + + # Check bidi + RandAL = map(stringprep.in_table_d1, label) + for c in RandAL: + if c: + # There is a RandAL char in the string. Must perform further + # tests: + # 1) The characters in section 5.8 MUST be prohibited. + # This is table C.8, which was already checked + # 2) If a string contains any RandALCat character, the string + # MUST NOT contain any LCat character. + if filter(stringprep.in_table_d2, label): + raise UnicodeError, "Violation of BIDI requirement 2" + + # 3) If a string contains any RandALCat character, a + # RandALCat character MUST be the first character of the + # string, and a RandALCat character MUST be the last + # character of the string. + if not RandAL[0] or not RandAL[-1]: + raise UnicodeError, "Violation of BIDI requirement 3" + + return label + +def ToASCII(label): + try: + # Step 1: try ASCII + label = label.encode("ascii") + except UnicodeError: + pass + else: + # Skip to step 3: UseSTD3ASCIIRules is false, so + # Skip to step 8. + if 0 < len(label) < 64: + return label + raise UnicodeError, "label too long" + + # Step 2: nameprep + label = nameprep(label) + + # Step 3: UseSTD3ASCIIRules is false + # Step 4: try ASCII + try: + label = label.encode("ascii") + except UnicodeError: + pass + else: + # Skip to step 8. 
+ if 0 < len(label) < 64: + return label + raise UnicodeError, "label too long" + + # Step 5: Check ACE prefix + if label.startswith(uace_prefix): + raise UnicodeError, "Label starts with ACE prefix" + + # Step 6: Encode with PUNYCODE + label = label.encode("punycode") + + # Step 7: Prepend ACE prefix + label = ace_prefix + label + + # Step 8: Check size + if 0 < len(label) < 64: + return label + raise UnicodeError, "label too long" + +def ToUnicode(label): + # Step 1: Check for ASCII + if isinstance(label, str): + pure_ascii = True + else: + try: + label = label.encode("ascii") + pure_ascii = True + except UnicodeError: + pure_ascii = False + if not pure_ascii: + # Step 2: Perform nameprep + label = nameprep(label) + # It doesn't say this, but apparently, it should be ASCII now + try: + label = label.encode("ascii") + except UnicodeError: + raise UnicodeError, "Invalid character in IDN label" + # Step 3: Check for ACE prefix + if not label.startswith(ace_prefix): + return unicode(label, "ascii") + + # Step 4: Remove ACE prefix + label1 = label[len(ace_prefix):] + + # Step 5: Decode using PUNYCODE + result = label1.decode("punycode") + + # Step 6: Apply ToASCII + label2 = ToASCII(result) + + # Step 7: Compare the result of step 6 with the one of step 3 + # label2 will already be in lower case. + if label.lower() != label2: + raise UnicodeError, ("IDNA does not round-trip", label, label2) + + # Step 8: return the result of step 5 + return result + +### Codec APIs + +class Codec(codecs.Codec): + def encode(self,input,errors='strict'): + + if errors != 'strict': + # IDNA is quite clear that implementations must be strict + raise UnicodeError, "unsupported error handling "+errors + + result = [] + for label in dots.split(input): + result.append(ToASCII(label)) + # Join with U+002E + return ".".join(result), len(input) + + def decode(self,input,errors='strict'): + + if errors != 'strict': + raise UnicodeError, "Unsupported error handling "+errors + + # IDNA allows decoding to operate on Unicode strings, too. + if isinstance(input, unicode): + labels = dots.split(input) + else: + # Must be ASCII string + unicode(input, "ascii") + labels = input.split(".") + + result = [] + for label in labels: + result.append(ToUnicode(label)) + + return u".".join(result), len(input) + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py new file mode 100644 index 0000000..e7f2d45 --- /dev/null +++ b/Lib/encodings/punycode.py @@ -0,0 +1,222 @@ +# -*- coding: iso-8859-1 -*- +""" Codec for the Punicode encoding, as specified in RFC 3492 + +Written by Martin v. Löwis. +""" + +import codecs + +##################### Encoding ##################################### + +def segregate(str): + """3.1 Basic code point segregation""" + base = [] + extended = {} + for c in str: + if ord(c) < 128: + base.append(c) + else: + extended[c] = 1 + extended = extended.keys() + extended.sort() + return "".join(base).encode("ascii"),extended + +def selective_len(str, max): + """Return the length of str, considering only characters below max.""" + res = 0 + for c in str: + if ord(c) < max: + res += 1 + return res + +def selective_find(str, char, index, pos): + """Return a pair (index, pos), indicating the next occurrence of + char in str. 
index is the position of the character considering + only ordinals up to and including char, and pos is the position in + the full string. index/pos is the starting position in the full + string.""" + + l = len(str) + while 1: + pos += 1 + if pos == l: + return (-1, -1) + c = str[pos] + if c == char: + return index+1, pos + elif c < char: + index += 1 + +def insertion_unsort(str, extended): + """3.2 Insertion unsort coding""" + oldchar = 0x80 + result = [] + oldindex = -1 + for c in extended: + index = pos = -1 + char = ord(c) + curlen = selective_len(str, char) + delta = (curlen+1) * (char - oldchar) + while 1: + index,pos = selective_find(str,c,index,pos) + if index == -1: + break + delta += index - oldindex + result.append(delta-1) + oldindex = index + delta = 0 + oldchar = char + + return result + +def T(j, bias): + # Punycode parameters: tmin = 1, tmax = 26, base = 36 + res = 36 * (j + 1) - bias + if res < 1: return 1 + if res > 26: return 26 + return res + +digits = "abcdefghijklmnopqrstuvwxyz0123456789" +def generate_generalized_integer(N, bias): + """3.3 Generalized variable-length integers""" + result = [] + j = 0 + while 1: + t = T(j, bias) + if N < t: + result.append(digits[N]) + return result + result.append(digits[t + ((N - t) % (36 - t))]) + N = (N - t) // (36 - t) + j += 1 + +def adapt(delta, first, numchars): + if first: + delta //= 700 + else: + delta //= 2 + delta += delta // numchars + # ((base - tmin) * tmax) // 2 == 455 + divisions = 0 + while delta > 455: + delta = delta // 35 # base - tmin + divisions += 36 + bias = divisions + (36 * delta // (delta + 38)) + return bias + + +def generate_integers(baselen, deltas): + """3.4 Bias adaptation""" + # Punycode parameters: initial bias = 72, damp = 700, skew = 38 + result = [] + bias = 72 + for points, delta in enumerate(deltas): + s = generate_generalized_integer(delta, bias) + result.extend(s) + bias = adapt(delta, points==0, baselen+points+1) + return "".join(result) + +def punycode_encode(text): + base, extended = segregate(text) + base = base.encode("ascii") + deltas = insertion_unsort(text, extended) + extended = generate_integers(len(base), deltas) + if base: + return base + "-" + extended + return extended + +##################### Decoding ##################################### + +def decode_generalized_number(extended, extpos, bias, errors): + """3.3 Generalized variable-length integers""" + result = 0 + w = 1 + j = 0 + while 1: + try: + char = ord(extended[extpos]) + except IndexError: + if errors == "strict": + raise UnicodeError, "incomplete punicode string" + return extpos + 1, None + extpos += 1 + if 0x41 <= char <= 0x5A: # A-Z + digit = char - 0x41 + elif 0x30 <= char <= 0x39: + digit = char - 22 # 0x30-26 + elif errors == "strict": + raise UnicodeError("Invalid extended code point '%s'" + % extended[extpos]) + else: + return extpos, None + t = T(j, bias) + result += digit * w + if digit < t: + return extpos, result + w = w * (36 - t) + j += 1 + + +def insertion_sort(base, extended, errors): + """3.2 Insertion unsort coding""" + char = 0x80 + pos = -1 + bias = 72 + extpos = 0 + while extpos < len(extended): + newpos, delta = decode_generalized_number(extended, extpos, + bias, errors) + if delta is None: + # There was an error in decoding. We can't continue because + # synchronization is lost. 
+ return base + pos += delta+1 + char += pos // (len(base) + 1) + if char > 0x10FFFF: + if errors == "strict": + raise UnicodeError, ("Invalid character U+%x" % char) + char = ord('?') + pos = pos % (len(base) + 1) + base = base[:pos] + unichr(char) + base[pos:] + bias = adapt(delta, (extpos == 0), len(base)) + extpos = newpos + return base + +def punycode_decode(text, errors): + pos = text.rfind("-") + if pos == -1: + base = "" + extended = text + else: + base = text[:pos] + extended = text[pos+1:] + base = unicode(base, "ascii", errors) + extended = extended.upper() + return insertion_sort(base, extended, errors) + +### Codec APIs + +class Codec(codecs.Codec): + def encode(self,input,errors='strict'): + + res = punycode_encode(input) + return res, len(input) + + def decode(self,input,errors='strict'): + + if errors not in ('strict', 'replace', 'ignore'): + raise UnicodeError, "Unsupported error handling "+errors + res = punycode_decode(input, errors) + return res, len(input) + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) diff --git a/Lib/httplib.py b/Lib/httplib.py index ca215a4..caf6ccd 100644 --- a/Lib/httplib.py +++ b/Lib/httplib.py @@ -655,11 +655,11 @@ class HTTPConnection: nil, netloc, nil, nil, nil = urlsplit(url) if netloc: - self.putheader('Host', netloc) + self.putheader('Host', netloc.encode("idna")) elif self.port == HTTP_PORT: - self.putheader('Host', self.host) + self.putheader('Host', self.host.encode("idna")) else: - self.putheader('Host', "%s:%s" % (self.host, self.port)) + self.putheader('Host', "%s:%s" % (self.host.encode("idna"), self.port)) # note: we are assuming that clients will not attempt to set these # headers since *this* library must deal with the diff --git a/Lib/stringprep.py b/Lib/stringprep.py new file mode 100644 index 0000000..ec5b098 --- /dev/null +++ b/Lib/stringprep.py @@ -0,0 +1,273 @@ +# This file is generated by mkstringprep.py. DO NOT EDIT. +"""Library that exposes various tables found in the StringPrep RFC 3454. + +There are two kinds of tables: sets, for which a member test is provided, +and mappings, for which a mapping function is provided. 
+""" + +import unicodedata, sets + +assert unicodedata.unidata_version == '3.2.0' + +def in_table_a1(code): + if unicodedata.category(code) != 'Cn': return False + c = ord(code) + if 0xFDD0 <= c < 0xFDF0: return False + return (c & 0xFFFF) not in (0xFFFE, 0xFFFF) + + +b1_set = sets.Set([173, 847, 6150, 6155, 6156, 6157, 8203, 8204, 8205, 8288, 65279] + range(65024,65040)) +def in_table_b1(code): + return ord(code) in b1_set + + +b3_exceptions = { +0xb5:u'\u03bc', 0xdf:u'ss', 0x130:u'i\u0307', 0x149:u'\u02bcn', +0x17f:u's', 0x1f0:u'j\u030c', 0x345:u'\u03b9', 0x37a:u' \u03b9', +0x390:u'\u03b9\u0308\u0301', 0x3b0:u'\u03c5\u0308\u0301', 0x3c2:u'\u03c3', 0x3d0:u'\u03b2', +0x3d1:u'\u03b8', 0x3d2:u'\u03c5', 0x3d3:u'\u03cd', 0x3d4:u'\u03cb', +0x3d5:u'\u03c6', 0x3d6:u'\u03c0', 0x3f0:u'\u03ba', 0x3f1:u'\u03c1', +0x3f2:u'\u03c3', 0x3f5:u'\u03b5', 0x587:u'\u0565\u0582', 0x1e96:u'h\u0331', +0x1e97:u't\u0308', 0x1e98:u'w\u030a', 0x1e99:u'y\u030a', 0x1e9a:u'a\u02be', +0x1e9b:u'\u1e61', 0x1f50:u'\u03c5\u0313', 0x1f52:u'\u03c5\u0313\u0300', 0x1f54:u'\u03c5\u0313\u0301', +0x1f56:u'\u03c5\u0313\u0342', 0x1f80:u'\u1f00\u03b9', 0x1f81:u'\u1f01\u03b9', 0x1f82:u'\u1f02\u03b9', +0x1f83:u'\u1f03\u03b9', 0x1f84:u'\u1f04\u03b9', 0x1f85:u'\u1f05\u03b9', 0x1f86:u'\u1f06\u03b9', +0x1f87:u'\u1f07\u03b9', 0x1f88:u'\u1f00\u03b9', 0x1f89:u'\u1f01\u03b9', 0x1f8a:u'\u1f02\u03b9', +0x1f8b:u'\u1f03\u03b9', 0x1f8c:u'\u1f04\u03b9', 0x1f8d:u'\u1f05\u03b9', 0x1f8e:u'\u1f06\u03b9', +0x1f8f:u'\u1f07\u03b9', 0x1f90:u'\u1f20\u03b9', 0x1f91:u'\u1f21\u03b9', 0x1f92:u'\u1f22\u03b9', +0x1f93:u'\u1f23\u03b9', 0x1f94:u'\u1f24\u03b9', 0x1f95:u'\u1f25\u03b9', 0x1f96:u'\u1f26\u03b9', +0x1f97:u'\u1f27\u03b9', 0x1f98:u'\u1f20\u03b9', 0x1f99:u'\u1f21\u03b9', 0x1f9a:u'\u1f22\u03b9', +0x1f9b:u'\u1f23\u03b9', 0x1f9c:u'\u1f24\u03b9', 0x1f9d:u'\u1f25\u03b9', 0x1f9e:u'\u1f26\u03b9', +0x1f9f:u'\u1f27\u03b9', 0x1fa0:u'\u1f60\u03b9', 0x1fa1:u'\u1f61\u03b9', 0x1fa2:u'\u1f62\u03b9', +0x1fa3:u'\u1f63\u03b9', 0x1fa4:u'\u1f64\u03b9', 0x1fa5:u'\u1f65\u03b9', 0x1fa6:u'\u1f66\u03b9', +0x1fa7:u'\u1f67\u03b9', 0x1fa8:u'\u1f60\u03b9', 0x1fa9:u'\u1f61\u03b9', 0x1faa:u'\u1f62\u03b9', +0x1fab:u'\u1f63\u03b9', 0x1fac:u'\u1f64\u03b9', 0x1fad:u'\u1f65\u03b9', 0x1fae:u'\u1f66\u03b9', +0x1faf:u'\u1f67\u03b9', 0x1fb2:u'\u1f70\u03b9', 0x1fb3:u'\u03b1\u03b9', 0x1fb4:u'\u03ac\u03b9', +0x1fb6:u'\u03b1\u0342', 0x1fb7:u'\u03b1\u0342\u03b9', 0x1fbc:u'\u03b1\u03b9', 0x1fbe:u'\u03b9', +0x1fc2:u'\u1f74\u03b9', 0x1fc3:u'\u03b7\u03b9', 0x1fc4:u'\u03ae\u03b9', 0x1fc6:u'\u03b7\u0342', +0x1fc7:u'\u03b7\u0342\u03b9', 0x1fcc:u'\u03b7\u03b9', 0x1fd2:u'\u03b9\u0308\u0300', 0x1fd3:u'\u03b9\u0308\u0301', +0x1fd6:u'\u03b9\u0342', 0x1fd7:u'\u03b9\u0308\u0342', 0x1fe2:u'\u03c5\u0308\u0300', 0x1fe3:u'\u03c5\u0308\u0301', +0x1fe4:u'\u03c1\u0313', 0x1fe6:u'\u03c5\u0342', 0x1fe7:u'\u03c5\u0308\u0342', 0x1ff2:u'\u1f7c\u03b9', +0x1ff3:u'\u03c9\u03b9', 0x1ff4:u'\u03ce\u03b9', 0x1ff6:u'\u03c9\u0342', 0x1ff7:u'\u03c9\u0342\u03b9', +0x1ffc:u'\u03c9\u03b9', 0x20a8:u'rs', 0x2102:u'c', 0x2103:u'\xb0c', +0x2107:u'\u025b', 0x2109:u'\xb0f', 0x210b:u'h', 0x210c:u'h', +0x210d:u'h', 0x2110:u'i', 0x2111:u'i', 0x2112:u'l', +0x2115:u'n', 0x2116:u'no', 0x2119:u'p', 0x211a:u'q', +0x211b:u'r', 0x211c:u'r', 0x211d:u'r', 0x2120:u'sm', +0x2121:u'tel', 0x2122:u'tm', 0x2124:u'z', 0x2128:u'z', +0x212c:u'b', 0x212d:u'c', 0x2130:u'e', 0x2131:u'f', +0x2133:u'm', 0x213e:u'\u03b3', 0x213f:u'\u03c0', 0x2145:u'd', +0x3371:u'hpa', 0x3373:u'au', 0x3375:u'ov', 0x3380:u'pa', +0x3381:u'na', 0x3382:u'\u03bca', 0x3383:u'ma', 0x3384:u'ka', 
+0x3385:u'kb', 0x3386:u'mb', 0x3387:u'gb', 0x338a:u'pf', +0x338b:u'nf', 0x338c:u'\u03bcf', 0x3390:u'hz', 0x3391:u'khz', +0x3392:u'mhz', 0x3393:u'ghz', 0x3394:u'thz', 0x33a9:u'pa', +0x33aa:u'kpa', 0x33ab:u'mpa', 0x33ac:u'gpa', 0x33b4:u'pv', +0x33b5:u'nv', 0x33b6:u'\u03bcv', 0x33b7:u'mv', 0x33b8:u'kv', +0x33b9:u'mv', 0x33ba:u'pw', 0x33bb:u'nw', 0x33bc:u'\u03bcw', +0x33bd:u'mw', 0x33be:u'kw', 0x33bf:u'mw', 0x33c0:u'k\u03c9', +0x33c1:u'm\u03c9', 0x33c3:u'bq', 0x33c6:u'c\u2215kg', 0x33c7:u'co.', +0x33c8:u'db', 0x33c9:u'gy', 0x33cb:u'hp', 0x33cd:u'kk', +0x33ce:u'km', 0x33d7:u'ph', 0x33d9:u'ppm', 0x33da:u'pr', +0x33dc:u'sv', 0x33dd:u'wb', 0xfb00:u'ff', 0xfb01:u'fi', +0xfb02:u'fl', 0xfb03:u'ffi', 0xfb04:u'ffl', 0xfb05:u'st', +0xfb06:u'st', 0xfb13:u'\u0574\u0576', 0xfb14:u'\u0574\u0565', 0xfb15:u'\u0574\u056b', +0xfb16:u'\u057e\u0576', 0xfb17:u'\u0574\u056d', 0x1d400:u'a', 0x1d401:u'b', +0x1d402:u'c', 0x1d403:u'd', 0x1d404:u'e', 0x1d405:u'f', +0x1d406:u'g', 0x1d407:u'h', 0x1d408:u'i', 0x1d409:u'j', +0x1d40a:u'k', 0x1d40b:u'l', 0x1d40c:u'm', 0x1d40d:u'n', +0x1d40e:u'o', 0x1d40f:u'p', 0x1d410:u'q', 0x1d411:u'r', +0x1d412:u's', 0x1d413:u't', 0x1d414:u'u', 0x1d415:u'v', +0x1d416:u'w', 0x1d417:u'x', 0x1d418:u'y', 0x1d419:u'z', +0x1d434:u'a', 0x1d435:u'b', 0x1d436:u'c', 0x1d437:u'd', +0x1d438:u'e', 0x1d439:u'f', 0x1d43a:u'g', 0x1d43b:u'h', +0x1d43c:u'i', 0x1d43d:u'j', 0x1d43e:u'k', 0x1d43f:u'l', +0x1d440:u'm', 0x1d441:u'n', 0x1d442:u'o', 0x1d443:u'p', +0x1d444:u'q', 0x1d445:u'r', 0x1d446:u's', 0x1d447:u't', +0x1d448:u'u', 0x1d449:u'v', 0x1d44a:u'w', 0x1d44b:u'x', +0x1d44c:u'y', 0x1d44d:u'z', 0x1d468:u'a', 0x1d469:u'b', +0x1d46a:u'c', 0x1d46b:u'd', 0x1d46c:u'e', 0x1d46d:u'f', +0x1d46e:u'g', 0x1d46f:u'h', 0x1d470:u'i', 0x1d471:u'j', +0x1d472:u'k', 0x1d473:u'l', 0x1d474:u'm', 0x1d475:u'n', +0x1d476:u'o', 0x1d477:u'p', 0x1d478:u'q', 0x1d479:u'r', +0x1d47a:u's', 0x1d47b:u't', 0x1d47c:u'u', 0x1d47d:u'v', +0x1d47e:u'w', 0x1d47f:u'x', 0x1d480:u'y', 0x1d481:u'z', +0x1d49c:u'a', 0x1d49e:u'c', 0x1d49f:u'd', 0x1d4a2:u'g', +0x1d4a5:u'j', 0x1d4a6:u'k', 0x1d4a9:u'n', 0x1d4aa:u'o', +0x1d4ab:u'p', 0x1d4ac:u'q', 0x1d4ae:u's', 0x1d4af:u't', +0x1d4b0:u'u', 0x1d4b1:u'v', 0x1d4b2:u'w', 0x1d4b3:u'x', +0x1d4b4:u'y', 0x1d4b5:u'z', 0x1d4d0:u'a', 0x1d4d1:u'b', +0x1d4d2:u'c', 0x1d4d3:u'd', 0x1d4d4:u'e', 0x1d4d5:u'f', +0x1d4d6:u'g', 0x1d4d7:u'h', 0x1d4d8:u'i', 0x1d4d9:u'j', +0x1d4da:u'k', 0x1d4db:u'l', 0x1d4dc:u'm', 0x1d4dd:u'n', +0x1d4de:u'o', 0x1d4df:u'p', 0x1d4e0:u'q', 0x1d4e1:u'r', +0x1d4e2:u's', 0x1d4e3:u't', 0x1d4e4:u'u', 0x1d4e5:u'v', +0x1d4e6:u'w', 0x1d4e7:u'x', 0x1d4e8:u'y', 0x1d4e9:u'z', +0x1d504:u'a', 0x1d505:u'b', 0x1d507:u'd', 0x1d508:u'e', +0x1d509:u'f', 0x1d50a:u'g', 0x1d50d:u'j', 0x1d50e:u'k', +0x1d50f:u'l', 0x1d510:u'm', 0x1d511:u'n', 0x1d512:u'o', +0x1d513:u'p', 0x1d514:u'q', 0x1d516:u's', 0x1d517:u't', +0x1d518:u'u', 0x1d519:u'v', 0x1d51a:u'w', 0x1d51b:u'x', +0x1d51c:u'y', 0x1d538:u'a', 0x1d539:u'b', 0x1d53b:u'd', +0x1d53c:u'e', 0x1d53d:u'f', 0x1d53e:u'g', 0x1d540:u'i', +0x1d541:u'j', 0x1d542:u'k', 0x1d543:u'l', 0x1d544:u'm', +0x1d546:u'o', 0x1d54a:u's', 0x1d54b:u't', 0x1d54c:u'u', +0x1d54d:u'v', 0x1d54e:u'w', 0x1d54f:u'x', 0x1d550:u'y', +0x1d56c:u'a', 0x1d56d:u'b', 0x1d56e:u'c', 0x1d56f:u'd', +0x1d570:u'e', 0x1d571:u'f', 0x1d572:u'g', 0x1d573:u'h', +0x1d574:u'i', 0x1d575:u'j', 0x1d576:u'k', 0x1d577:u'l', +0x1d578:u'm', 0x1d579:u'n', 0x1d57a:u'o', 0x1d57b:u'p', +0x1d57c:u'q', 0x1d57d:u'r', 0x1d57e:u's', 0x1d57f:u't', +0x1d580:u'u', 0x1d581:u'v', 0x1d582:u'w', 0x1d583:u'x', +0x1d584:u'y', 0x1d585:u'z', 
0x1d5a0:u'a', 0x1d5a1:u'b', +0x1d5a2:u'c', 0x1d5a3:u'd', 0x1d5a4:u'e', 0x1d5a5:u'f', +0x1d5a6:u'g', 0x1d5a7:u'h', 0x1d5a8:u'i', 0x1d5a9:u'j', +0x1d5aa:u'k', 0x1d5ab:u'l', 0x1d5ac:u'm', 0x1d5ad:u'n', +0x1d5ae:u'o', 0x1d5af:u'p', 0x1d5b0:u'q', 0x1d5b1:u'r', +0x1d5b2:u's', 0x1d5b3:u't', 0x1d5b4:u'u', 0x1d5b5:u'v', +0x1d5b6:u'w', 0x1d5b7:u'x', 0x1d5b8:u'y', 0x1d5b9:u'z', +0x1d5d4:u'a', 0x1d5d5:u'b', 0x1d5d6:u'c', 0x1d5d7:u'd', +0x1d5d8:u'e', 0x1d5d9:u'f', 0x1d5da:u'g', 0x1d5db:u'h', +0x1d5dc:u'i', 0x1d5dd:u'j', 0x1d5de:u'k', 0x1d5df:u'l', +0x1d5e0:u'm', 0x1d5e1:u'n', 0x1d5e2:u'o', 0x1d5e3:u'p', +0x1d5e4:u'q', 0x1d5e5:u'r', 0x1d5e6:u's', 0x1d5e7:u't', +0x1d5e8:u'u', 0x1d5e9:u'v', 0x1d5ea:u'w', 0x1d5eb:u'x', +0x1d5ec:u'y', 0x1d5ed:u'z', 0x1d608:u'a', 0x1d609:u'b', +0x1d60a:u'c', 0x1d60b:u'd', 0x1d60c:u'e', 0x1d60d:u'f', +0x1d60e:u'g', 0x1d60f:u'h', 0x1d610:u'i', 0x1d611:u'j', +0x1d612:u'k', 0x1d613:u'l', 0x1d614:u'm', 0x1d615:u'n', +0x1d616:u'o', 0x1d617:u'p', 0x1d618:u'q', 0x1d619:u'r', +0x1d61a:u's', 0x1d61b:u't', 0x1d61c:u'u', 0x1d61d:u'v', +0x1d61e:u'w', 0x1d61f:u'x', 0x1d620:u'y', 0x1d621:u'z', +0x1d63c:u'a', 0x1d63d:u'b', 0x1d63e:u'c', 0x1d63f:u'd', +0x1d640:u'e', 0x1d641:u'f', 0x1d642:u'g', 0x1d643:u'h', +0x1d644:u'i', 0x1d645:u'j', 0x1d646:u'k', 0x1d647:u'l', +0x1d648:u'm', 0x1d649:u'n', 0x1d64a:u'o', 0x1d64b:u'p', +0x1d64c:u'q', 0x1d64d:u'r', 0x1d64e:u's', 0x1d64f:u't', +0x1d650:u'u', 0x1d651:u'v', 0x1d652:u'w', 0x1d653:u'x', +0x1d654:u'y', 0x1d655:u'z', 0x1d670:u'a', 0x1d671:u'b', +0x1d672:u'c', 0x1d673:u'd', 0x1d674:u'e', 0x1d675:u'f', +0x1d676:u'g', 0x1d677:u'h', 0x1d678:u'i', 0x1d679:u'j', +0x1d67a:u'k', 0x1d67b:u'l', 0x1d67c:u'm', 0x1d67d:u'n', +0x1d67e:u'o', 0x1d67f:u'p', 0x1d680:u'q', 0x1d681:u'r', +0x1d682:u's', 0x1d683:u't', 0x1d684:u'u', 0x1d685:u'v', +0x1d686:u'w', 0x1d687:u'x', 0x1d688:u'y', 0x1d689:u'z', +0x1d6a8:u'\u03b1', 0x1d6a9:u'\u03b2', 0x1d6aa:u'\u03b3', 0x1d6ab:u'\u03b4', +0x1d6ac:u'\u03b5', 0x1d6ad:u'\u03b6', 0x1d6ae:u'\u03b7', 0x1d6af:u'\u03b8', +0x1d6b0:u'\u03b9', 0x1d6b1:u'\u03ba', 0x1d6b2:u'\u03bb', 0x1d6b3:u'\u03bc', +0x1d6b4:u'\u03bd', 0x1d6b5:u'\u03be', 0x1d6b6:u'\u03bf', 0x1d6b7:u'\u03c0', +0x1d6b8:u'\u03c1', 0x1d6b9:u'\u03b8', 0x1d6ba:u'\u03c3', 0x1d6bb:u'\u03c4', +0x1d6bc:u'\u03c5', 0x1d6bd:u'\u03c6', 0x1d6be:u'\u03c7', 0x1d6bf:u'\u03c8', +0x1d6c0:u'\u03c9', 0x1d6d3:u'\u03c3', 0x1d6e2:u'\u03b1', 0x1d6e3:u'\u03b2', +0x1d6e4:u'\u03b3', 0x1d6e5:u'\u03b4', 0x1d6e6:u'\u03b5', 0x1d6e7:u'\u03b6', +0x1d6e8:u'\u03b7', 0x1d6e9:u'\u03b8', 0x1d6ea:u'\u03b9', 0x1d6eb:u'\u03ba', +0x1d6ec:u'\u03bb', 0x1d6ed:u'\u03bc', 0x1d6ee:u'\u03bd', 0x1d6ef:u'\u03be', +0x1d6f0:u'\u03bf', 0x1d6f1:u'\u03c0', 0x1d6f2:u'\u03c1', 0x1d6f3:u'\u03b8', +0x1d6f4:u'\u03c3', 0x1d6f5:u'\u03c4', 0x1d6f6:u'\u03c5', 0x1d6f7:u'\u03c6', +0x1d6f8:u'\u03c7', 0x1d6f9:u'\u03c8', 0x1d6fa:u'\u03c9', 0x1d70d:u'\u03c3', +0x1d71c:u'\u03b1', 0x1d71d:u'\u03b2', 0x1d71e:u'\u03b3', 0x1d71f:u'\u03b4', +0x1d720:u'\u03b5', 0x1d721:u'\u03b6', 0x1d722:u'\u03b7', 0x1d723:u'\u03b8', +0x1d724:u'\u03b9', 0x1d725:u'\u03ba', 0x1d726:u'\u03bb', 0x1d727:u'\u03bc', +0x1d728:u'\u03bd', 0x1d729:u'\u03be', 0x1d72a:u'\u03bf', 0x1d72b:u'\u03c0', +0x1d72c:u'\u03c1', 0x1d72d:u'\u03b8', 0x1d72e:u'\u03c3', 0x1d72f:u'\u03c4', +0x1d730:u'\u03c5', 0x1d731:u'\u03c6', 0x1d732:u'\u03c7', 0x1d733:u'\u03c8', +0x1d734:u'\u03c9', 0x1d747:u'\u03c3', 0x1d756:u'\u03b1', 0x1d757:u'\u03b2', +0x1d758:u'\u03b3', 0x1d759:u'\u03b4', 0x1d75a:u'\u03b5', 0x1d75b:u'\u03b6', +0x1d75c:u'\u03b7', 0x1d75d:u'\u03b8', 0x1d75e:u'\u03b9', 0x1d75f:u'\u03ba', 
+0x1d760:u'\u03bb', 0x1d761:u'\u03bc', 0x1d762:u'\u03bd', 0x1d763:u'\u03be', +0x1d764:u'\u03bf', 0x1d765:u'\u03c0', 0x1d766:u'\u03c1', 0x1d767:u'\u03b8', +0x1d768:u'\u03c3', 0x1d769:u'\u03c4', 0x1d76a:u'\u03c5', 0x1d76b:u'\u03c6', +0x1d76c:u'\u03c7', 0x1d76d:u'\u03c8', 0x1d76e:u'\u03c9', 0x1d781:u'\u03c3', +0x1d790:u'\u03b1', 0x1d791:u'\u03b2', 0x1d792:u'\u03b3', 0x1d793:u'\u03b4', +0x1d794:u'\u03b5', 0x1d795:u'\u03b6', 0x1d796:u'\u03b7', 0x1d797:u'\u03b8', +0x1d798:u'\u03b9', 0x1d799:u'\u03ba', 0x1d79a:u'\u03bb', 0x1d79b:u'\u03bc', +0x1d79c:u'\u03bd', 0x1d79d:u'\u03be', 0x1d79e:u'\u03bf', 0x1d79f:u'\u03c0', +0x1d7a0:u'\u03c1', 0x1d7a1:u'\u03b8', 0x1d7a2:u'\u03c3', 0x1d7a3:u'\u03c4', +0x1d7a4:u'\u03c5', 0x1d7a5:u'\u03c6', 0x1d7a6:u'\u03c7', 0x1d7a7:u'\u03c8', +0x1d7a8:u'\u03c9', 0x1d7bb:u'\u03c3', } + +def map_table_b3(code): + r = b3_exceptions.get(ord(code)) + if r is not None: return r + return code.lower() + + +def map_table_b2(a): + al = map_table_b3(a) + b = unicodedata.normalize("NFKC", al) + bl = u"".join([map_table_b3(ch) for ch in b]) + c = unicodedata.normalize("NFKC", bl) + if b != c: + return c + else: + return al + + +def in_table_c11(code): + return code == u" " + + +def in_table_c12(code): + return unicodedata.category(code) == "Zs" and code != u" " + +def in_table_c11_c12(code): + return unicodedata.category(code) == "Zs" + + +def in_table_c21(code): + return ord(code) < 128 and unicodedata.category(code) == "Cc" + +c22_specials = sets.Set([1757, 1807, 6158, 8204, 8205, 8232, 8233, 65279] + range(8288,8292) + range(8298,8304) + range(65529,65533) + range(119155,119163)) +def in_table_c22(code): + c = ord(code) + if c < 128: return False + if unicodedata.category(code) == "Cc": return True + return c in c22_specials + +def in_table_c21_c22(code): + return unicodedata.category(code) == "Cc" or \ + ord(code) in c22_specials + + +def in_table_c3(code): + return unicodedata.category(code) == "Co" + + +def in_table_c4(code): + c = ord(code) + if c < 0xFDD0: return False + if c < 0xFDF0: return True + return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF) + + +def in_table_c5(code): + return unicodedata.category(code) == "Cs" + + +c6_set = sets.Set(range(65529,65534)) +def in_table_c6(code): + return ord(code) in c6_set + + +c7_set = sets.Set(range(12272,12284)) +def in_table_c7(code): + return ord(code) in c7_set + + +c8_set = sets.Set([832, 833, 8206, 8207] + range(8234,8239) + range(8298,8304)) +def in_table_c8(code): + return ord(code) in c8_set + + +c9_set = sets.Set([917505] + range(917536,917632)) +def in_table_c9(code): + return ord(code) in c9_set + + +def in_table_d1(code): + return unicodedata.bidirectional(code) in ("R","AL") + + +def in_table_d2(code): + return unicodedata.bidirectional(code) == "L" + diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 9a4f35f..769a40d 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -36,11 +36,307 @@ class RecodingTest(unittest.TestCase): # Python used to crash on this at exit because of a refcount # bug in _codecsmodule.c +# From RFC 3492 +punycode_testcases = [ + # A Arabic (Egyptian): + (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644" + u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F", + "egbpdaj6bu4bxfgehfvwxn"), + # B Chinese (simplified): + (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587", + "ihqwcrb4cv8a8dqg056pqjye"), + # C Chinese (traditional): + (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587", + "ihqwctvzc91f659drss3x8bo0yb"), + # D Czech: 
Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky + (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074" + u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D" + u"\u0065\u0073\u006B\u0079", + "Proprostnemluvesky-uyb24dma41a"), + # E Hebrew: + (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8" + u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2" + u"\u05D1\u05E8\u05D9\u05EA", + "4dbcagdahymbxekheh6e0a7fei0b"), + # F Hindi (Devanagari): + (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D" + u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939" + u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947" + u"\u0939\u0948\u0902", + "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"), + + #(G) Japanese (kanji and hiragana): + (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092" + u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B", + "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"), + + # (H) Korean (Hangul syllables): + (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774" + u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74" + u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C", + "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j" + "psd879ccm6fea98c"), + + # (I) Russian (Cyrillic): + (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E" + u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440" + u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A" + u"\u0438", + "b1abfaaepdrnnbgefbaDotcwatmq2g4l"), + + # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol + (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070" + u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070" + u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061" + u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070" + u"\u0061\u00F1\u006F\u006C", + "PorqunopuedensimplementehablarenEspaol-fmd56a"), + + # (K) Vietnamese: + # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\ + # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t + (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B" + u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068" + u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067" + u"\u0056\u0069\u1EC7\u0074", + "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"), + + + #(L) 3<nen>B<gumi><kinpachi><sensei> + (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F", + "3B-ww4c5e180e575a65lsy2b"), + + # (M) <amuro><namie>-with-SUPER-MONKEYS + (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074" + u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D" + u"\u004F\u004E\u004B\u0045\u0059\u0053", + "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"), + + # (N) Hello-Another-Way-<sorezore><no><basho> + (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F" + u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D" + u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240", + "Hello-Another-Way--fc4qua05auwb3674vfr0b"), + + # (O) <hitotsu><yane><no><shita>2 + (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032", + "2-u9tlzr9756bt3uc0v"), + + # (P) Maji<de>Koi<suru>5<byou><mae> + (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059" + u"\u308B\u0035\u79D2\u524D", + "MajiKoi5-783gue6qz075azm5e"), + + # (Q) <pafii>de<runba> + (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0", + "de-jg4avhby1noc0d"), + + # (R) <sono><supiido><de> + (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067", + "d9juau41awczczp"), + + # (S) -> $1.00 <- + (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020" + u"\u003C\u002D", + "-> $1.00 
<--") + ] + +for i in punycode_testcases: + if len(i)!=2: + print repr(i) + +class PunycodeTest(unittest.TestCase): + def test_encode(self): + for uni, puny in punycode_testcases: + # Need to convert both strings to lower case, since + # some of the extended encodings use upper case, but our + # code produces only lower case. Converting just puny to + # lower is also insufficient, since some of the input characters + # are upper case. + self.assertEquals(uni.encode("punycode").lower(), puny.lower()) + + def test_decode(self): + for uni, puny in punycode_testcases: + self.assertEquals(uni, puny.decode("punycode")) + +# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html +nameprep_tests = [ + # 3.1 Map to nothing. + ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar' + '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef' + '\xb8\x8f\xef\xbb\xbf', + 'foobarbaz'), + # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045. + ('CAFE', + 'cafe'), + # 3.3 Case folding 8bit U+00DF (german sharp s). + # The original test case is bogus; it says \xc3\xdf + ('\xc3\x9f', + 'ss'), + # 3.4 Case folding U+0130 (turkish capital I with dot). + ('\xc4\xb0', + 'i\xcc\x87'), + # 3.5 Case folding multibyte U+0143 U+037A. + ('\xc5\x83\xcd\xba', + '\xc5\x84 \xce\xb9'), + # 3.6 Case folding U+2121 U+33C6 U+1D7BB. + # XXX: skip this as it fails in UCS-2 mode + #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb', + # 'telc\xe2\x88\x95kg\xcf\x83'), + (None, None), + # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA. + ('j\xcc\x8c\xc2\xa0\xc2\xaa', + '\xc7\xb0 a'), + # 3.8 Case folding U+1FB7 and normalization. + ('\xe1\xbe\xb7', + '\xe1\xbe\xb6\xce\xb9'), + # 3.9 Self-reverting case folding U+01F0 and normalization. + # The original test case is bogus, it says `\xc7\xf0' + ('\xc7\xb0', + '\xc7\xb0'), + # 3.10 Self-reverting case folding U+0390 and normalization. + ('\xce\x90', + '\xce\x90'), + # 3.11 Self-reverting case folding U+03B0 and normalization. + ('\xce\xb0', + '\xce\xb0'), + # 3.12 Self-reverting case folding U+1E96 and normalization. + ('\xe1\xba\x96', + '\xe1\xba\x96'), + # 3.13 Self-reverting case folding U+1F56 and normalization. + ('\xe1\xbd\x96', + '\xe1\xbd\x96'), + # 3.14 ASCII space character U+0020. + (' ', + ' '), + # 3.15 Non-ASCII 8bit space character U+00A0. + ('\xc2\xa0', + ' '), + # 3.16 Non-ASCII multibyte space character U+1680. + ('\xe1\x9a\x80', + None), + # 3.17 Non-ASCII multibyte space character U+2000. + ('\xe2\x80\x80', + ' '), + # 3.18 Zero Width Space U+200b. + ('\xe2\x80\x8b', + ''), + # 3.19 Non-ASCII multibyte space character U+3000. + ('\xe3\x80\x80', + ' '), + # 3.20 ASCII control characters U+0010 U+007F. + ('\x10\x7f', + '\x10\x7f'), + # 3.21 Non-ASCII 8bit control character U+0085. + ('\xc2\x85', + None), + # 3.22 Non-ASCII multibyte control character U+180E. + ('\xe1\xa0\x8e', + None), + # 3.23 Zero Width No-Break Space U+FEFF. + ('\xef\xbb\xbf', + ''), + # 3.24 Non-ASCII control character U+1D175. + ('\xf0\x9d\x85\xb5', + None), + # 3.25 Plane 0 private use character U+F123. + ('\xef\x84\xa3', + None), + # 3.26 Plane 15 private use character U+F1234. + ('\xf3\xb1\x88\xb4', + None), + # 3.27 Plane 16 private use character U+10F234. + ('\xf4\x8f\x88\xb4', + None), + # 3.28 Non-character code point U+8FFFE. + ('\xf2\x8f\xbf\xbe', + None), + # 3.29 Non-character code point U+10FFFF. + ('\xf4\x8f\xbf\xbf', + None), + # 3.30 Surrogate code U+DF42. + ('\xed\xbd\x82', + None), + # 3.31 Non-plain text character U+FFFD. 
+ ('\xef\xbf\xbd', + None), + # 3.32 Ideographic description character U+2FF5. + ('\xe2\xbf\xb5', + None), + # 3.33 Display property character U+0341. + ('\xcd\x81', + '\xcc\x81'), + # 3.34 Left-to-right mark U+200E. + ('\xe2\x80\x8e', + None), + # 3.35 Deprecated U+202A. + ('\xe2\x80\xaa', + None), + # 3.36 Language tagging character U+E0001. + ('\xf3\xa0\x80\x81', + None), + # 3.37 Language tagging character U+E0042. + ('\xf3\xa0\x81\x82', + None), + # 3.38 Bidi: RandALCat character U+05BE and LCat characters. + ('foo\xd6\xbebar', + None), + # 3.39 Bidi: RandALCat character U+FD50 and LCat characters. + ('foo\xef\xb5\x90bar', + None), + # 3.40 Bidi: RandALCat character U+FB38 and LCat characters. + ('foo\xef\xb9\xb6bar', + 'foo \xd9\x8ebar'), + # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031. + ('\xd8\xa71', + None), + # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628. + ('\xd8\xa71\xd8\xa8', + '\xd8\xa71\xd8\xa8'), + # 3.43 Unassigned code point U+E0002. + ('\xf3\xa0\x80\x82', + None), + # 3.44 Larger test (shrinking). + # Original test case reads \xc3\xdf + ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2' + '\xaa\xce\xb0\xe2\x80\x80', + 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '), + # 3.45 Larger test (expanding). + # Original test case reads \xc3\x9f + ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c' + '\x80', + 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3' + '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82' + '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88') + ] + + +class NameprepTest(unittest.TestCase): + def test_nameprep(self): + from encodings.idna import nameprep + for pos, (orig, prepped) in enumerate(nameprep_tests): + if orig is None: + # Skipped + continue + # The Unicode strings are given in UTF-8 + orig = unicode(orig, "utf-8") + if prepped is None: + # Input contains prohibited characters + self.assertRaises(UnicodeError, nameprep, orig) + else: + prepped = unicode(prepped, "utf-8") + try: + self.assertEquals(nameprep(orig), prepped) + except Exception,e: + raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e))) + def test_main(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(UTF16Test)) suite.addTest(unittest.makeSuite(EscapeDecodeTest)) suite.addTest(unittest.makeSuite(RecodingTest)) + suite.addTest(unittest.makeSuite(PunycodeTest)) + suite.addTest(unittest.makeSuite(NameprepTest)) test_support.run_suite(suite) |