diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2003-04-18 10:39:54 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2003-04-18 10:39:54 (GMT) |
commit | 2548c730c17d766ca04b2bf633552655f7f96cdf (patch) | |
tree | b128f16abd8b4c3058d1be4093f30bfb5454b59e /Lib/encodings | |
parent | 8d17a90b830ae9b9c672a504f01d4f93bac3d23d (diff) | |
download | cpython-2548c730c17d766ca04b2bf633552655f7f96cdf.zip cpython-2548c730c17d766ca04b2bf633552655f7f96cdf.tar.gz cpython-2548c730c17d766ca04b2bf633552655f7f96cdf.tar.bz2 |
Implement IDNA (Internationalized Domain Names in Applications).
Diffstat (limited to 'Lib/encodings')
-rw-r--r-- | Lib/encodings/idna.py | 187 | ||||
-rw-r--r-- | Lib/encodings/punycode.py | 222 |
2 files changed, 409 insertions, 0 deletions
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py new file mode 100644 index 0000000..7e4d04e --- /dev/null +++ b/Lib/encodings/idna.py @@ -0,0 +1,187 @@ +# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) + +import stringprep, unicodedata, re, codecs + +# IDNA section 3.1 +dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]") + +# IDNA section 5 +ace_prefix = "xn--" +uace_prefix = unicode(ace_prefix, "ascii") + +# This assumes query strings, so AllowUnassigned is true +def nameprep(label): + # Map + newlabel = [] + for c in label: + if stringprep.in_table_b1(c): + # Map to nothing + continue + newlabel.append(stringprep.map_table_b2(c)) + label = u"".join(newlabel) + + # Normalize + label = unicodedata.normalize("NFKC", label) + + # Prohibit + for c in label: + if stringprep.in_table_c12(c) or \ + stringprep.in_table_c22(c) or \ + stringprep.in_table_c3(c) or \ + stringprep.in_table_c4(c) or \ + stringprep.in_table_c5(c) or \ + stringprep.in_table_c6(c) or \ + stringprep.in_table_c7(c) or \ + stringprep.in_table_c8(c) or \ + stringprep.in_table_c9(c): + raise UnicodeError, "Invalid character %s" % repr(c) + + # Check bidi + RandAL = map(stringprep.in_table_d1, label) + for c in RandAL: + if c: + # There is a RandAL char in the string. Must perform further + # tests: + # 1) The characters in section 5.8 MUST be prohibited. + # This is table C.8, which was already checked + # 2) If a string contains any RandALCat character, the string + # MUST NOT contain any LCat character. + if filter(stringprep.in_table_d2, label): + raise UnicodeError, "Violation of BIDI requirement 2" + + # 3) If a string contains any RandALCat character, a + # RandALCat character MUST be the first character of the + # string, and a RandALCat character MUST be the last + # character of the string. + if not RandAL[0] or not RandAL[-1]: + raise UnicodeError, "Violation of BIDI requirement 3" + + return label + +def ToASCII(label): + try: + # Step 1: try ASCII + label = label.encode("ascii") + except UnicodeError: + pass + else: + # Skip to step 3: UseSTD3ASCIIRules is false, so + # Skip to step 8. + if 0 < len(label) < 64: + return label + raise UnicodeError, "label too long" + + # Step 2: nameprep + label = nameprep(label) + + # Step 3: UseSTD3ASCIIRules is false + # Step 4: try ASCII + try: + label = label.encode("ascii") + except UnicodeError: + pass + else: + # Skip to step 8. + if 0 < len(label) < 64: + return label + raise UnicodeError, "label too long" + + # Step 5: Check ACE prefix + if label.startswith(uace_prefix): + raise UnicodeError, "Label starts with ACE prefix" + + # Step 6: Encode with PUNYCODE + label = label.encode("punycode") + + # Step 7: Prepend ACE prefix + label = ace_prefix + label + + # Step 8: Check size + if 0 < len(label) < 64: + return label + raise UnicodeError, "label too long" + +def ToUnicode(label): + # Step 1: Check for ASCII + if isinstance(label, str): + pure_ascii = True + else: + try: + label = label.encode("ascii") + pure_ascii = True + except UnicodeError: + pure_ascii = False + if not pure_ascii: + # Step 2: Perform nameprep + label = nameprep(label) + # It doesn't say this, but apparently, it should be ASCII now + try: + label = label.encode("ascii") + except UnicodeError: + raise UnicodeError, "Invalid character in IDN label" + # Step 3: Check for ACE prefix + if not label.startswith(ace_prefix): + return unicode(label, "ascii") + + # Step 4: Remove ACE prefix + label1 = label[len(ace_prefix):] + + # Step 5: Decode using PUNYCODE + result = label1.decode("punycode") + + # Step 6: Apply ToASCII + label2 = ToASCII(result) + + # Step 7: Compare the result of step 6 with the one of step 3 + # label2 will already be in lower case. + if label.lower() != label2: + raise UnicodeError, ("IDNA does not round-trip", label, label2) + + # Step 8: return the result of step 5 + return result + +### Codec APIs + +class Codec(codecs.Codec): + def encode(self,input,errors='strict'): + + if errors != 'strict': + # IDNA is quite clear that implementations must be strict + raise UnicodeError, "unsupported error handling "+errors + + result = [] + for label in dots.split(input): + result.append(ToASCII(label)) + # Join with U+002E + return ".".join(result), len(input) + + def decode(self,input,errors='strict'): + + if errors != 'strict': + raise UnicodeError, "Unsupported error handling "+errors + + # IDNA allows decoding to operate on Unicode strings, too. + if isinstance(input, unicode): + labels = dots.split(input) + else: + # Must be ASCII string + unicode(input, "ascii") + labels = input.split(".") + + result = [] + for label in labels: + result.append(ToUnicode(label)) + + return u".".join(result), len(input) + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py new file mode 100644 index 0000000..e7f2d45 --- /dev/null +++ b/Lib/encodings/punycode.py @@ -0,0 +1,222 @@ +# -*- coding: iso-8859-1 -*- +""" Codec for the Punicode encoding, as specified in RFC 3492 + +Written by Martin v. Löwis. +""" + +import codecs + +##################### Encoding ##################################### + +def segregate(str): + """3.1 Basic code point segregation""" + base = [] + extended = {} + for c in str: + if ord(c) < 128: + base.append(c) + else: + extended[c] = 1 + extended = extended.keys() + extended.sort() + return "".join(base).encode("ascii"),extended + +def selective_len(str, max): + """Return the length of str, considering only characters below max.""" + res = 0 + for c in str: + if ord(c) < max: + res += 1 + return res + +def selective_find(str, char, index, pos): + """Return a pair (index, pos), indicating the next occurrence of + char in str. index is the position of the character considering + only ordinals up to and including char, and pos is the position in + the full string. index/pos is the starting position in the full + string.""" + + l = len(str) + while 1: + pos += 1 + if pos == l: + return (-1, -1) + c = str[pos] + if c == char: + return index+1, pos + elif c < char: + index += 1 + +def insertion_unsort(str, extended): + """3.2 Insertion unsort coding""" + oldchar = 0x80 + result = [] + oldindex = -1 + for c in extended: + index = pos = -1 + char = ord(c) + curlen = selective_len(str, char) + delta = (curlen+1) * (char - oldchar) + while 1: + index,pos = selective_find(str,c,index,pos) + if index == -1: + break + delta += index - oldindex + result.append(delta-1) + oldindex = index + delta = 0 + oldchar = char + + return result + +def T(j, bias): + # Punycode parameters: tmin = 1, tmax = 26, base = 36 + res = 36 * (j + 1) - bias + if res < 1: return 1 + if res > 26: return 26 + return res + +digits = "abcdefghijklmnopqrstuvwxyz0123456789" +def generate_generalized_integer(N, bias): + """3.3 Generalized variable-length integers""" + result = [] + j = 0 + while 1: + t = T(j, bias) + if N < t: + result.append(digits[N]) + return result + result.append(digits[t + ((N - t) % (36 - t))]) + N = (N - t) // (36 - t) + j += 1 + +def adapt(delta, first, numchars): + if first: + delta //= 700 + else: + delta //= 2 + delta += delta // numchars + # ((base - tmin) * tmax) // 2 == 455 + divisions = 0 + while delta > 455: + delta = delta // 35 # base - tmin + divisions += 36 + bias = divisions + (36 * delta // (delta + 38)) + return bias + + +def generate_integers(baselen, deltas): + """3.4 Bias adaptation""" + # Punycode parameters: initial bias = 72, damp = 700, skew = 38 + result = [] + bias = 72 + for points, delta in enumerate(deltas): + s = generate_generalized_integer(delta, bias) + result.extend(s) + bias = adapt(delta, points==0, baselen+points+1) + return "".join(result) + +def punycode_encode(text): + base, extended = segregate(text) + base = base.encode("ascii") + deltas = insertion_unsort(text, extended) + extended = generate_integers(len(base), deltas) + if base: + return base + "-" + extended + return extended + +##################### Decoding ##################################### + +def decode_generalized_number(extended, extpos, bias, errors): + """3.3 Generalized variable-length integers""" + result = 0 + w = 1 + j = 0 + while 1: + try: + char = ord(extended[extpos]) + except IndexError: + if errors == "strict": + raise UnicodeError, "incomplete punicode string" + return extpos + 1, None + extpos += 1 + if 0x41 <= char <= 0x5A: # A-Z + digit = char - 0x41 + elif 0x30 <= char <= 0x39: + digit = char - 22 # 0x30-26 + elif errors == "strict": + raise UnicodeError("Invalid extended code point '%s'" + % extended[extpos]) + else: + return extpos, None + t = T(j, bias) + result += digit * w + if digit < t: + return extpos, result + w = w * (36 - t) + j += 1 + + +def insertion_sort(base, extended, errors): + """3.2 Insertion unsort coding""" + char = 0x80 + pos = -1 + bias = 72 + extpos = 0 + while extpos < len(extended): + newpos, delta = decode_generalized_number(extended, extpos, + bias, errors) + if delta is None: + # There was an error in decoding. We can't continue because + # synchronization is lost. + return base + pos += delta+1 + char += pos // (len(base) + 1) + if char > 0x10FFFF: + if errors == "strict": + raise UnicodeError, ("Invalid character U+%x" % char) + char = ord('?') + pos = pos % (len(base) + 1) + base = base[:pos] + unichr(char) + base[pos:] + bias = adapt(delta, (extpos == 0), len(base)) + extpos = newpos + return base + +def punycode_decode(text, errors): + pos = text.rfind("-") + if pos == -1: + base = "" + extended = text + else: + base = text[:pos] + extended = text[pos+1:] + base = unicode(base, "ascii", errors) + extended = extended.upper() + return insertion_sort(base, extended, errors) + +### Codec APIs + +class Codec(codecs.Codec): + def encode(self,input,errors='strict'): + + res = punycode_encode(input) + return res, len(input) + + def decode(self,input,errors='strict'): + + if errors not in ('strict', 'replace', 'ignore'): + raise UnicodeError, "Unsupported error handling "+errors + res = punycode_decode(input, errors) + return res, len(input) + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) |