summaryrefslogtreecommitdiffstats
path: root/Lib/encodings
diff options
context:
space:
mode:
authorMartin v. Löwis <martin@v.loewis.de>2003-04-18 10:39:54 (GMT)
committerMartin v. Löwis <martin@v.loewis.de>2003-04-18 10:39:54 (GMT)
commit2548c730c17d766ca04b2bf633552655f7f96cdf (patch)
treeb128f16abd8b4c3058d1be4093f30bfb5454b59e /Lib/encodings
parent8d17a90b830ae9b9c672a504f01d4f93bac3d23d (diff)
downloadcpython-2548c730c17d766ca04b2bf633552655f7f96cdf.zip
cpython-2548c730c17d766ca04b2bf633552655f7f96cdf.tar.gz
cpython-2548c730c17d766ca04b2bf633552655f7f96cdf.tar.bz2
Implement IDNA (Internationalized Domain Names in Applications).
Diffstat (limited to 'Lib/encodings')
-rw-r--r--Lib/encodings/idna.py187
-rw-r--r--Lib/encodings/punycode.py222
2 files changed, 409 insertions, 0 deletions
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
new file mode 100644
index 0000000..7e4d04e
--- /dev/null
+++ b/Lib/encodings/idna.py
@@ -0,0 +1,187 @@
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

import stringprep, unicodedata, re, codecs

# IDNA section 3.1: the four label separators -- full stop, ideographic
# full stop, fullwidth full stop, halfwidth ideographic full stop.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE (ASCII Compatible Encoding) prefix, kept both
# as a byte string and as a unicode string for comparisons against
# either form of label.
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
+
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the RFC 3491 nameprep profile of stringprep to *label*.

    Returns the prepared unicode label.  Raises UnicodeError if the
    label contains a prohibited character or violates the bidi
    requirements.
    """
    # Map: drop characters that map to nothing (table B.1), case-fold
    # the rest via table B.2.
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = u"".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError, "Invalid character %s" % repr(c)

    # Check bidi
    RandAL = map(stringprep.in_table_d1, label)
    for c in RandAL:
        if c:
            # There is a RandAL char in the string. Must perform further
            # tests:
            # 1) The characters in section 5.8 MUST be prohibited.
            # This is table C.8, which was already checked
            # 2) If a string contains any RandALCat character, the string
            # MUST NOT contain any LCat character.
            if filter(stringprep.in_table_d2, label):
                raise UnicodeError, "Violation of BIDI requirement 2"

            # 3) If a string contains any RandALCat character, a
            # RandALCat character MUST be the first character of the
            # string, and a RandALCat character MUST be the last
            # character of the string.
            if not RandAL[0] or not RandAL[-1]:
                raise UnicodeError, "Violation of BIDI requirement 3"

    return label
+
+def ToASCII(label):
+ try:
+ # Step 1: try ASCII
+ label = label.encode("ascii")
+ except UnicodeError:
+ pass
+ else:
+ # Skip to step 3: UseSTD3ASCIIRules is false, so
+ # Skip to step 8.
+ if 0 < len(label) < 64:
+ return label
+ raise UnicodeError, "label too long"
+
+ # Step 2: nameprep
+ label = nameprep(label)
+
+ # Step 3: UseSTD3ASCIIRules is false
+ # Step 4: try ASCII
+ try:
+ label = label.encode("ascii")
+ except UnicodeError:
+ pass
+ else:
+ # Skip to step 8.
+ if 0 < len(label) < 64:
+ return label
+ raise UnicodeError, "label too long"
+
+ # Step 5: Check ACE prefix
+ if label.startswith(uace_prefix):
+ raise UnicodeError, "Label starts with ACE prefix"
+
+ # Step 6: Encode with PUNYCODE
+ label = label.encode("punycode")
+
+ # Step 7: Prepend ACE prefix
+ label = ace_prefix + label
+
+ # Step 8: Check size
+ if 0 < len(label) < 64:
+ return label
+ raise UnicodeError, "label too long"
+
def ToUnicode(label):
    """Convert a (possibly ACE-encoded) label back to unicode.

    Implements the ToUnicode operation of RFC 3490 section 4.2.
    Labels without the ACE prefix are returned unchanged (as unicode);
    ACE labels are punycode-decoded and checked to round-trip through
    ToASCII.
    """
    # Step 1: Check for ASCII
    if isinstance(label, str):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError, "Invalid character in IDN label"
    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError, ("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
+
+### Codec APIs
+
class Codec(codecs.Codec):
    """IDNA codec: converts whole domain names, label by label."""

    def encode(self,input,errors='strict'):
        # Split on any IDNA dot separator, ToASCII each label, and
        # re-join with U+002E.

        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError, "unsupported error handling "+errors

        result = []
        for label in dots.split(input):
            result.append(ToASCII(label))
        # Join with U+002E
        return ".".join(result), len(input)

    def decode(self,input,errors='strict'):

        if errors != 'strict':
            raise UnicodeError, "Unsupported error handling "+errors

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            # (the conversion is performed only as a validity check;
            # its result is deliberately discarded)
            unicode(input, "ascii")
            labels = input.split(".")

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return u".".join(result), len(input)
+
class StreamWriter(Codec,codecs.StreamWriter):
    # Inherits encode() from Codec; no stream-specific state is needed.
    pass
+
class StreamReader(Codec,codecs.StreamReader):
    # Inherits decode() from Codec; no stream-specific state is needed.
    pass
+
### encodings module API

def getregentry():
    """Return the (encode, decode, streamreader, streamwriter) entry
    used by the encodings-package registry."""
    codec = Codec()
    return (codec.encode, codec.decode, StreamReader, StreamWriter)
diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py
new file mode 100644
index 0000000..e7f2d45
--- /dev/null
+++ b/Lib/encodings/punycode.py
@@ -0,0 +1,222 @@
# -*- coding: iso-8859-1 -*-
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""

import codecs

##################### Encoding #####################################
+
def segregate(str):
    """3.1 Basic code point segregation

    Split *str* into (basic, extended): an ASCII byte string holding
    the code points below 128, and a sorted list of the distinct
    non-ASCII characters.
    """
    base = []
    extended = {}
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            # use a dict purely to deduplicate the extended code points
            extended[c] = 1
    extended = extended.keys()
    extended.sort()
    return "".join(base).encode("ascii"),extended
+
def selective_len(str, max):
    """Return the length of str, considering only characters below max."""
    return len([c for c in str if ord(c) < max])
+
def selective_find(str, char, index, pos):
    """Return a pair (index, pos), indicating the next occurrence of
    char in str. index is the position of the character considering
    only ordinals up to and including char, and pos is the position in
    the full string. index/pos is the starting position in the full
    string."""
    length = len(str)
    pos += 1
    while pos != length:
        c = str[pos]
        if c == char:
            return index + 1, pos
        if c < char:
            # characters below char also advance the selective index
            index += 1
        pos += 1
    return (-1, -1)
+
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    Produce the list of deltas encoding where each extended character
    is inserted, processing the extended characters in sorted order.
    """
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        curlen = selective_len(str, char)
        # stepping up to the next (higher) extended code point costs one
        # full pass over the currently visible string per skipped value
        delta = (curlen+1) * (char - oldchar)
        while 1:
            index,pos = selective_find(str,c,index,pos)
            if index == -1:
                break
            delta += index - oldindex
            result.append(delta-1)
            oldindex = index
            delta = 0
        oldchar = char

    return result
+
def T(j, bias):
    """Threshold function of RFC 3492 (tmin = 1, tmax = 26, base = 36)."""
    # clamp 36 * (j + 1) - bias into the closed range [1, 26]
    return min(26, max(1, 36 * (j + 1) - bias))

digits = "abcdefghijklmnopqrstuvwxyz0123456789"

def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Return N encoded under *bias* as a list of digit characters."""
    result = []
    j = 0
    while 1:
        t = T(j, bias)
        if N < t:
            result.append(digits[N])
            return result
        N, rem = divmod(N - t, 36 - t)
        result.append(digits[t + rem])
        j += 1
+
def adapt(delta, first, numchars):
    """3.4 Bias adaptation.

    Punycode parameters: damp = 700, skew = 38, base = 36, tmin = 1,
    tmax = 26."""
    # damp the very first delta, halve all subsequent ones
    if first:
        delta = delta // 700
    else:
        delta = delta // 2
    delta = delta + delta // numchars
    bias = 0
    # ((base - tmin) * tmax) // 2 == 455
    while delta > 455:
        delta = delta // 35     # base - tmin
        bias = bias + 36        # base
    return bias + (36 * delta // (delta + 38))
+
+
def generate_integers(baselen, deltas):
    """Encode every delta as a generalized integer, adapting the bias
    after each one.  Punycode parameter: initial bias = 72."""
    chunks = []
    bias = 72
    points = 0
    for delta in deltas:
        chunks.extend(generate_generalized_integer(delta, bias))
        bias = adapt(delta, points == 0, baselen + points + 1)
        points += 1
    return "".join(chunks)
+
def punycode_encode(text):
    """Punycode-encode *text* (RFC 3492): the basic code points, a "-"
    separator if any basic code points exist, then the encoded deltas
    for the extended code points."""
    basic, nonbasic = segregate(text)
    basic = basic.encode("ascii")
    tail = generate_integers(len(basic), insertion_unsort(text, nonbasic))
    if not basic:
        return tail
    return basic + "-" + tail
+
+##################### Decoding #####################################
+
+def decode_generalized_number(extended, extpos, bias, errors):
+ """3.3 Generalized variable-length integers"""
+ result = 0
+ w = 1
+ j = 0
+ while 1:
+ try:
+ char = ord(extended[extpos])
+ except IndexError:
+ if errors == "strict":
+ raise UnicodeError, "incomplete punicode string"
+ return extpos + 1, None
+ extpos += 1
+ if 0x41 <= char <= 0x5A: # A-Z
+ digit = char - 0x41
+ elif 0x30 <= char <= 0x39:
+ digit = char - 22 # 0x30-26
+ elif errors == "strict":
+ raise UnicodeError("Invalid extended code point '%s'"
+ % extended[extpos])
+ else:
+ return extpos, None
+ t = T(j, bias)
+ result += digit * w
+ if digit < t:
+ return extpos, result
+ w = w * (36 - t)
+ j += 1
+
+
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding (decoding counterpart)

    Replay the deltas encoded in *extended*, inserting each decoded
    character into *base* at the decoded position."""
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError, ("Invalid character U+%x" % char)
            # non-strict: substitute '?' for the out-of-range character
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + unichr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
+
def punycode_decode(text, errors):
    """Split *text* at the last hyphen into basic text and extended
    digits, then decode via insertion_sort."""
    pos = text.rfind("-")
    if pos == -1:
        base = ""
        extended = text
    else:
        base = text[:pos]
        extended = text[pos+1:]
    base = unicode(base, "ascii", errors)
    # digit decoding operates on the ordinals of A-Z / 0-9
    extended = extended.upper()
    return insertion_sort(base, extended, errors)
+
+### Codec APIs
+
class Codec(codecs.Codec):
    """Punycode codec (RFC 3492)."""

    def encode(self,input,errors='strict'):
        # NOTE(review): unlike decode, the errors argument is not
        # validated here, and punycode_encode never consults it.

        res = punycode_encode(input)
        return res, len(input)

    def decode(self,input,errors='strict'):

        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError, "Unsupported error handling "+errors
        res = punycode_decode(input, errors)
        return res, len(input)
+
class StreamWriter(Codec,codecs.StreamWriter):
    # Inherits encode() from Codec; no stream-specific state is needed.
    pass
+
class StreamReader(Codec,codecs.StreamReader):
    # Inherits decode() from Codec; no stream-specific state is needed.
    pass
+
### encodings module API

def getregentry():
    """Return the (encode, decode, streamreader, streamwriter) entry
    used by the encodings-package registry."""
    codec = Codec()
    return (codec.encode, codec.decode, StreamReader, StreamWriter)