summaryrefslogtreecommitdiffstats
path: root/Lib/encodings/idna.py
diff options
context:
space:
mode:
authorMartin v. Löwis <martin@v.loewis.de>2003-04-18 10:39:54 (GMT)
committerMartin v. Löwis <martin@v.loewis.de>2003-04-18 10:39:54 (GMT)
commit2548c730c17d766ca04b2bf633552655f7f96cdf (patch)
treeb128f16abd8b4c3058d1be4093f30bfb5454b59e /Lib/encodings/idna.py
parent8d17a90b830ae9b9c672a504f01d4f93bac3d23d (diff)
downloadcpython-2548c730c17d766ca04b2bf633552655f7f96cdf.zip
cpython-2548c730c17d766ca04b2bf633552655f7f96cdf.tar.gz
cpython-2548c730c17d766ca04b2bf633552655f7f96cdf.tar.bz2
Implement IDNA (Internationalized Domain Names in Applications).
Diffstat (limited to 'Lib/encodings/idna.py')
-rw-r--r--Lib/encodings/idna.py187
1 files changed, 187 insertions, 0 deletions
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
new file mode 100644
index 0000000..7e4d04e
--- /dev/null
+++ b/Lib/encodings/idna.py
@@ -0,0 +1,187 @@
+# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
+
+import stringprep, unicodedata, re, codecs
+
+# IDNA section 3.1
+dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
+
+# IDNA section 5
+ace_prefix = "xn--"
+uace_prefix = unicode(ace_prefix, "ascii")
+
+# This assumes query strings, so AllowUnassigned is true
+def nameprep(label):
+ # Map
+ newlabel = []
+ for c in label:
+ if stringprep.in_table_b1(c):
+ # Map to nothing
+ continue
+ newlabel.append(stringprep.map_table_b2(c))
+ label = u"".join(newlabel)
+
+ # Normalize
+ label = unicodedata.normalize("NFKC", label)
+
+ # Prohibit
+ for c in label:
+ if stringprep.in_table_c12(c) or \
+ stringprep.in_table_c22(c) or \
+ stringprep.in_table_c3(c) or \
+ stringprep.in_table_c4(c) or \
+ stringprep.in_table_c5(c) or \
+ stringprep.in_table_c6(c) or \
+ stringprep.in_table_c7(c) or \
+ stringprep.in_table_c8(c) or \
+ stringprep.in_table_c9(c):
+ raise UnicodeError, "Invalid character %s" % repr(c)
+
+ # Check bidi
+ RandAL = map(stringprep.in_table_d1, label)
+ for c in RandAL:
+ if c:
+ # There is a RandAL char in the string. Must perform further
+ # tests:
+ # 1) The characters in section 5.8 MUST be prohibited.
+ # This is table C.8, which was already checked
+ # 2) If a string contains any RandALCat character, the string
+ # MUST NOT contain any LCat character.
+ if filter(stringprep.in_table_d2, label):
+ raise UnicodeError, "Violation of BIDI requirement 2"
+
+ # 3) If a string contains any RandALCat character, a
+ # RandALCat character MUST be the first character of the
+ # string, and a RandALCat character MUST be the last
+ # character of the string.
+ if not RandAL[0] or not RandAL[-1]:
+ raise UnicodeError, "Violation of BIDI requirement 3"
+
+ return label
+
+def ToASCII(label):
+ try:
+ # Step 1: try ASCII
+ label = label.encode("ascii")
+ except UnicodeError:
+ pass
+ else:
+ # Skip to step 3: UseSTD3ASCIIRules is false, so
+ # Skip to step 8.
+ if 0 < len(label) < 64:
+ return label
+ raise UnicodeError, "label too long"
+
+ # Step 2: nameprep
+ label = nameprep(label)
+
+ # Step 3: UseSTD3ASCIIRules is false
+ # Step 4: try ASCII
+ try:
+ label = label.encode("ascii")
+ except UnicodeError:
+ pass
+ else:
+ # Skip to step 8.
+ if 0 < len(label) < 64:
+ return label
+ raise UnicodeError, "label too long"
+
+ # Step 5: Check ACE prefix
+ if label.startswith(uace_prefix):
+ raise UnicodeError, "Label starts with ACE prefix"
+
+ # Step 6: Encode with PUNYCODE
+ label = label.encode("punycode")
+
+ # Step 7: Prepend ACE prefix
+ label = ace_prefix + label
+
+ # Step 8: Check size
+ if 0 < len(label) < 64:
+ return label
+ raise UnicodeError, "label too long"
+
+def ToUnicode(label):
+ # Step 1: Check for ASCII
+ if isinstance(label, str):
+ pure_ascii = True
+ else:
+ try:
+ label = label.encode("ascii")
+ pure_ascii = True
+ except UnicodeError:
+ pure_ascii = False
+ if not pure_ascii:
+ # Step 2: Perform nameprep
+ label = nameprep(label)
+ # It doesn't say this, but apparently, it should be ASCII now
+ try:
+ label = label.encode("ascii")
+ except UnicodeError:
+ raise UnicodeError, "Invalid character in IDN label"
+ # Step 3: Check for ACE prefix
+ if not label.startswith(ace_prefix):
+ return unicode(label, "ascii")
+
+ # Step 4: Remove ACE prefix
+ label1 = label[len(ace_prefix):]
+
+ # Step 5: Decode using PUNYCODE
+ result = label1.decode("punycode")
+
+ # Step 6: Apply ToASCII
+ label2 = ToASCII(result)
+
+ # Step 7: Compare the result of step 6 with the one of step 3
+ # label2 will already be in lower case.
+ if label.lower() != label2:
+ raise UnicodeError, ("IDNA does not round-trip", label, label2)
+
+ # Step 8: return the result of step 5
+ return result
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+ def encode(self,input,errors='strict'):
+
+ if errors != 'strict':
+ # IDNA is quite clear that implementations must be strict
+ raise UnicodeError, "unsupported error handling "+errors
+
+ result = []
+ for label in dots.split(input):
+ result.append(ToASCII(label))
+ # Join with U+002E
+ return ".".join(result), len(input)
+
+ def decode(self,input,errors='strict'):
+
+ if errors != 'strict':
+ raise UnicodeError, "Unsupported error handling "+errors
+
+ # IDNA allows decoding to operate on Unicode strings, too.
+ if isinstance(input, unicode):
+ labels = dots.split(input)
+ else:
+ # Must be ASCII string
+ unicode(input, "ascii")
+ labels = input.split(".")
+
+ result = []
+ for label in labels:
+ result.append(ToUnicode(label))
+
+ return u".".join(result), len(input)
+
+class StreamWriter(Codec,codecs.StreamWriter):
+ pass
+
+class StreamReader(Codec,codecs.StreamReader):
+ pass
+
+### encodings module API
+
+def getregentry():
+
+ return (Codec().encode,Codec().decode,StreamReader,StreamWriter)