diff options
Diffstat (limited to 'Lib/encodings/idna.py')
-rw-r--r-- | Lib/encodings/idna.py | 117 |
1 files changed, 48 insertions, 69 deletions
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index ea40585..ea90d67 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -4,11 +4,11 @@ import stringprep, re, codecs from unicodedata import ucd_3_2_0 as unicodedata # IDNA section 3.1 -dots = re.compile("[\u002E\u3002\uFF0E\uFF61]") +dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]") # IDNA section 5 -ace_prefix = b"xn--" -sace_prefix = "xn--" +ace_prefix = "xn--" +uace_prefix = unicode(ace_prefix, "ascii") # This assumes query strings, so AllowUnassigned is true def nameprep(label): @@ -19,7 +19,7 @@ def nameprep(label): # Map to nothing continue newlabel.append(stringprep.map_table_b2(c)) - label = "".join(newlabel) + label = u"".join(newlabel) # Normalize label = unicodedata.normalize("NFKC", label) @@ -38,7 +38,7 @@ def nameprep(label): raise UnicodeError("Invalid character %r" % c) # Check bidi - RandAL = [stringprep.in_table_d1(x) for x in label] + RandAL = map(stringprep.in_table_d1, label) for c in RandAL: if c: # There is a RandAL char in the string. Must perform further @@ -47,7 +47,7 @@ def nameprep(label): # This is table C.8, which was already checked # 2) If a string contains any RandALCat character, the string # MUST NOT contain any LCat character. - if any(stringprep.in_table_d2(x) for x in label): + if filter(stringprep.in_table_d2, label): raise UnicodeError("Violation of BIDI requirement 2") # 3) If a string contains any RandALCat character, a @@ -88,7 +88,7 @@ def ToASCII(label): raise UnicodeError("label empty or too long") # Step 5: Check ACE prefix - if label.startswith(sace_prefix): + if label.startswith(uace_prefix): raise UnicodeError("Label starts with ACE prefix") # Step 6: Encode with PUNYCODE @@ -104,7 +104,7 @@ def ToASCII(label): def ToUnicode(label): # Step 1: Check for ASCII - if isinstance(label, bytes): + if isinstance(label, str): pure_ascii = True else: try: @@ -122,7 +122,7 @@ def ToUnicode(label): raise UnicodeError("Invalid character in IDN label") # Step 3: Check for ACE prefix if not label.startswith(ace_prefix): - return str(label, "ascii") + return unicode(label, "ascii") # Step 4: Remove ACE prefix label1 = label[len(ace_prefix):] @@ -135,7 +135,7 @@ def ToUnicode(label): # Step 7: Compare the result of step 6 with the one of step 3 # label2 will already be in lower case. - if str(label, "ascii").lower() != str(label2, "ascii"): + if label.lower() != label2: raise UnicodeError("IDNA does not round-trip", label, label2) # Step 8: return the result of step 5 @@ -144,76 +144,55 @@ def ToUnicode(label): ### Codec APIs class Codec(codecs.Codec): - def encode(self, input, errors='strict'): + def encode(self,input,errors='strict'): if errors != 'strict': # IDNA is quite clear that implementations must be strict raise UnicodeError("unsupported error handling "+errors) if not input: - return b'', 0 + return "", 0 - try: - result = input.encode('ascii') - except UnicodeEncodeError: - pass - else: - # ASCII name: fast path - labels = result.split(b'.') - for label in labels[:-1]: - if not (0 < len(label) < 64): - raise UnicodeError("label empty or too long") - if len(labels[-1]) >= 64: - raise UnicodeError("label too long") - return result, len(input) - - result = bytearray() + result = [] labels = dots.split(input) - if labels and not labels[-1]: - trailing_dot = b'.' + if labels and len(labels[-1])==0: + trailing_dot = '.' del labels[-1] else: - trailing_dot = b'' + trailing_dot = '' for label in labels: - if result: - # Join with U+002E - result.extend(b'.') - result.extend(ToASCII(label)) - return bytes(result+trailing_dot), len(input) + result.append(ToASCII(label)) + # Join with U+002E + return ".".join(result)+trailing_dot, len(input) - def decode(self, input, errors='strict'): + def decode(self,input,errors='strict'): if errors != 'strict': raise UnicodeError("Unsupported error handling "+errors) if not input: - return "", 0 + return u"", 0 # IDNA allows decoding to operate on Unicode strings, too. - if not isinstance(input, bytes): - # XXX obviously wrong, see #3232 - input = bytes(input) - - if ace_prefix not in input: - # Fast path - try: - return input.decode('ascii'), len(input) - except UnicodeDecodeError: - pass - - labels = input.split(b".") + if isinstance(input, unicode): + labels = dots.split(input) + else: + # Must be ASCII string + input = str(input) + unicode(input, "ascii") + labels = input.split(".") if labels and len(labels[-1]) == 0: - trailing_dot = '.' + trailing_dot = u'.' del labels[-1] else: - trailing_dot = '' + trailing_dot = u'' result = [] for label in labels: result.append(ToUnicode(label)) - return ".".join(result)+trailing_dot, len(input) + return u".".join(result)+trailing_dot, len(input) class IncrementalEncoder(codecs.BufferedIncrementalEncoder): def _buffer_encode(self, input, errors, final): @@ -222,33 +201,32 @@ class IncrementalEncoder(codecs.BufferedIncrementalEncoder): raise UnicodeError("unsupported error handling "+errors) if not input: - return (b'', 0) + return ("", 0) labels = dots.split(input) - trailing_dot = b'' + trailing_dot = u'' if labels: if not labels[-1]: - trailing_dot = b'.' + trailing_dot = '.' del labels[-1] elif not final: # Keep potentially unfinished label until the next call del labels[-1] if labels: - trailing_dot = b'.' + trailing_dot = '.' - result = bytearray() + result = [] size = 0 for label in labels: + result.append(ToASCII(label)) if size: - # Join with U+002E - result.extend(b'.') size += 1 - result.extend(ToASCII(label)) size += len(label) - result += trailing_dot + # Join with U+002E + result = ".".join(result) + trailing_dot size += len(trailing_dot) - return (bytes(result), size) + return (result, size) class IncrementalDecoder(codecs.BufferedIncrementalDecoder): def _buffer_decode(self, input, errors, final): @@ -256,26 +234,27 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder): raise UnicodeError("Unsupported error handling "+errors) if not input: - return ("", 0) + return (u"", 0) # IDNA allows decoding to operate on Unicode strings, too. - if isinstance(input, str): + if isinstance(input, unicode): labels = dots.split(input) else: # Must be ASCII string - input = str(input, "ascii") + input = str(input) + unicode(input, "ascii") labels = input.split(".") - trailing_dot = '' + trailing_dot = u'' if labels: if not labels[-1]: - trailing_dot = '.' + trailing_dot = u'.' del labels[-1] elif not final: # Keep potentially unfinished label until the next call del labels[-1] if labels: - trailing_dot = '.' + trailing_dot = u'.' result = [] size = 0 @@ -285,7 +264,7 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder): size += 1 size += len(label) - result = ".".join(result) + trailing_dot + result = u".".join(result) + trailing_dot size += len(trailing_dot) return (result, size) |