diff options
-rw-r--r-- | Lib/codecs.py | 27 | ||||
-rw-r--r-- | Lib/encodings/idna.py | 78 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 73 |
3 files changed, 169 insertions, 9 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py index c138187..1518d75 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -181,6 +181,33 @@ class IncrementalEncoder(object): Resets the encoder to the initial state. """ +class BufferedIncrementalEncoder(IncrementalEncoder): + """ + This subclass of IncrementalEncoder can be used as the baseclass for an + incremental encoder if the encoder must keep some of the output in a + buffer between calls to encode(). + """ + def __init__(self, errors='strict'): + IncrementalEncoder.__init__(self, errors) + self.buffer = "" # unencoded input that is kept between calls to encode() + + def _buffer_encode(self, input, errors, final): + # Overwrite this method in subclasses: It must encode input + # and return an (output, length consumed) tuple + raise NotImplementedError + + def encode(self, input, final=False): + # encode input (taking the buffer into account) + data = self.buffer + input + (result, consumed) = self._buffer_encode(data, self.errors, final) + # keep unencoded input until the next call + self.buffer = data[consumed:] + return result + + def reset(self): + IncrementalEncoder.reset(self) + self.buffer = "" + class IncrementalDecoder(object): """ An IncrementalDecoder decodes an input in multiple steps. The input can be diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py index 1aa4e96..ea90d67 100644 --- a/Lib/encodings/idna.py +++ b/Lib/encodings/idna.py @@ -194,13 +194,79 @@ class Codec(codecs.Codec): return u".".join(result)+trailing_dot, len(input) -class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - return Codec().encode(input, self.errors)[0] +class IncrementalEncoder(codecs.BufferedIncrementalEncoder): + def _buffer_encode(self, input, errors, final): + if errors != 'strict': + # IDNA is quite clear that implementations must be strict + raise UnicodeError("unsupported error handling "+errors) + + if not input: + return ("", 0) + + labels = dots.split(input) + trailing_dot = u'' + if labels: + if not labels[-1]: + trailing_dot = '.' + del labels[-1] + elif not final: + # Keep potentially unfinished label until the next call + del labels[-1] + if labels: + trailing_dot = '.' + + result = [] + size = 0 + for label in labels: + result.append(ToASCII(label)) + if size: + size += 1 + size += len(label) + + # Join with U+002E + result = ".".join(result) + trailing_dot + size += len(trailing_dot) + return (result, size) + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def _buffer_decode(self, input, errors, final): + if errors != 'strict': + raise UnicodeError("Unsupported error handling "+errors) + + if not input: + return (u"", 0) + + # IDNA allows decoding to operate on Unicode strings, too. + if isinstance(input, unicode): + labels = dots.split(input) + else: + # Must be ASCII string + input = str(input) + unicode(input, "ascii") + labels = input.split(".") + + trailing_dot = u'' + if labels: + if not labels[-1]: + trailing_dot = u'.' + del labels[-1] + elif not final: + # Keep potentially unfinished label until the next call + del labels[-1] + if labels: + trailing_dot = u'.' + + result = [] + size = 0 + for label in labels: + result.append(ToUnicode(label)) + if size: + size += 1 + size += len(label) -class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return Codec().decode(input, self.errors)[0] + result = u".".join(result) + trailing_dot + size += len(trailing_dot) + return (result, size) class StreamWriter(Codec,codecs.StreamWriter): pass diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 22d9060..6ea49cc 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -781,9 +781,18 @@ class NameprepTest(unittest.TestCase): except Exception,e: raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e))) -class CodecTest(unittest.TestCase): - def test_builtin(self): +class IDNACodecTest(unittest.TestCase): + def test_builtin_decode(self): self.assertEquals(unicode("python.org", "idna"), u"python.org") + self.assertEquals(unicode("python.org.", "idna"), u"python.org.") + self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org") + self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.") + + def test_builtin_encode(self): + self.assertEquals(u"python.org".encode("idna"), "python.org") + self.assertEquals("python.org.".encode("idna"), "python.org.") + self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org") + self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.") def test_stream(self): import StringIO @@ -791,6 +800,64 @@ class CodecTest(unittest.TestCase): r.read(3) self.assertEquals(r.read(), u"") + def test_incremental_decode(self): + self.assertEquals( + "".join(codecs.iterdecode("python.org", "idna")), + u"python.org" + ) + self.assertEquals( + "".join(codecs.iterdecode("python.org.", "idna")), + u"python.org." + ) + self.assertEquals( + "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")), + u"pyth\xf6n.org." + ) + self.assertEquals( + "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")), + u"pyth\xf6n.org." + ) + + decoder = codecs.getincrementaldecoder("idna")() + self.assertEquals(decoder.decode("xn--xam", ), u"") + self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.") + self.assertEquals(decoder.decode(u"rg"), u"") + self.assertEquals(decoder.decode(u"", True), u"org") + + decoder.reset() + self.assertEquals(decoder.decode("xn--xam", ), u"") + self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.") + self.assertEquals(decoder.decode("rg."), u"org.") + self.assertEquals(decoder.decode("", True), u"") + + def test_incremental_encode(self): + self.assertEquals( + "".join(codecs.iterencode(u"python.org", "idna")), + "python.org" + ) + self.assertEquals( + "".join(codecs.iterencode(u"python.org.", "idna")), + "python.org." + ) + self.assertEquals( + "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")), + "xn--pythn-mua.org." + ) + self.assertEquals( + "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")), + "xn--pythn-mua.org." + ) + + encoder = codecs.getincrementalencoder("idna")() + self.assertEquals(encoder.encode(u"\xe4x"), "") + self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.") + self.assertEquals(encoder.encode(u"", True), "org") + + encoder.reset() + self.assertEquals(encoder.encode(u"\xe4x"), "") + self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.") + self.assertEquals(encoder.encode(u"", True), "") + class CodecsModuleTest(unittest.TestCase): def test_decode(self): @@ -1158,7 +1225,7 @@ def test_main(): PunycodeTest, UnicodeInternalTest, NameprepTest, - CodecTest, + IDNACodecTest, CodecsModuleTest, StreamReaderTest, Str2StrTest, |