diff options
author | Walter Dörwald <walter@livinglogic.de> | 2004-09-07 20:24:22 (GMT) |
---|---|---|
committer | Walter Dörwald <walter@livinglogic.de> | 2004-09-07 20:24:22 (GMT) |
commit | 69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch) | |
tree | 088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Lib | |
parent | a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff) | |
download | cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2 |
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support
decoding incomplete input (when the input stream is temporarily exhausted).
codecs.StreamReader now implements buffering, which enables proper
readline support for the UTF-16 decoders. codecs.StreamReader.read()
has a new argument chars which specifies the number of characters to
return. codecs.StreamReader.readline() and codecs.StreamReader.readlines()
have a new argument keepends. Trailing "\n"s will be stripped from the lines
if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and
PyUnicode_DecodeUTF16Stateful.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/codecs.py | 112 | ||||
-rw-r--r-- | Lib/encodings/utf_16.py | 64 | ||||
-rw-r--r-- | Lib/encodings/utf_16_be.py | 20 | ||||
-rw-r--r-- | Lib/encodings/utf_16_le.py | 19 | ||||
-rw-r--r-- | Lib/encodings/utf_8.py | 18 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 118 |
6 files changed, 235 insertions, 116 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py index 92c6fef..f831dd6 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -228,12 +228,22 @@ class StreamReader(Codec): """ self.stream = stream self.errors = errors + self.bytebuffer = "" + self.charbuffer = u"" - def read(self, size=-1): + def decode(self, input, errors='strict'): + raise NotImplementedError + + def read(self, size=-1, chars=-1): """ Decodes data from the stream self.stream and returns the resulting object. + chars indicates the number of characters to read from the + stream. read() will never return more than chars + characters, but it might return less, if there are not enough + characters available. + size indicates the approximate maximum number of bytes to read from the stream for decoding purposes. The decoder can modify this setting as appropriate. The default value @@ -248,54 +258,70 @@ class StreamReader(Codec): on the stream, these should be read too. """ - # Unsliced reading: - if size < 0: - return self.decode(self.stream.read(), self.errors)[0] - - # Sliced reading: - read = self.stream.read - decode = self.decode - data = read(size) - i = 0 - while 1: - try: - object, decodedbytes = decode(data, self.errors) - except ValueError, why: - # This method is slow but should work under pretty much - # all conditions; at most 10 tries are made - i = i + 1 - newdata = read(1) - if not newdata or i > 10: - raise - data = data + newdata + # read until we get the required number of characters (if available) + done = False + while True: + # can the request can be satisfied from the character buffer? + if chars < 0: + if self.charbuffer: + done = True else: - return object - - def readline(self, size=None): + if len(self.charbuffer) >= chars: + done = True + if done: + if chars < 0: + result = self.charbuffer + self.charbuffer = u"" + break + else: + result = self.charbuffer[:chars] + self.charbuffer = self.charbuffer[chars:] + break + # we need more data + if size < 0: + newdata = self.stream.read() + else: + newdata = self.stream.read(size) + data = self.bytebuffer + newdata + object, decodedbytes = self.decode(data, self.errors) + # keep undecoded bytes until the next call + self.bytebuffer = data[decodedbytes:] + # put new characters in the character buffer + self.charbuffer += object + # there was no data available + if not newdata: + done = True + return result + + def readline(self, size=None, keepends=True): """ Read one line from the input stream and return the decoded data. - Note: Unlike the .readlines() method, this method inherits - the line breaking knowledge from the underlying stream's - .readline() method -- there is currently no support for - line breaking using the codec decoder due to lack of line - buffering. Subclasses should however, if possible, try to - implement this method using their own knowledge of line - breaking. - - size, if given, is passed as size argument to the stream's - .readline() method. + size, if given, is passed as size argument to the + read() method. """ if size is None: - line = self.stream.readline() - else: - line = self.stream.readline(size) - return self.decode(line, self.errors)[0] - - - def readlines(self, sizehint=None): + size = 10 + line = u"" + while True: + data = self.read(size) + line += data + pos = line.find("\n") + if pos>=0: + self.charbuffer = line[pos+1:] + self.charbuffer + if keepends: + line = line[:pos+1] + else: + line = line[:pos] + return line + elif not data: + return line + if size<8000: + size *= 2 + + def readlines(self, sizehint=None, keepends=True): """ Read all lines available on the input stream and return them as list of lines. @@ -307,8 +333,8 @@ class StreamReader(Codec): way to finding the true end-of-line. """ - data = self.stream.read() - return self.decode(data, self.errors)[0].splitlines(1) + data = self.read() + return self.splitlines(keepends) def reset(self): diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py index 8c79c79..a33581c 100644 --- a/Lib/encodings/utf_16.py +++ b/Lib/encodings/utf_16.py @@ -10,54 +10,40 @@ import codecs, sys ### Codec APIs -class Codec(codecs.Codec): +encode = codecs.utf_16_encode - # Note: Binding these as C functions will result in the class not - # converting them to methods. This is intended. - encode = codecs.utf_16_encode - decode = codecs.utf_16_decode +def decode(input, errors='strict'): + return codecs.utf_16_decode(input, errors, True) -class StreamWriter(Codec,codecs.StreamWriter): +class StreamWriter(codecs.StreamWriter): def __init__(self, stream, errors='strict'): - self.bom_written = 0 + self.bom_written = False codecs.StreamWriter.__init__(self, stream, errors) - def write(self, data): - result = codecs.StreamWriter.write(self, data) - if not self.bom_written: - self.bom_written = 1 - if sys.byteorder == 'little': - self.encode = codecs.utf_16_le_encode - else: - self.encode = codecs.utf_16_be_encode + def encode(self, input, errors='strict'): + self.bom_written = True + result = codecs.utf_16_encode(input, errors) + if sys.byteorder == 'little': + self.encode = codecs.utf_16_le_encode + else: + self.encode = codecs.utf_16_be_encode return result -class StreamReader(Codec,codecs.StreamReader): - def __init__(self, stream, errors='strict'): - self.bom_read = 0 - codecs.StreamReader.__init__(self, stream, errors) - - def read(self, size=-1): - if not self.bom_read: - signature = self.stream.read(2) - if signature == codecs.BOM_BE: - self.decode = codecs.utf_16_be_decode - elif signature == codecs.BOM_LE: - self.decode = codecs.utf_16_le_decode - else: - raise UnicodeError,"UTF-16 stream does not start with BOM" - if size > 2: - size -= 2 - elif size >= 0: - size = 0 - self.bom_read = 1 - return codecs.StreamReader.read(self, size) - - def readline(self, size=None): - raise NotImplementedError, '.readline() is not implemented for UTF-16' +class StreamReader(codecs.StreamReader): + + def decode(self, input, errors='strict'): + (object, consumed, byteorder) = \ + codecs.utf_16_ex_decode(input, errors, 0, False) + if byteorder == -1: + self.decode = codecs.utf_16_le_decode + elif byteorder == 1: + self.decode = codecs.utf_16_be_decode + elif consumed>=2: + raise UnicodeError,"UTF-16 stream does not start with BOM" + return (object, consumed) ### encodings module API def getregentry(): - return (Codec.encode,Codec.decode,StreamReader,StreamWriter) + return (encode,decode,StreamReader,StreamWriter) diff --git a/Lib/encodings/utf_16_be.py b/Lib/encodings/utf_16_be.py index dad540b..9a51f8c 100644 --- a/Lib/encodings/utf_16_be.py +++ b/Lib/encodings/utf_16_be.py @@ -10,23 +10,19 @@ import codecs ### Codec APIs -class Codec(codecs.Codec): +encode = codecs.utf_16_be_encode - # Note: Binding these as C functions will result in the class not - # converting them to methods. This is intended. - encode = codecs.utf_16_be_encode - decode = codecs.utf_16_be_decode - -class StreamWriter(Codec,codecs.StreamWriter): - pass +def decode(input, errors='strict'): + return codecs.utf_16_be_decode(input, errors, True) -class StreamReader(Codec,codecs.StreamReader): +class StreamWriter(codecs.StreamWriter): + encode = codecs.utf_16_be_encode - def readline(self, size=None): - raise NotImplementedError, '.readline() is not implemented for UTF-16-BE' +class StreamReader(codecs.StreamReader): + decode = codecs.utf_16_be_decode ### encodings module API def getregentry(): - return (Codec.encode,Codec.decode,StreamReader,StreamWriter) + return (encode,decode,StreamReader,StreamWriter) diff --git a/Lib/encodings/utf_16_le.py b/Lib/encodings/utf_16_le.py index 8120d5b..95ca830 100644 --- a/Lib/encodings/utf_16_le.py +++ b/Lib/encodings/utf_16_le.py @@ -10,23 +10,20 @@ import codecs ### Codec APIs -class Codec(codecs.Codec): +encode = codecs.utf_16_le_encode - # Note: Binding these as C functions will result in the class not - # converting them to methods. This is intended. - encode = codecs.utf_16_le_encode - decode = codecs.utf_16_le_decode +def decode(input, errors='strict'): + return codecs.utf_16_le_decode(input, errors, True) -class StreamWriter(Codec,codecs.StreamWriter): - pass +class StreamWriter(codecs.StreamWriter): + encode = codecs.utf_16_le_encode -class StreamReader(Codec,codecs.StreamReader): +class StreamReader(codecs.StreamReader): + decode = codecs.utf_16_le_decode - def readline(self, size=None): - raise NotImplementedError, '.readline() is not implemented for UTF-16-LE' ### encodings module API def getregentry(): - return (Codec.encode,Codec.decode,StreamReader,StreamWriter) + return (encode,decode,StreamReader,StreamWriter) diff --git a/Lib/encodings/utf_8.py b/Lib/encodings/utf_8.py index 89249a9..9cb0b4b 100644 --- a/Lib/encodings/utf_8.py +++ b/Lib/encodings/utf_8.py @@ -10,21 +10,19 @@ import codecs ### Codec APIs -class Codec(codecs.Codec): +encode = codecs.utf_8_encode - # Note: Binding these as C functions will result in the class not - # converting them to methods. This is intended. - encode = codecs.utf_8_encode - decode = codecs.utf_8_decode +def decode(input, errors='strict'): + return codecs.utf_8_decode(input, errors, True) -class StreamWriter(Codec,codecs.StreamWriter): - pass +class StreamWriter(codecs.StreamWriter): + encode = codecs.utf_8_encode -class StreamReader(Codec,codecs.StreamReader): - pass +class StreamReader(codecs.StreamReader): + decode = codecs.utf_8_decode ### encodings module API def getregentry(): - return (Codec.encode,Codec.decode,StreamReader,StreamWriter) + return (encode,decode,StreamReader,StreamWriter) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index c428c61..524c247 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3,7 +3,45 @@ import unittest import codecs import StringIO -class UTF16Test(unittest.TestCase): +class Queue(object): + """ + queue: write bytes at one end, read bytes from the other end + """ + def __init__(self): + self._buffer = "" + + def write(self, chars): + self._buffer += chars + + def read(self, size=-1): + if size<0: + s = self._buffer + self._buffer = "" + return s + else: + s = self._buffer[:size] + self._buffer = self._buffer[size:] + return s + +class PartialReadTest(unittest.TestCase): + def check_partial(self, encoding, input, partialresults): + # get a StreamReader for the encoding and feed the bytestring version + # of input to the reader byte by byte. Read every available from + # the StreamReader and check that the results equal the appropriate + # entries from partialresults. + q = Queue() + r = codecs.getreader(encoding)(q) + result = u"" + for (c, partialresult) in zip(input.encode(encoding), partialresults): + q.write(c) + result += r.read() + self.assertEqual(result, partialresult) + # check that there's nothing left in the buffers + self.assertEqual(r.read(), u"") + self.assertEqual(r.bytebuffer, "") + self.assertEqual(r.charbuffer, u"") + +class UTF16Test(PartialReadTest): spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' @@ -23,6 +61,81 @@ class UTF16Test(unittest.TestCase): f = reader(s) self.assertEquals(f.read(), u"spamspam") + def test_partial(self): + self.check_partial( + "utf-16", + u"\x00\xff\u0100\uffff", + [ + u"", # first byte of BOM read + u"", # second byte of BOM read => byteorder known + u"", + u"\x00", + u"\x00", + u"\x00\xff", + u"\x00\xff", + u"\x00\xff\u0100", + u"\x00\xff\u0100", + u"\x00\xff\u0100\uffff", + ] + ) + +class UTF16LETest(PartialReadTest): + + def test_partial(self): + self.check_partial( + "utf-16-le", + u"\x00\xff\u0100\uffff", + [ + u"", + u"\x00", + u"\x00", + u"\x00\xff", + u"\x00\xff", + u"\x00\xff\u0100", + u"\x00\xff\u0100", + u"\x00\xff\u0100\uffff", + ] + ) + +class UTF16BETest(PartialReadTest): + + def test_partial(self): + self.check_partial( + "utf-16-be", + u"\x00\xff\u0100\uffff", + [ + u"", + u"\x00", + u"\x00", + u"\x00\xff", + u"\x00\xff", + u"\x00\xff\u0100", + u"\x00\xff\u0100", + u"\x00\xff\u0100\uffff", + ] + ) + +class UTF8Test(PartialReadTest): + + def test_partial(self): + self.check_partial( + "utf-8", + u"\x00\xff\u07ff\u0800\uffff", + [ + u"\x00", + u"\x00", + u"\x00\xff", + u"\x00\xff", + u"\x00\xff\u07ff", + u"\x00\xff\u07ff", + u"\x00\xff\u07ff", + u"\x00\xff\u07ff\u0800", + u"\x00\xff\u07ff\u0800", + u"\x00\xff\u07ff\u0800", + u"\x00\xff\u07ff\u0800\uffff", + ] + ) + class EscapeDecodeTest(unittest.TestCase): def test_empty_escape_decode(self): self.assertEquals(codecs.escape_decode(""), ("", 0)) @@ -348,6 +461,9 @@ class CodecsModuleTest(unittest.TestCase): def test_main(): test_support.run_unittest( UTF16Test, + UTF16LETest, + UTF16BETest, + UTF8Test, EscapeDecodeTest, RecodingTest, PunycodeTest, |