From 3abcb013b8195aea38f80968d4111b5ac7e68c0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Walter=20D=C3=B6rwald?= Date: Mon, 16 Apr 2007 22:10:50 +0000 Subject: Apply SF patch #1698994: Add getstate() and setstate() methods to incrementalcodecs. Also forward port r54786 (fix the incremental utf_8_sig decoder). --- Doc/lib/libcodecs.tex | 36 +++++++++++++++++++++ Lib/codecs.py | 66 ++++++++++++++++++++++++++++++------- Lib/encodings/utf_16.py | 45 ++++++++++++++++++++++++++ Lib/encodings/utf_8_sig.py | 52 +++++++++++++++++++++-------- Lib/test/test_codecs.py | 81 ++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 249 insertions(+), 31 deletions(-) diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex index 05c0375..557364d 100644 --- a/Doc/lib/libcodecs.tex +++ b/Doc/lib/libcodecs.tex @@ -405,6 +405,21 @@ define in order to be compatible with the Python codec registry. Reset the encoder to the initial state. \end{methoddesc} +\begin{methoddesc}{getstate}{} + Return the current state of the encoder which must be an integer. + The implementation should make sure that \code{0} is the most common state. + (States that are more complicated than integers can be converted into an + integer by marshaling/pickling the state and encoding the bytes of the + resulting string into an integer). + \versionadded{3.0} +\end{methoddesc} + +\begin{methoddesc}{setstate}{state} + Set the state of the encoder to \var{state}. \var{state} must be an + encoder state returned by \method{getstate}. + \versionadded{3.0} +\end{methoddesc} + \subsubsection{IncrementalDecoder Objects \label{incremental-decoder-objects}} @@ -453,6 +468,27 @@ define in order to be compatible with the Python codec registry. Reset the decoder to the initial state. \end{methoddesc} +\begin{methoddesc}{getstate}{} + Return the current state of the decoder. This must be a tuple with two + items, the first must be the buffer containing the still undecoded input. 
+ The second must be an integer and can be additional state info. + (The implementation should make sure that \code{0} is the most common + additional state info.) If this additional state info is \code{0} it must + be possible to set the decoder to the state which has no input buffered + and \code{0} as the additional state info, so that feeding the previously + buffered input to the decoder returns it to the previous state without + producing any output. (Additional state info that is more complicated + than integers can be converted into an integer by marshaling/pickling + the info and encoding the bytes of the resulting string into an integer.) + \versionadded{3.0} +\end{methoddesc} + +\begin{methoddesc}{setstate}{state} + Set the state of the decoder to \var{state}. \var{state} must be a + decoder state returned by \method{getstate}. + \versionadded{3.0} +\end{methoddesc} + The \class{StreamWriter} and \class{StreamReader} classes provide generic working interfaces which can be used to implement new diff --git a/Lib/codecs.py b/Lib/codecs.py index e4e14cf..185ad42 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -87,7 +87,9 @@ class CodecInfo(tuple): return self def __repr__(self): - return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self)) + return "<%s.%s object for encoding %s at 0x%x>" % \ + (self.__class__.__module__, self.__class__.__name__, + self.name, id(self)) class Codec: @@ -155,9 +157,9 @@ class Codec: class IncrementalEncoder(object): """ - An IncrementalEncoder encodes an input in multiple steps. The input can be - passed piece by piece to the encode() method. The IncrementalEncoder remembers - the state of the Encoding process between calls to encode(). + An IncrementalEncoder encodes an input in multiple steps. The input can + be passed piece by piece to the encode() method. The IncrementalEncoder + remembers the state of the encoding process between calls to encode(). 
""" def __init__(self, errors='strict'): """ @@ -181,6 +183,18 @@ class IncrementalEncoder(object): Resets the encoder to the initial state. """ + def getstate(self): + """ + Return the current state of the encoder. + """ + return 0 + + def setstate(self, state): + """ + Set the current state of the encoder. state must have been + returned by getstate(). + """ + class BufferedIncrementalEncoder(IncrementalEncoder): """ This subclass of IncrementalEncoder can be used as the baseclass for an @@ -189,7 +203,8 @@ class BufferedIncrementalEncoder(IncrementalEncoder): """ def __init__(self, errors='strict'): IncrementalEncoder.__init__(self, errors) - self.buffer = "" # unencoded input that is kept between calls to encode() + # unencoded input that is kept between calls to encode() + self.buffer = "" def _buffer_encode(self, input, errors, final): # Overwrite this method in subclasses: It must encode input @@ -208,10 +223,16 @@ class BufferedIncrementalEncoder(IncrementalEncoder): IncrementalEncoder.reset(self) self.buffer = "" + def getstate(self): + return self.buffer or 0 + + def setstate(self, state): + self.buffer = state or "" + class IncrementalDecoder(object): """ - An IncrementalDecoder decodes an input in multiple steps. The input can be - passed piece by piece to the decode() method. The IncrementalDecoder + An IncrementalDecoder decodes an input in multiple steps. The input can + be passed piece by piece to the decode() method. The IncrementalDecoder remembers the state of the decoding process between calls to decode(). """ def __init__(self, errors='strict'): @@ -235,15 +256,29 @@ class IncrementalDecoder(object): Resets the decoder to the initial state. """ + def getstate(self): + """ + Return the current state of the decoder. This must be a + (buffered_input, additional_state_info) tuple. + """ + return ("", 0) + + def setstate(self, state): + """ + Set the current state of the decoder. state must have been + returned by getstate(). 
+ """ + class BufferedIncrementalDecoder(IncrementalDecoder): """ This subclass of IncrementalDecoder can be used as the baseclass for an - incremental decoder if the decoder must be able to handle incomplete byte - sequences. + incremental decoder if the decoder must be able to handle incomplete + byte sequences. """ def __init__(self, errors='strict'): IncrementalDecoder.__init__(self, errors) - self.buffer = "" # undecoded input that is kept between calls to decode() + # undecoded input that is kept between calls to decode() + self.buffer = "" def _buffer_decode(self, input, errors, final): # Overwrite this method in subclasses: It must decode input @@ -262,6 +297,14 @@ class BufferedIncrementalDecoder(IncrementalDecoder): IncrementalDecoder.reset(self) self.buffer = "" + def getstate(self): + # additional state info is always 0 + return (self.buffer, 0) + + def setstate(self, state): + # ignore additional state info + self.buffer = state[0] + # # The StreamWriter and StreamReader class provide generic working # interfaces which can be used to implement new encoding submodules @@ -424,7 +467,8 @@ class StreamReader(Codec): newchars, decodedbytes = self.decode(data, self.errors) except UnicodeDecodeError as exc: if firstline: - newchars, decodedbytes = self.decode(data[:exc.start], self.errors) + newchars, decodedbytes = \ + self.decode(data[:exc.start], self.errors) lines = newchars.splitlines(True) if len(lines)<=1: raise diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py index eff08f3..cf096b5 100644 --- a/Lib/encodings/utf_16.py +++ b/Lib/encodings/utf_16.py @@ -34,6 +34,22 @@ class IncrementalEncoder(codecs.IncrementalEncoder): codecs.IncrementalEncoder.reset(self) self.encoder = None + def getstate(self): + # state info we return to the caller: + # 0: stream is in natural order for this platform + # 2: endianness hasn't been determined yet + # (we're never writing in unnatural order) + return (2 if self.encoder is None else 0) + + def 
setstate(self, state): + if state: + self.encoder = None + else: + if sys.byteorder == 'little': + self.encoder = codecs.utf_16_le_encode + else: + self.encoder = codecs.utf_16_be_encode + class IncrementalDecoder(codecs.BufferedIncrementalDecoder): def __init__(self, errors='strict'): codecs.BufferedIncrementalDecoder.__init__(self, errors) @@ -56,6 +72,35 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder): codecs.BufferedIncrementalDecoder.reset(self) self.decoder = None + def getstate(self): + # additional state info from the base class must be 0 here, + # as it isn't passed along to the caller + state = codecs.BufferedIncrementalDecoder.getstate(self)[0] + # additional state info we pass to the caller: + # 0: stream is in natural order for this platform + # 1: stream is in unnatural order + # 2: endianness hasn't been determined yet + if self.decoder is None: + return (state, 2) + addstate = int((sys.byteorder == "big") != + (self.decoder is codecs.utf_16_be_decode)) + return (state, addstate) + + def setstate(self, state): + # state[1] will be ignored by BufferedIncrementalDecoder.setstate() + codecs.BufferedIncrementalDecoder.setstate(self, state) + state = state[1] + if state == 0: + self.decoder = (codecs.utf_16_be_decode + if sys.byteorder == "big" + else codecs.utf_16_le_decode) + elif state == 1: + self.decoder = (codecs.utf_16_le_decode + if sys.byteorder == "big" + else codecs.utf_16_be_decode) + else: + self.decoder = None + class StreamWriter(codecs.StreamWriter): def __init__(self, stream, errors='strict'): self.bom_written = False diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py index d751da6..a0cc1af 100644 --- a/Lib/encodings/utf_8_sig.py +++ b/Lib/encodings/utf_8_sig.py @@ -12,7 +12,8 @@ import codecs ### Codec APIs def encode(input, errors='strict'): - return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input)) + return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], + len(input)) def 
decode(input, errors='strict'): prefix = 0 @@ -25,38 +26,61 @@ def decode(input, errors='strict'): class IncrementalEncoder(codecs.IncrementalEncoder): def __init__(self, errors='strict'): codecs.IncrementalEncoder.__init__(self, errors) - self.first = True + self.first = 1 def encode(self, input, final=False): if self.first: - self.first = False - return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0] + self.first = 0 + return codecs.BOM_UTF8 + \ + codecs.utf_8_encode(input, self.errors)[0] else: return codecs.utf_8_encode(input, self.errors)[0] def reset(self): codecs.IncrementalEncoder.reset(self) - self.first = True + self.first = 1 + + def getstate(self): + return self.first + + def setstate(self, state): + self.first = state class IncrementalDecoder(codecs.BufferedIncrementalDecoder): def __init__(self, errors='strict'): codecs.BufferedIncrementalDecoder.__init__(self, errors) - self.first = True + self.first = 1 def _buffer_decode(self, input, errors, final): - if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM + if self.first: if len(input) < 3: - # not enough data to decide if this really is a BOM - # => try again on the next call - return (u"", 0) - (output, consumed) = codecs.utf_8_decode(input[3:], errors, final) - self.first = False - return (output, consumed+3) + if codecs.BOM_UTF8.startswith(input): + # not enough data to decide if this really is a BOM + # => try again on the next call + return (u"", 0) + else: + self.first = 0 + else: + self.first = 0 + if input[:3] == codecs.BOM_UTF8: + (output, consumed) = \ + codecs.utf_8_decode(input[3:], errors, final) + return (output, consumed+3) return codecs.utf_8_decode(input, errors, final) def reset(self): codecs.BufferedIncrementalDecoder.reset(self) - self.first = True + self.first = 1 + + def getstate(self): + state = codecs.BufferedIncrementalDecoder.getstate(self) + # state[1] must be 0 here, as it isn't passed along to the caller + return (state[0], self.first) + + 
def setstate(self, state): + # state[1] will be ignored by BufferedIncrementalDecoder.setstate() + codecs.BufferedIncrementalDecoder.setstate(self, state) + self.first = state[1] class StreamWriter(codecs.StreamWriter): def reset(self): diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 44ce8eb..f7a9789 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -23,7 +23,40 @@ class Queue(object): self._buffer = self._buffer[size:] return s -class ReadTest(unittest.TestCase): +class MixInCheckStateHandling: + def check_state_handling_decode(self, encoding, u, s): + for i in xrange(len(s)+1): + d = codecs.getincrementaldecoder(encoding)() + part1 = d.decode(s[:i]) + state = d.getstate() + self.assert_(isinstance(state[1], int)) + # Check that the condition stated in the documentation for + # IncrementalDecoder.getstate() holds + if not state[1]: + # reset decoder to the default state without anything buffered + d.setstate((state[0][:0], 0)) + # Feeding the previous input must not produce any output + self.assert_(not d.decode(state[0])) + # The decoder must return to the same state + self.assertEqual(state, d.getstate()) + # Create a new decoder and set it to the state + # we extracted from the old one + d = codecs.getincrementaldecoder(encoding)() + d.setstate(state) + part2 = d.decode(s[i:], True) + self.assertEqual(u, part1+part2) + + def check_state_handling_encode(self, encoding, u, s): + for i in xrange(len(u)+1): + d = codecs.getincrementalencoder(encoding)() + part1 = d.encode(u[:i]) + state = d.getstate() + d = codecs.getincrementalencoder(encoding)() + d.setstate(state) + part2 = d.encode(u[i:], True) + self.assertEqual(s, part1+part2) + +class ReadTest(unittest.TestCase, MixInCheckStateHandling): def check_partial(self, input, partialresults): # get a StreamReader for the encoding and feed the bytestring version # of input to the reader byte by byte. 
Read every available from @@ -292,7 +325,14 @@ class UTF16Test(ReadTest): ) def test_errors(self): - self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True) + self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, + "\xff", "strict", True) + + def test_decoder_state(self): + self.check_state_handling_decode(self.encoding, + u"spamspam", self.spamle) + self.check_state_handling_decode(self.encoding, + u"spamspam", self.spambe) class UTF16LETest(ReadTest): encoding = "utf-16-le" @@ -313,7 +353,8 @@ class UTF16LETest(ReadTest): ) def test_errors(self): - self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True) + self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, + "\xff", "strict", True) class UTF16BETest(ReadTest): encoding = "utf-16-be" @@ -334,7 +375,8 @@ class UTF16BETest(ReadTest): ) def test_errors(self): - self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True) + self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, + "\xff", "strict", True) class UTF8Test(ReadTest): encoding = "utf-8" @@ -357,6 +399,11 @@ class UTF8Test(ReadTest): ] ) + def test_decoder_state(self): + u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" + self.check_state_handling_decode(self.encoding, + u, u.encode(self.encoding)) + class UTF7Test(ReadTest): encoding = "utf-7" @@ -429,6 +476,16 @@ class UTF8SigTest(ReadTest): # SF bug #1601501: check that the codec works with a buffer unicode("\xef\xbb\xbf", "utf-8-sig") + def test_bom(self): + d = codecs.getincrementaldecoder("utf-8-sig")() + s = u"spam" + self.assertEqual(d.decode(s.encode("utf-8-sig")), s) + + def test_decoder_state(self): + u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" + self.check_state_handling_decode(self.encoding, + u, u.encode(self.encoding)) + class EscapeDecodeTest(unittest.TestCase): def test_empty(self): self.assertEquals(codecs.escape_decode(""), ("", 0)) @@ -1066,7 +1123,11 @@ 
broken_unicode_with_streams = [ "punycode", "unicode_internal" ] -broken_incremental_coders = broken_unicode_with_streams[:] +broken_incremental_coders = broken_unicode_with_streams + [ + "idna", + "zlib_codec", + "bz2_codec", +] # The following encodings only support "strict" mode only_strict_mode = [ @@ -1091,7 +1152,7 @@ else: all_unicode_encodings.append("zlib_codec") broken_unicode_with_streams.append("zlib_codec") -class BasicUnicodeTest(unittest.TestCase): +class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): def test_basics(self): s = u"abc123" # all codecs should be able to encode these for encoding in all_unicode_encodings: @@ -1215,6 +1276,14 @@ class BasicUnicodeTest(unittest.TestCase): table_type = type(cp1140.encoding_table) self.assertEqual(table_type, table_type) + def test_decoder_state(self): + # Check that getstate() and setstate() handle the state properly + u = u"abc123" + for encoding in all_unicode_encodings: + if encoding not in broken_incremental_coders: + self.check_state_handling_decode(encoding, u, u.encode(encoding)) + self.check_state_handling_encode(encoding, u, u.encode(encoding)) + class BasicStrTest(unittest.TestCase): def test_basics(self): s = "abc123" -- cgit v0.12