5 files changed, 249 insertions, 31 deletions
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex
index 05c0375..557364d 100644
--- a/Doc/lib/libcodecs.tex
+++ b/Doc/lib/libcodecs.tex
@@ -405,6 +405,21 @@ define in order to be compatible with the Python codec registry.
   Reset the encoder to the initial state.
 \end{methoddesc}
 
+\begin{methoddesc}{getstate}{}
+  Return the current state of the encoder, which must be an integer.
+  The implementation should make sure that \code{0} is the most common state.
+  (States that are more complicated than integers can be converted into an
+  integer by marshaling/pickling the state and encoding the bytes of the
+  resulting string into an integer.)
+  \versionadded{3.0}
+\end{methoddesc}
+
+\begin{methoddesc}{setstate}{state}
+  Set the state of the encoder to \var{state}. \var{state} must be an
+  encoder state returned by \method{getstate}.
+  \versionadded{3.0}
+\end{methoddesc}
+
 
 \subsubsection{IncrementalDecoder Objects \label{incremental-decoder-objects}}
 
@@ -453,6 +468,27 @@ define in order to be compatible with the Python codec registry.
   Reset the decoder to the initial state.
 \end{methoddesc}
 
+\begin{methoddesc}{getstate}{}
+  Return the current state of the decoder. This must be a tuple with two
+  items, the first must be the buffer containing the still undecoded input.
+  The second must be an integer and can be additional state info.
+  (The implementation should make sure that \code{0} is the most common
+  additional state info.) If this additional state info is \code{0} it must
+  be possible to set the decoder to the state which has no input buffered
+  and \code{0} as the additional state info, so that feeding the previously
+  buffered input to the decoder returns it to the previous state without
+  producing any output. (Additional state info that is more complicated
+  than integers can be converted into an integer by marshaling/pickling
+  the info and encoding the bytes of the resulting string into an integer.)
+  \versionadded{3.0}
+\end{methoddesc}
+
+\begin{methoddesc}{setstate}{state}
+  Set the state of the decoder to \var{state}. \var{state} must be a
+  decoder state returned by \method{getstate}.
+  \versionadded{3.0}
+\end{methoddesc}
+
 
 The \class{StreamWriter} and \class{StreamReader} classes provide
 generic working interfaces which can be used to implement new
diff --git a/Lib/codecs.py b/Lib/codecs.py
index e4e14cf..185ad42 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -87,7 +87,9 @@ class CodecInfo(tuple):
         return self
 
     def __repr__(self):
-        return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
+        return "<%s.%s object for encoding %s at 0x%x>" % \
+                (self.__class__.__module__, self.__class__.__name__,
+                 self.name, id(self))
 
 class Codec:
 
@@ -155,9 +157,9 @@ class Codec:
 
 class IncrementalEncoder(object):
     """
-    An IncrementalEncoder encodes an input in multiple steps. The input can be
-    passed piece by piece to the encode() method. The IncrementalEncoder remembers
-    the state of the Encoding process between calls to encode().
+    An IncrementalEncoder encodes an input in multiple steps. The input can
+    be passed piece by piece to the encode() method. The IncrementalEncoder
+    remembers the state of the encoding process between calls to encode().
     """
     def __init__(self, errors='strict'):
         """
@@ -181,6 +183,18 @@ class IncrementalEncoder(object):
         Resets the encoder to the initial state.
         """
 
+    def getstate(self):
+        """
+        Return the current state of the encoder.
+        """
+        return 0
+
+    def setstate(self, state):
+        """
+        Set the current state of the encoder. state must have been
+        returned by getstate().
+        """
+
 class BufferedIncrementalEncoder(IncrementalEncoder):
     """
     This subclass of IncrementalEncoder can be used as the baseclass for an
@@ -189,7 +203,8 @@ class BufferedIncrementalEncoder(IncrementalEncoder):
     """
     def __init__(self, errors='strict'):
         IncrementalEncoder.__init__(self, errors)
-        self.buffer = "" # unencoded input that is kept between calls to encode()
+        # unencoded input that is kept between calls to encode()
+        self.buffer = ""
 
     def _buffer_encode(self, input, errors, final):
         # Overwrite this method in subclasses: It must encode input
@@ -208,10 +223,16 @@ class BufferedIncrementalEncoder(IncrementalEncoder):
         IncrementalEncoder.reset(self)
         self.buffer = ""
 
+    def getstate(self):
+        return self.buffer or 0
+
+    def setstate(self, state):
+        self.buffer = state or ""
+
 class IncrementalDecoder(object):
     """
-    An IncrementalDecoder decodes an input in multiple steps. The input can be
-    passed piece by piece to the decode() method. The IncrementalDecoder
+    An IncrementalDecoder decodes an input in multiple steps. The input can
+    be passed piece by piece to the decode() method. The IncrementalDecoder
     remembers the state of the decoding process between calls to decode().
     """
     def __init__(self, errors='strict'):
@@ -235,15 +256,29 @@ class IncrementalDecoder(object):
         Resets the decoder to the initial state.
         """
 
+    def getstate(self):
+        """
+        Return the current state of the decoder. This must be a
+        (buffered_input, additional_state_info) tuple.
+        """
+        return ("", 0)
+
+    def setstate(self, state):
+        """
+        Set the current state of the decoder. state must have been
+        returned by getstate().
+        """
+
 class BufferedIncrementalDecoder(IncrementalDecoder):
     """
     This subclass of IncrementalDecoder can be used as the baseclass for an
-    incremental decoder if the decoder must be able to handle incomplete byte
-    sequences.
+    incremental decoder if the decoder must be able to handle incomplete
+    byte sequences.
     """
     def __init__(self, errors='strict'):
         IncrementalDecoder.__init__(self, errors)
-        self.buffer = "" # undecoded input that is kept between calls to decode()
+        # undecoded input that is kept between calls to decode()
+        self.buffer = ""
 
     def _buffer_decode(self, input, errors, final):
         # Overwrite this method in subclasses: It must decode input
@@ -262,6 +297,14 @@ class BufferedIncrementalDecoder(IncrementalDecoder):
         IncrementalDecoder.reset(self)
         self.buffer = ""
 
+    def getstate(self):
+        # additional state info is always 0
+        return (self.buffer, 0)
+
+    def setstate(self, state):
+        # ignore additional state info
+        self.buffer = state[0]
+
 #
 # The StreamWriter and StreamReader class provide generic working
 # interfaces which can be used to implement new encoding submodules
@@ -424,7 +467,8 @@ class StreamReader(Codec):
                 newchars, decodedbytes = self.decode(data, self.errors)
             except UnicodeDecodeError as exc:
                 if firstline:
-                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
+                    newchars, decodedbytes = \
+                        self.decode(data[:exc.start], self.errors)
                     lines = newchars.splitlines(True)
                     if len(lines)<=1:
                         raise
diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py
index eff08f3..cf096b5 100644
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -34,6 +34,22 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
         codecs.IncrementalEncoder.reset(self)
         self.encoder = None
 
+    def getstate(self):
+        # state info we return to the caller:
+        # 0: stream is in natural order for this platform
+        # 2: endianness hasn't been determined yet
+        # (we're never writing in unnatural order)
+        return (2 if self.encoder is None else 0)
+
+    def setstate(self, state):
+        if state:
+            self.encoder = None
+        else:
+            if sys.byteorder == 'little':
+                self.encoder = codecs.utf_16_le_encode
+            else:
+                self.encoder = codecs.utf_16_be_encode
+
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
     def __init__(self, errors='strict'):
         codecs.BufferedIncrementalDecoder.__init__(self, errors)
@@ -56,6 +72,35 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
         codecs.BufferedIncrementalDecoder.reset(self)
         self.decoder = None
 
+    def getstate(self):
+        # additional state info from the base class must be 0 here,
+        # as it isn't passed along to the caller
+        state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
+        # additional state info we pass to the caller:
+        # 0: stream is in natural order for this platform
+        # 1: stream is in unnatural order
+        # 2: endianness hasn't been determined yet
+        if self.decoder is None:
+            return (state, 2)
+        addstate = int((sys.byteorder == "big") !=
+                       (self.decoder is codecs.utf_16_be_decode))
+        return (state, addstate)
+
+    def setstate(self, state):
+        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
+        codecs.BufferedIncrementalDecoder.setstate(self, state)
+        state = state[1]
+        if state == 0:
+            self.decoder = (codecs.utf_16_be_decode
+                            if sys.byteorder == "big"
+                            else codecs.utf_16_le_decode)
+        elif state == 1:
+            self.decoder = (codecs.utf_16_le_decode
+                            if sys.byteorder == "big"
+                            else codecs.utf_16_be_decode)
+        else:
+            self.decoder = None
+
 class StreamWriter(codecs.StreamWriter):
     def __init__(self, stream, errors='strict'):
         self.bom_written = False
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
index d751da6..a0cc1af 100644
--- a/Lib/encodings/utf_8_sig.py
+++ b/Lib/encodings/utf_8_sig.py
@@ -12,7 +12,8 @@ import codecs
 ### Codec APIs
 
 def encode(input, errors='strict'):
-    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
+    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
+            len(input))
 
 def decode(input, errors='strict'):
     prefix = 0
@@ -25,38 +26,61 @@ def decode(input, errors='strict'):
 class IncrementalEncoder(codecs.IncrementalEncoder):
     def __init__(self, errors='strict'):
         codecs.IncrementalEncoder.__init__(self, errors)
-        self.first = True
+        self.first = 1
 
     def encode(self, input, final=False):
         if self.first:
-            self.first = False
-            return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
+            self.first = 0
+            return codecs.BOM_UTF8 + \
+                   codecs.utf_8_encode(input, self.errors)[0]
         else:
             return codecs.utf_8_encode(input, self.errors)[0]
 
     def reset(self):
         codecs.IncrementalEncoder.reset(self)
-        self.first = True
+        self.first = 1
+
+    def getstate(self):
+        return self.first
+
+    def setstate(self, state):
+        self.first = state
 
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
     def __init__(self, errors='strict'):
         codecs.BufferedIncrementalDecoder.__init__(self, errors)
-        self.first = True
+        self.first = 1
 
     def _buffer_decode(self, input, errors, final):
-        if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM
+        if self.first:
             if len(input) < 3:
-                # not enough data to decide if this really is a BOM
-                # => try again on the next call
-                return (u"", 0)
-            (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
-            self.first = False
-            return (output, consumed+3)
+                if codecs.BOM_UTF8.startswith(input):
+                    # not enough data to decide if this really is a BOM
+                    # => try again on the next call
+                    return (u"", 0)
+                else:
+                    self.first = 0
+            else:
+                self.first = 0
+                if input[:3] == codecs.BOM_UTF8:
+                    (output, consumed) = \
+                       codecs.utf_8_decode(input[3:], errors, final)
+                    return (output, consumed+3)
         return codecs.utf_8_decode(input, errors, final)
 
     def reset(self):
         codecs.BufferedIncrementalDecoder.reset(self)
-        self.first = True
+        self.first = 1
+
+    def getstate(self):
+        state = codecs.BufferedIncrementalDecoder.getstate(self)
+        # state[1] must be 0 here, as it isn't passed along to the caller
+        return (state[0], self.first)
+
+    def setstate(self, state):
+        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
+        codecs.BufferedIncrementalDecoder.setstate(self, state)
+        self.first = state[1]
 
 class StreamWriter(codecs.StreamWriter):
     def reset(self):
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 44ce8eb..f7a9789 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -23,7 +23,40 @@ class Queue(object):
             self._buffer = self._buffer[size:]
             return s
 
-class ReadTest(unittest.TestCase):
+class MixInCheckStateHandling:
+    def check_state_handling_decode(self, encoding, u, s):
+        for i in xrange(len(s)+1):
+            d = codecs.getincrementaldecoder(encoding)()
+            part1 = d.decode(s[:i])
+            state = d.getstate()
+            self.assert_(isinstance(state[1], int))
+            # Check that the condition stated in the documentation for
+            # IncrementalDecoder.getstate() holds
+            if not state[1]:
+                # reset decoder to the default state without anything buffered
+                d.setstate((state[0][:0], 0))
+                # Feeding the previous input must not produce any output
+                self.assert_(not d.decode(state[0]))
+                # The decoder must return to the same state
+                self.assertEqual(state, d.getstate())
+            # Create a new decoder and set it to the state
+            # we extracted from the old one
+            d = codecs.getincrementaldecoder(encoding)()
+            d.setstate(state)
+            part2 = d.decode(s[i:], True)
+            self.assertEqual(u, part1+part2)
+
+    def check_state_handling_encode(self, encoding, u, s):
+        for i in xrange(len(u)+1):
+            d = codecs.getincrementalencoder(encoding)()
+            part1 = d.encode(u[:i])
+            state = d.getstate()
+            d = codecs.getincrementalencoder(encoding)()
+            d.setstate(state)
+            part2 = d.encode(u[i:], True)
+            self.assertEqual(s, part1+part2)
+
+class ReadTest(unittest.TestCase, MixInCheckStateHandling):
     def check_partial(self, input, partialresults):
         # get a StreamReader for the encoding and feed the bytestring version
         # of input to the reader byte by byte. Read every available from
@@ -292,7 +325,14 @@ class UTF16Test(ReadTest):
         )
 
     def test_errors(self):
-        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
+        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
+                          "\xff", "strict", True)
+
+    def test_decoder_state(self):
+        self.check_state_handling_decode(self.encoding,
+                                         u"spamspam", self.spamle)
+        self.check_state_handling_decode(self.encoding,
+                                         u"spamspam", self.spambe)
 
 class UTF16LETest(ReadTest):
     encoding = "utf-16-le"
@@ -313,7 +353,8 @@ class UTF16LETest(ReadTest):
         )
 
     def test_errors(self):
-        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
+        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
+                          "\xff", "strict", True)
 
 class UTF16BETest(ReadTest):
     encoding = "utf-16-be"
@@ -334,7 +375,8 @@ class UTF16BETest(ReadTest):
         )
 
     def test_errors(self):
-        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
+        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
+                          "\xff", "strict", True)
 
 class UTF8Test(ReadTest):
     encoding = "utf-8"
@@ -357,6 +399,11 @@ class UTF8Test(ReadTest):
             ]
         )
 
+    def test_decoder_state(self):
+        u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
+        self.check_state_handling_decode(self.encoding,
+                                         u, u.encode(self.encoding))
+
 class UTF7Test(ReadTest):
     encoding = "utf-7"
 
@@ -429,6 +476,16 @@ class UTF8SigTest(ReadTest):
         # SF bug #1601501: check that the codec works with a buffer
         unicode("\xef\xbb\xbf", "utf-8-sig")
 
+    def test_bom(self):
+        d = codecs.getincrementaldecoder("utf-8-sig")()
+        s = u"spam"
+        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
+
+    def test_decoder_state(self):
+        u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
+        self.check_state_handling_decode(self.encoding,
+                                         u, u.encode(self.encoding))
+
 class EscapeDecodeTest(unittest.TestCase):
     def test_empty(self):
         self.assertEquals(codecs.escape_decode(""), ("", 0))
@@ -1066,7 +1123,11 @@ broken_unicode_with_streams = [
     "punycode",
     "unicode_internal"
 ]
-broken_incremental_coders = broken_unicode_with_streams[:]
+broken_incremental_coders = broken_unicode_with_streams + [
+    "idna",
+    "zlib_codec",
+    "bz2_codec",
+]
 
 # The following encodings only support "strict" mode
 only_strict_mode = [
@@ -1091,7 +1152,7 @@ else:
     all_unicode_encodings.append("zlib_codec")
     broken_unicode_with_streams.append("zlib_codec")
 
-class BasicUnicodeTest(unittest.TestCase):
+class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
     def test_basics(self):
         s = u"abc123" # all codecs should be able to encode these
         for encoding in all_unicode_encodings:
@@ -1215,6 +1276,14 @@ class BasicUnicodeTest(unittest.TestCase):
         table_type = type(cp1140.encoding_table)
         self.assertEqual(table_type, table_type)
 
+    def test_decoder_state(self):
+        # Check that getstate() and setstate() handle the state properly
+        u = u"abc123"
+        for encoding in all_unicode_encodings:
+            if encoding not in broken_incremental_coders:
+                self.check_state_handling_decode(encoding, u, u.encode(encoding))
+                self.check_state_handling_encode(encoding, u, u.encode(encoding))
+
 class BasicStrTest(unittest.TestCase):
     def test_basics(self):
         s = "abc123"