summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Doc/lib/libcodecs.tex | 36
-rw-r--r-- Lib/codecs.py | 66
-rw-r--r-- Lib/encodings/utf_16.py | 45
-rw-r--r-- Lib/encodings/utf_8_sig.py | 52
-rw-r--r-- Lib/test/test_codecs.py | 81
5 files changed, 249 insertions, 31 deletions
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex
index 05c0375..557364d 100644
--- a/Doc/lib/libcodecs.tex
+++ b/Doc/lib/libcodecs.tex
@@ -405,6 +405,21 @@ define in order to be compatible with the Python codec registry.
Reset the encoder to the initial state.
\end{methoddesc}
+\begin{methoddesc}{getstate}{}
+ Return the current state of the encoder which must be an integer.
+ The implementation should make sure that \code{0} is the most common state.
+ (States that are more complicated than integers can be converted into an
+ integer by marshaling/pickling the state and encoding the bytes of the
+ resulting string into an integer).
+ \versionadded{3.0}
+\end{methoddesc}
+
+\begin{methoddesc}{setstate}{state}
+ Set the state of the encoder to \var{state}. \var{state} must be an
+ encoder state returned by \method{getstate}.
+ \versionadded{3.0}
+\end{methoddesc}
+
\subsubsection{IncrementalDecoder Objects \label{incremental-decoder-objects}}
@@ -453,6 +468,27 @@ define in order to be compatible with the Python codec registry.
Reset the decoder to the initial state.
\end{methoddesc}
+\begin{methoddesc}{getstate}{}
+ Return the current state of the decoder. This must be a tuple with two
+ items, the first must be the buffer containing the still undecoded input.
+ The second must be an integer and can be additional state info.
+ (The implementation should make sure that \code{0} is the most common
+ additional state info.) If this additional state info is \code{0} it must
+ be possible to set the decoder to the state which has no input buffered
+ and \code{0} as the additional state info, so that feeding the previously
+ buffered input to the decoder returns it to the previous state without
+ producing any output. (Additional state info that is more complicated
+ than integers can be converted into an integer by marshaling/pickling
+ the info and encoding the bytes of the resulting string into an integer.)
+ \versionadded{3.0}
+\end{methoddesc}
+
+\begin{methoddesc}{setstate}{state}
+ Set the state of the decoder to \var{state}. \var{state} must be a
+ decoder state returned by \method{getstate}.
+ \versionadded{3.0}
+\end{methoddesc}
+
The \class{StreamWriter} and \class{StreamReader} classes provide
generic working interfaces which can be used to implement new
diff --git a/Lib/codecs.py b/Lib/codecs.py
index e4e14cf..185ad42 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -87,7 +87,9 @@ class CodecInfo(tuple):
return self
def __repr__(self):
- return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
+ return "<%s.%s object for encoding %s at 0x%x>" % \
+ (self.__class__.__module__, self.__class__.__name__,
+ self.name, id(self))
class Codec:
@@ -155,9 +157,9 @@ class Codec:
class IncrementalEncoder(object):
"""
- An IncrementalEncoder encodes an input in multiple steps. The input can be
- passed piece by piece to the encode() method. The IncrementalEncoder remembers
- the state of the Encoding process between calls to encode().
+ An IncrementalEncoder encodes an input in multiple steps. The input can
+ be passed piece by piece to the encode() method. The IncrementalEncoder
+ remembers the state of the encoding process between calls to encode().
"""
def __init__(self, errors='strict'):
"""
@@ -181,6 +183,18 @@ class IncrementalEncoder(object):
Resets the encoder to the initial state.
"""
+ def getstate(self):
+ """
+ Return the current state of the encoder.
+ """
+ return 0
+
+ def setstate(self, state):
+ """
+ Set the current state of the encoder. state must have been
+ returned by getstate().
+ """
+
class BufferedIncrementalEncoder(IncrementalEncoder):
"""
This subclass of IncrementalEncoder can be used as the baseclass for an
@@ -189,7 +203,8 @@ class BufferedIncrementalEncoder(IncrementalEncoder):
"""
def __init__(self, errors='strict'):
IncrementalEncoder.__init__(self, errors)
- self.buffer = "" # unencoded input that is kept between calls to encode()
+ # unencoded input that is kept between calls to encode()
+ self.buffer = ""
def _buffer_encode(self, input, errors, final):
# Overwrite this method in subclasses: It must encode input
@@ -208,10 +223,16 @@ class BufferedIncrementalEncoder(IncrementalEncoder):
IncrementalEncoder.reset(self)
self.buffer = ""
+ def getstate(self):
+ return self.buffer or 0
+
+ def setstate(self, state):
+ self.buffer = state or ""
+
class IncrementalDecoder(object):
"""
- An IncrementalDecoder decodes an input in multiple steps. The input can be
- passed piece by piece to the decode() method. The IncrementalDecoder
+ An IncrementalDecoder decodes an input in multiple steps. The input can
+ be passed piece by piece to the decode() method. The IncrementalDecoder
remembers the state of the decoding process between calls to decode().
"""
def __init__(self, errors='strict'):
@@ -235,15 +256,29 @@ class IncrementalDecoder(object):
Resets the decoder to the initial state.
"""
+ def getstate(self):
+ """
+ Return the current state of the decoder. This must be a
+ (buffered_input, additional_state_info) tuple.
+ """
+ return ("", 0)
+
+ def setstate(self, state):
+ """
+ Set the current state of the decoder. state must have been
+ returned by getstate().
+ """
+
class BufferedIncrementalDecoder(IncrementalDecoder):
"""
This subclass of IncrementalDecoder can be used as the baseclass for an
- incremental decoder if the decoder must be able to handle incomplete byte
- sequences.
+ incremental decoder if the decoder must be able to handle incomplete
+ byte sequences.
"""
def __init__(self, errors='strict'):
IncrementalDecoder.__init__(self, errors)
- self.buffer = "" # undecoded input that is kept between calls to decode()
+ # undecoded input that is kept between calls to decode()
+ self.buffer = ""
def _buffer_decode(self, input, errors, final):
# Overwrite this method in subclasses: It must decode input
@@ -262,6 +297,14 @@ class BufferedIncrementalDecoder(IncrementalDecoder):
IncrementalDecoder.reset(self)
self.buffer = ""
+ def getstate(self):
+ # additional state info is always 0
+ return (self.buffer, 0)
+
+ def setstate(self, state):
+ # ignore additional state info
+ self.buffer = state[0]
+
#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
@@ -424,7 +467,8 @@ class StreamReader(Codec):
newchars, decodedbytes = self.decode(data, self.errors)
except UnicodeDecodeError as exc:
if firstline:
- newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
+ newchars, decodedbytes = \
+ self.decode(data[:exc.start], self.errors)
lines = newchars.splitlines(True)
if len(lines)<=1:
raise
diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py
index eff08f3..cf096b5 100644
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -34,6 +34,22 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
codecs.IncrementalEncoder.reset(self)
self.encoder = None
+ def getstate(self):
+ # state info we return to the caller:
+ # 0: stream is in natural order for this platform
+ # 2: endianness hasn't been determined yet
+ # (we're never writing in unnatural order)
+ return (2 if self.encoder is None else 0)
+
+ def setstate(self, state):
+ if state:
+ self.encoder = None
+ else:
+ if sys.byteorder == 'little':
+ self.encoder = codecs.utf_16_le_encode
+ else:
+ self.encoder = codecs.utf_16_be_encode
+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def __init__(self, errors='strict'):
codecs.BufferedIncrementalDecoder.__init__(self, errors)
@@ -56,6 +72,35 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
codecs.BufferedIncrementalDecoder.reset(self)
self.decoder = None
+ def getstate(self):
+ # additional state info from the base class must be 0 here,
+ # as it isn't passed along to the caller
+ state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
+ # additional state info we pass to the caller:
+ # 0: stream is in natural order for this platform
+ # 1: stream is in unnatural order
+ # 2: endianness hasn't been determined yet
+ if self.decoder is None:
+ return (state, 2)
+ addstate = int((sys.byteorder == "big") !=
+ (self.decoder is codecs.utf_16_be_decode))
+ return (state, addstate)
+
+ def setstate(self, state):
+ # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
+ codecs.BufferedIncrementalDecoder.setstate(self, state)
+ state = state[1]
+ if state == 0:
+ self.decoder = (codecs.utf_16_be_decode
+ if sys.byteorder == "big"
+ else codecs.utf_16_le_decode)
+ elif state == 1:
+ self.decoder = (codecs.utf_16_le_decode
+ if sys.byteorder == "big"
+ else codecs.utf_16_be_decode)
+ else:
+ self.decoder = None
+
class StreamWriter(codecs.StreamWriter):
def __init__(self, stream, errors='strict'):
self.bom_written = False
diff --git a/Lib/encodings/utf_8_sig.py b/Lib/encodings/utf_8_sig.py
index d751da6..a0cc1af 100644
--- a/Lib/encodings/utf_8_sig.py
+++ b/Lib/encodings/utf_8_sig.py
@@ -12,7 +12,8 @@ import codecs
### Codec APIs
def encode(input, errors='strict'):
- return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
+ return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
+ len(input))
def decode(input, errors='strict'):
prefix = 0
@@ -25,38 +26,61 @@ def decode(input, errors='strict'):
class IncrementalEncoder(codecs.IncrementalEncoder):
def __init__(self, errors='strict'):
codecs.IncrementalEncoder.__init__(self, errors)
- self.first = True
+ self.first = 1
def encode(self, input, final=False):
if self.first:
- self.first = False
- return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
+ self.first = 0
+ return codecs.BOM_UTF8 + \
+ codecs.utf_8_encode(input, self.errors)[0]
else:
return codecs.utf_8_encode(input, self.errors)[0]
def reset(self):
codecs.IncrementalEncoder.reset(self)
- self.first = True
+ self.first = 1
+
+ def getstate(self):
+ return self.first
+
+ def setstate(self, state):
+ self.first = state
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def __init__(self, errors='strict'):
codecs.BufferedIncrementalDecoder.__init__(self, errors)
- self.first = True
+ self.first = 1
def _buffer_decode(self, input, errors, final):
- if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM
+ if self.first:
if len(input) < 3:
- # not enough data to decide if this really is a BOM
- # => try again on the next call
- return (u"", 0)
- (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
- self.first = False
- return (output, consumed+3)
+ if codecs.BOM_UTF8.startswith(input):
+ # not enough data to decide if this really is a BOM
+ # => try again on the next call
+ return (u"", 0)
+ else:
+ self.first = 0
+ else:
+ self.first = 0
+ if input[:3] == codecs.BOM_UTF8:
+ (output, consumed) = \
+ codecs.utf_8_decode(input[3:], errors, final)
+ return (output, consumed+3)
return codecs.utf_8_decode(input, errors, final)
def reset(self):
codecs.BufferedIncrementalDecoder.reset(self)
- self.first = True
+ self.first = 1
+
+ def getstate(self):
+ state = codecs.BufferedIncrementalDecoder.getstate(self)
+ # state[1] must be 0 here, as it isn't passed along to the caller
+ return (state[0], self.first)
+
+ def setstate(self, state):
+ # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
+ codecs.BufferedIncrementalDecoder.setstate(self, state)
+ self.first = state[1]
class StreamWriter(codecs.StreamWriter):
def reset(self):
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 44ce8eb..f7a9789 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -23,7 +23,40 @@ class Queue(object):
self._buffer = self._buffer[size:]
return s
-class ReadTest(unittest.TestCase):
+class MixInCheckStateHandling:
+ def check_state_handling_decode(self, encoding, u, s):
+ for i in xrange(len(s)+1):
+ d = codecs.getincrementaldecoder(encoding)()
+ part1 = d.decode(s[:i])
+ state = d.getstate()
+ self.assert_(isinstance(state[1], int))
+ # Check that the condition stated in the documentation for
+ # IncrementalDecoder.getstate() holds
+ if not state[1]:
+ # reset decoder to the default state without anything buffered
+ d.setstate((state[0][:0], 0))
+ # Feeding the previous input must not produce any output
+ self.assert_(not d.decode(state[0]))
+ # The decoder must return to the same state
+ self.assertEqual(state, d.getstate())
+ # Create a new decoder and set it to the state
+ # we extracted from the old one
+ d = codecs.getincrementaldecoder(encoding)()
+ d.setstate(state)
+ part2 = d.decode(s[i:], True)
+ self.assertEqual(u, part1+part2)
+
+ def check_state_handling_encode(self, encoding, u, s):
+ for i in xrange(len(u)+1):
+ d = codecs.getincrementalencoder(encoding)()
+ part1 = d.encode(u[:i])
+ state = d.getstate()
+ d = codecs.getincrementalencoder(encoding)()
+ d.setstate(state)
+ part2 = d.encode(u[i:], True)
+ self.assertEqual(s, part1+part2)
+
+class ReadTest(unittest.TestCase, MixInCheckStateHandling):
def check_partial(self, input, partialresults):
# get a StreamReader for the encoding and feed the bytestring version
# of input to the reader byte by byte. Read every available from
@@ -292,7 +325,14 @@ class UTF16Test(ReadTest):
)
def test_errors(self):
- self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
+ self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
+ "\xff", "strict", True)
+
+ def test_decoder_state(self):
+ self.check_state_handling_decode(self.encoding,
+ u"spamspam", self.spamle)
+ self.check_state_handling_decode(self.encoding,
+ u"spamspam", self.spambe)
class UTF16LETest(ReadTest):
encoding = "utf-16-le"
@@ -313,7 +353,8 @@ class UTF16LETest(ReadTest):
)
def test_errors(self):
- self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
+ self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
+ "\xff", "strict", True)
class UTF16BETest(ReadTest):
encoding = "utf-16-be"
@@ -334,7 +375,8 @@ class UTF16BETest(ReadTest):
)
def test_errors(self):
- self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
+ self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
+ "\xff", "strict", True)
class UTF8Test(ReadTest):
encoding = "utf-8"
@@ -357,6 +399,11 @@ class UTF8Test(ReadTest):
]
)
+ def test_decoder_state(self):
+ u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
+ self.check_state_handling_decode(self.encoding,
+ u, u.encode(self.encoding))
+
class UTF7Test(ReadTest):
encoding = "utf-7"
@@ -429,6 +476,16 @@ class UTF8SigTest(ReadTest):
# SF bug #1601501: check that the codec works with a buffer
unicode("\xef\xbb\xbf", "utf-8-sig")
+ def test_bom(self):
+ d = codecs.getincrementaldecoder("utf-8-sig")()
+ s = u"spam"
+ self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
+
+ def test_decoder_state(self):
+ u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
+ self.check_state_handling_decode(self.encoding,
+ u, u.encode(self.encoding))
+
class EscapeDecodeTest(unittest.TestCase):
def test_empty(self):
self.assertEquals(codecs.escape_decode(""), ("", 0))
@@ -1066,7 +1123,11 @@ broken_unicode_with_streams = [
"punycode",
"unicode_internal"
]
-broken_incremental_coders = broken_unicode_with_streams[:]
+broken_incremental_coders = broken_unicode_with_streams + [
+ "idna",
+ "zlib_codec",
+ "bz2_codec",
+]
# The following encodings only support "strict" mode
only_strict_mode = [
@@ -1091,7 +1152,7 @@ else:
all_unicode_encodings.append("zlib_codec")
broken_unicode_with_streams.append("zlib_codec")
-class BasicUnicodeTest(unittest.TestCase):
+class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self):
s = u"abc123" # all codecs should be able to encode these
for encoding in all_unicode_encodings:
@@ -1215,6 +1276,14 @@ class BasicUnicodeTest(unittest.TestCase):
table_type = type(cp1140.encoding_table)
self.assertEqual(table_type, table_type)
+ def test_decoder_state(self):
+ # Check that getstate() and setstate() handle the state properly
+ u = u"abc123"
+ for encoding in all_unicode_encodings:
+ if encoding not in broken_incremental_coders:
+ self.check_state_handling_decode(encoding, u, u.encode(encoding))
+ self.check_state_handling_encode(encoding, u, u.encode(encoding))
+
class BasicStrTest(unittest.TestCase):
def test_basics(self):
s = "abc123"