diff options
author | Guido van Rossum <guido@python.org> | 2007-04-17 02:38:04 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2007-04-17 02:38:04 (GMT) |
commit | d76e7796c97a8f4d555a687609bd96f01f68b7a1 (patch) | |
tree | 7e93e6ec7945842c719a09598b6cfe7f3e6732b2 /Lib | |
parent | 3abcb013b8195aea38f80968d4111b5ac7e68c0b (diff) | |
download | cpython-d76e7796c97a8f4d555a687609bd96f01f68b7a1.zip cpython-d76e7796c97a8f4d555a687609bd96f01f68b7a1.tar.gz cpython-d76e7796c97a8f4d555a687609bd96f01f68b7a1.tar.bz2 |
Instead of pickling the whole decoder, use the new getstate/setstate API.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/io.py | 60 | ||||
-rw-r--r-- | Lib/test/test_io.py | 30 |
2 files changed, 57 insertions, 33 deletions
@@ -18,7 +18,7 @@ XXX don't use assert to validate input requirements
 XXX whenever an argument is None, use the default value
 XXX read/write ops should check readable/writable
 XXX buffered readinto should work with arbitrary buffer objects
-XXX use incremental encoder for text output, at least for UTF-16
+XXX use incremental encoder for text output, at least for UTF-16 and UTF-8-SIG
 """
 
 __author__ = ("Guido van Rossum <guido@python.org>, "
@@ -36,11 +36,6 @@ import codecs
 import _fileio
 import warnings
 
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-
 # XXX Shouldn't we use st_blksize whenever we can?
 DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
 
@@ -957,17 +952,16 @@ class TextIOWrapper(TextIOBase):
         self._newline = newline or os.linesep
         self._fix_newlines = newline is None
         self._decoder = None
-        self._decoder_in_rest_pickle = None
         self._pending = ""
         self._snapshot = None
         self._seekable = self._telling = self.buffer.seekable()
 
     # A word about _snapshot. This attribute is either None, or a
-    # tuple (decoder_pickle, readahead, pending) where decoder_pickle
-    # is a pickled decoder state, readahead is the chunk of bytes that
-    # was read, and pending is the characters that were rendered by
-    # the decoder after feeding it those bytes. We use this to
-    # reconstruct intermediate decoder states in tell().
+    # tuple (decoder_state, readahead, pending) where decoder_state is
+    # the second (integer) item of the decoder state, readahead is the
+    # chunk of bytes that was read, and pending is the characters that
+    # were rendered by the decoder after feeding it those bytes. We
+    # use this to reconstruct intermediate decoder states in tell().
 
     def _seekable(self):
         return self._seekable
@@ -1005,10 +999,6 @@ class TextIOWrapper(TextIOBase):
             raise IOError("Can't find an incremental decoder for encoding %s" %
                           self._encoding)
         decoder = self._decoder = make_decoder() # XXX: errors
-        if isinstance(decoder, codecs.BufferedIncrementalDecoder):
-            # XXX Hack: make the codec use bytes instead of strings
-            decoder.buffer = b""
-        self._decoder_in_rest_pickle = pickle.dumps(decoder, 2) # For tell()
         return decoder
 
     def _read_chunk(self):
@@ -1017,15 +1007,13 @@ class TextIOWrapper(TextIOBase):
             readahead = self.buffer.read1(self._CHUNK_SIZE)
             pending = self._decoder.decode(readahead, not readahead)
             return readahead, pending
-        decoder_state = pickle.dumps(self._decoder, 2)
+        decoder_buffer, decoder_state = self._decoder.getstate()
         readahead = self.buffer.read1(self._CHUNK_SIZE)
         pending = self._decoder.decode(readahead, not readahead)
-        self._snapshot = (decoder_state, readahead, pending)
+        self._snapshot = (decoder_state, decoder_buffer + readahead, pending)
         return readahead, pending
 
     def _encode_decoder_state(self, ds, pos):
-        if ds == self._decoder_in_rest_pickle:
-            return pos
         x = 0
         for i in bytes(ds):
             x = x<<8 | i
@@ -1048,7 +1036,8 @@ class TextIOWrapper(TextIOBase):
             raise IOError("Telling position disabled by next() call")
         self.flush()
         position = self.buffer.tell()
-        if self._decoder is None or self._snapshot is None:
+        decoder = self._decoder
+        if decoder is None or self._snapshot is None:
             assert self._pending == ""
             return position
         decoder_state, readahead, pending = self._snapshot
@@ -1056,15 +1045,21 @@ class TextIOWrapper(TextIOBase):
         needed = len(pending) - len(self._pending)
         if not needed:
             return self._encode_decoder_state(decoder_state, position)
-        decoder = pickle.loads(decoder_state)
-        n = 0
-        bb = bytes(1)
-        for i, bb[0] in enumerate(readahead):
-            n += len(decoder.decode(bb))
-            if n >= needed:
-                decoder_state = pickle.dumps(decoder, 2)
-                return self._encode_decoder_state(decoder_state, position+i+1)
-        raise IOError("Can't reconstruct logical file position")
+        saved_state = decoder.getstate()
+        try:
+            decoder.setstate(("", decoder_state))
+            n = 0
+            bb = bytes(1)
+            for i, bb[0] in enumerate(readahead):
+                n += len(decoder.decode(bb))
+                if n >= needed:
+                    decoder_buffer, decoder_state = decoder.getstate()
+                    return self._encode_decoder_state(
+                        decoder_state,
+                        position + (i+1) - len(decoder_buffer))
+            raise IOError("Can't reconstruct logical file position")
+        finally:
+            decoder.setstate(saved_state)
 
     def seek(self, pos, whence=0):
         if not self._seekable:
@@ -1097,12 +1092,11 @@ class TextIOWrapper(TextIOBase):
             self._pending = ""
             self._decoder = None
             return pos
-        decoder = pickle.loads(ds)
+        decoder = self._decoder or self._get_decoder()
+        decoder.set_state(("", ds))
         self.buffer.seek(pos)
         self._snapshot = (ds, b"", "")
         self._pending = ""
-        if not self._decoder_in_rest_pickle:
-            self._get_decoder() # For its side effect
         self._decoder = decoder
         return orig_pos
 
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 1f6be02..0b63fee 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -581,6 +581,36 @@ class TextIOWrapperTest(unittest.TestCase):
         self.assertEquals(f.tell(), p2)
         f.close()
 
+    def testSeeking(self):
+        chunk_size = io.TextIOWrapper._CHUNK_SIZE
+        prefix_size = chunk_size - 2
+        u_prefix = u"a" * prefix_size
+        prefix = bytes(u_prefix.encode("utf-8"))
+        self.assertEquals(len(u_prefix), len(prefix))
+        u_suffix = u"\u8888\n"
+        suffix = bytes(u_suffix.encode("utf-8"))
+        line = prefix + suffix
+        f = io.open(test_support.TESTFN, "wb")
+        f.write(line*2)
+        f.close()
+        f = io.open(test_support.TESTFN, "r", encoding="utf-8")
+        s = f.read(prefix_size)
+        self.assertEquals(s, prefix)
+        self.assertEquals(f.tell(), prefix_size)
+        self.assertEquals(f.readline(), u_suffix)
+
+    def testSeekingToo(self):
+        # Regression test for a specific bug
+        data = b'\xe0\xbf\xbf\n'
+        f = io.open(test_support.TESTFN, "wb")
+        f.write(data)
+        f.close()
+        f = io.open(test_support.TESTFN, "r", encoding="utf-8")
+        f._CHUNK_SIZE # Just test that it exists
+        f._CHUNK_SIZE = 2
+        f.readline()
+        f.tell()
+
     def timingTest(self):
         timer = time.time
         enc = "utf8"