diff options
-rw-r--r-- | Lib/io.py | 146 | ||||
-rw-r--r-- | Lib/test/test_io.py | 38 |
2 files changed, 69 insertions, 115 deletions
@@ -1180,14 +1180,14 @@ class TextIOWrapper(TextIOBase): self._encoder = None self._decoder = None self._decoded_text = "" # buffer for text produced by decoder - self._decoded_text_offset = 0 # offset to text returned by read() self._snapshot = None # info for reconstructing decoder state self._seekable = self._telling = self.buffer.seekable() # A word about _snapshot. This attribute is either None, or a tuple - # (decoder_state, next_input) where decoder_state is the second - # (integer) item of the decoder state, and next_input is the chunk - # of bytes that comes after the snapshot point in the input. + # (decoder_state, input_chunk, decoded_chars) where decoder_state is + # the second (integer) item of the decoder state, input_chunk is the + # chunk of bytes that was read, and decoded_chars is the number of + # characters rendered by the decoder after feeding it those bytes. # We use this to reconstruct intermediate decoder states in tell(). # Naming convention: @@ -1271,10 +1271,10 @@ class TextIOWrapper(TextIOBase): """ Read and decode the next chunk of data from the BufferedReader. - The return value is True unless EOF was reached. The decoded string - is placed in self._decoded_text (replacing its previous value). - (The entire input chunk is sent to the decoder, though some of it - may remain buffered in the decoder, yet to be converted.) + Return a tuple of two elements: all the bytes that were read, and + the decoded string produced by the decoder. (The entire input + chunk is sent to the decoder, but some of it may remain buffered + in the decoder, yet to be converted.) """ if self._decoder is None: @@ -1283,9 +1283,8 @@ class TextIOWrapper(TextIOBase): # No one should call tell(), so don't bother taking a snapshot. input_chunk = self.buffer.read1(self._CHUNK_SIZE) eof = not input_chunk - self._decoded_text = self._decoder.decode(input_chunk, eof) - self._decoded_text_offset = 0 - return not eof + decoded = self._decoder.decode(input_chunk, eof) + return (input_chunk, decoded) # The cookie returned by tell() cannot include the contents of # the decoder's buffer, so we need to snapshot a point in the @@ -1299,15 +1298,16 @@ class TextIOWrapper(TextIOBase): input_chunk = self.buffer.read1(self._CHUNK_SIZE) eof = not input_chunk - self._decoded_text = self._decoder.decode(input_chunk, eof) - self._decoded_text_offset = 0 + decoded = self._decoder.decode(input_chunk, eof) - # At the snapshot point, len(dec_buffer) bytes ago, the next input - # to be passed to the decoder is dec_buffer + input_chunk. - self._snapshot = (dec_flags, dec_buffer + input_chunk) - return not eof + # At the snapshot point len(dec_buffer) bytes ago, the next input + # to be passed to the decoder is dec_buffer + input_chunk. Save + # len(decoded) so that later, tell() can figure out how much + # decoded data has been used up by TextIOWrapper.read(). + self._snapshot = (dec_flags, dec_buffer + input_chunk, len(decoded)) + return (input_chunk, decoded) - def _pack_cookie(self, position, dec_flags=0, + def _encode_tell_cookie(self, position, dec_flags=0, feed_bytes=0, need_eof=0, skip_chars=0): # The meaning of a tell() cookie is: seek to position, set the # decoder flags to dec_flags, read feed_bytes bytes, feed them @@ -1317,7 +1317,7 @@ class TextIOWrapper(TextIOBase): return (position | (dec_flags<<64) | (feed_bytes<<128) | (skip_chars<<192) | bool(need_eof)<<256) - def _unpack_cookie(self, bigint): + def _decode_tell_cookie(self, bigint): rest, position = divmod(bigint, 1<<64) rest, dec_flags = divmod(rest, 1<<64) rest, feed_bytes = divmod(rest, 1<<64) @@ -1339,14 +1339,14 @@ class TextIOWrapper(TextIOBase): return position # Skip backward to the snapshot point (see _read_chunk). - dec_flags, next_input = self._snapshot + dec_flags, next_input, decoded_chars = self._snapshot position -= len(next_input) - # How many decoded characters have been returned since the snapshot? - skip_chars = self._decoded_text_offset + # How many decoded characters have been consumed since the snapshot? + skip_chars = decoded_chars - len(self._decoded_text) if skip_chars == 0: # We haven't moved from the snapshot point. - return self._pack_cookie(position, dec_flags) + return self._encode_tell_cookie(position, dec_flags) # Walk the decoder forward, one byte at a time, to find the minimum # input necessary to give us the decoded characters we need to skip. @@ -1373,8 +1373,8 @@ class TextIOWrapper(TextIOBase): if decoded_chars >= skip_chars: break else: - # We didn't get enough decoded data; signal EOF to get more. - decoded = decoder.decode(b"", final=True) + # We didn't get enough decoded data; send EOF to get more. + decoded = decoder.decode(b"", True) decoded_chars += len(decoded) need_eof = 1 if decoded_chars < skip_chars: @@ -1385,7 +1385,7 @@ class TextIOWrapper(TextIOBase): position += safe_fed_bytes fed_bytes -= safe_fed_bytes skip_chars -= safe_decoded_chars - return self._pack_cookie( + return self._encode_tell_cookie( position, dec_flags, fed_bytes, need_eof, skip_chars) finally: decoder.setstate(saved_state) @@ -1405,7 +1405,8 @@ class TextIOWrapper(TextIOBase): raise IOError("can't do nonzero end-relative seeks") self.flush() position = self.buffer.seek(0, 2) - self._clear_decoded_text() + self._decoded_text = "" + self._snapshot = None if self._decoder: self._decoder.reset() return position @@ -1418,70 +1419,48 @@ class TextIOWrapper(TextIOBase): # Seek back to the snapshot point. position, dec_flags, feed_bytes, need_eof, skip_chars = \ - self._unpack_cookie(cookie) + self._decode_tell_cookie(cookie) self.buffer.seek(position) - self._clear_decoded_text() + self._decoded_text = "" + self._snapshot = None if self._decoder or dec_flags or feed_bytes or need_eof: # Restore the decoder flags to their values from the snapshot. self._decoder = self._decoder or self._get_decoder() self._decoder.setstate((b"", dec_flags)) - self._snapshot = (dec_flags, b'') if feed_bytes or need_eof: # Feed feed_bytes bytes to the decoder. input_chunk = self.buffer.read(feed_bytes) - self._decoded_text = self._decoder.decode(input_chunk, need_eof) - if len(self._decoded_text) < skip_chars: + decoded = self._decoder.decode(input_chunk, need_eof) + if len(decoded) < skip_chars: raise IOError("can't restore logical file position") # Skip skip_chars of the decoded characters. - self._decoded_text_offset = skip_chars + self._decoded_text = decoded[skip_chars:] # Restore the snapshot. - self._snapshot = (dec_flags, input_chunk) + self._snapshot = (dec_flags, input_chunk, len(decoded)) return cookie - def _clear_decoded_text(self): - """Reset the _decoded_text buffer.""" - self._decoded_text = '' - self._decoded_text_offset = 0 - self._snapshot = None - - def _emit_decoded_text(self, n=None): - """Advance into the _decoded_text buffer.""" - offset = self._decoded_text_offset - if n is None: - text = self._decoded_text[offset:] - else: - text = self._decoded_text[offset:offset + n] - self._decoded_text_offset += len(text) - return text - - def _unemit_decoded_text(self, n): - """Rewind the _decoded_text buffer.""" - if self._decoded_text_offset < n: - raise AssertionError("unemit out of bounds") - self._decoded_text_offset -= n - def read(self, n=None): if n is None: n = -1 decoder = self._decoder or self._get_decoder() + result = self._decoded_text if n < 0: - # Read everything. - result = (self._emit_decoded_text() + - decoder.decode(self.buffer.read(), final=True)) - self._clear_decoded_text() + result += decoder.decode(self.buffer.read(), True) + self._decoded_text = "" + self._snapshot = None return result else: - # Keep reading chunks until we have n characters to return. - eof = False - result = self._emit_decoded_text(n) - while len(result) < n and not eof: - eof = not self._read_chunk() - result += self._emit_decoded_text(n - len(result)) - return result + while len(result) < n: + input_chunk, decoded = self._read_chunk() + result += decoded + if not input_chunk: + break + self._decoded_text = result[n:] + return result[:n] def __next__(self): self._telling = False @@ -1495,20 +1474,21 @@ class TextIOWrapper(TextIOBase): def readline(self, limit=None): if limit is None: limit = -1 + if limit >= 0: + # XXX Hack to support limit argument, for backwards compatibility + line = self.readline() + if len(line) <= limit: + return line + line, self._decoded_text = \ + line[:limit], line[limit:] + self._decoded_text + return line - # Grab all the decoded text (we will rewind any extra bits later). - line = self._emit_decoded_text() - + line = self._decoded_text start = 0 decoder = self._decoder or self._get_decoder() pos = endpos = None while True: - if limit >= 0 and len(line) >= limit: - # Length limit has been reached. - endpos = limit - break - if self._readtranslate: # Newlines are already translated, only search for \n pos = line.find('\n', start) @@ -1558,18 +1538,20 @@ class TextIOWrapper(TextIOBase): # No line ending seen yet - get more data more_line = '' - while self._read_chunk(): - if self._decoded_text: + while True: + readahead, pending = self._read_chunk() + more_line = pending + if more_line or not readahead: break - if self._decoded_text: - line += self._emit_decoded_text() + if more_line: + line += more_line else: # end of file - self._clear_decoded_text() + self._decoded_text = '' + self._snapshot = None return line - # Rewind _decoded_text to just after the line ending we found. - self._unemit_decoded_text(len(line) - endpos) + self._decoded_text = line[endpos:] return line[:endpos] @property diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 27814a0..49404e1 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -590,9 +590,7 @@ class StatefulIncrementalDecoderTest(unittest.TestCase): # I=0, O=3 (b'i.o3.x.xyz.toolong.', False, 'x--.xyz.too.'), # I=6, O=3 - (b'i.o3.i6.abcdefghijklmnop', True, 'abc.ghi.mno.'), - # I=5, O=8 with newlines - (b'i.o8.i5.abc\ndef\nghy\nz', True, 'abc\nd---.ef\ngh---.y\nz-----.') + (b'i.o3.i6.abcdefghijklmnop', True, 'abc.ghi.mno.') ] def testDecoder(self): @@ -892,8 +890,8 @@ class TextIOWrapperTest(unittest.TestCase): return codecs.CodecInfo( name='test_decoder', encode=None, decode=None, incrementalencoder=None, - incrementaldecoder=StatefulIncrementalDecoder, - streamreader=None, streamwriter=None) + streamreader=None, streamwriter=None, + incrementaldecoder=StatefulIncrementalDecoder) def testSeekAndTellWithData(data, min_pos=0): """Tell/seek to various points within a data stream and ensure @@ -905,42 +903,16 @@ class TextIOWrapperTest(unittest.TestCase): decoded = f.read() f.close() - # Use read() to move to various positions in the input; - # then tell, read some more data, and seek back. - for i in range(min_pos, len(decoded) + 1): # to read before tell - for j in [1, 5, len(decoded)]: # to read after tell + for i in range(min_pos, len(decoded) + 1): # seek positions + for j in [1, 5, len(decoded) - i]: # read lengths f = io.open(test_support.TESTFN, encoding='test_decoder') self.assertEquals(f.read(i), decoded[:i]) cookie = f.tell() self.assertEquals(f.read(j), decoded[i:i + j]) f.seek(cookie) - self.assertEquals(f.tell(), cookie) self.assertEquals(f.read(), decoded[i:]) f.close() - lines = len(decoded.split('\n')) - - # Use readline() to move to various positions in the input; - # then tell, read some more data, and seek back. - for limit in [-1, 4, 128]: # 'limit' argument for readline() - for j in [1, 5, len(decoded)]: # to read after tell() - f = io.open(test_support.TESTFN, encoding='test_decoder') - text = '' - for k in range(lines): # repeatedly call readline() - line = f.readline(limit=limit) - if limit >= 0: - self.assert_(len(line) <= limit) - text += line - i = len(text) - self.assertEquals(text, decoded[:i]) - cookie = f.tell() - self.assertEquals(f.read(j), decoded[i:i + j]) - f.seek(cookie) - self.assertEquals(f.tell(), cookie) - self.assertEquals(f.read(), decoded[i:]) - f.seek(cookie) - f.close() - # Register a special incremental decoder for testing. codecs.register(lookupTestDecoder) self.codecEnabled = 1 |