diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2011-02-25 20:27:33 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2011-02-25 20:27:33 (GMT) |
commit | 211b81dd0916ce6e83f23b222bc06d45b904a838 (patch) | |
tree | badd58c036b3f0395de14a4a16a148640a1f2367 /Lib/_pyio.py | |
parent | d2751fb48203f5a8e1493b45974a260bb3e249bc (diff) | |
download | cpython-211b81dd0916ce6e83f23b222bc06d45b904a838.zip cpython-211b81dd0916ce6e83f23b222bc06d45b904a838.tar.gz cpython-211b81dd0916ce6e83f23b222bc06d45b904a838.tar.bz2 |
Issue #11114: Fix catastrophic performance of tell() on text files (up
to 1000x faster in some cases). It is still one to two order of magnitudes
slower than binary tell().
Diffstat (limited to 'Lib/_pyio.py')
-rw-r--r-- | Lib/_pyio.py | 58 |
1 files changed, 50 insertions, 8 deletions
diff --git a/Lib/_pyio.py b/Lib/_pyio.py index a5d6135..ad5bfcc 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1488,6 +1488,7 @@ class TextIOWrapper(TextIOBase): self._decoded_chars_used = 0 # offset into _decoded_chars for read() self._snapshot = None # info for reconstructing decoder state self._seekable = self._telling = self.buffer.seekable() + self._b2cratio = 0.0 if self._seekable and self.writable(): position = self.buffer.tell() @@ -1655,7 +1656,12 @@ class TextIOWrapper(TextIOBase): # Read a chunk, decode it, and put the result in self._decoded_chars. input_chunk = self.buffer.read1(self._CHUNK_SIZE) eof = not input_chunk - self._set_decoded_chars(self._decoder.decode(input_chunk, eof)) + decoded_chars = self._decoder.decode(input_chunk, eof) + self._set_decoded_chars(decoded_chars) + if decoded_chars: + self._b2cratio = len(input_chunk) / len(self._decoded_chars) + else: + self._b2cratio = 0.0 if self._telling: # At the snapshot point, len(dec_buffer) bytes before the read, @@ -1709,20 +1715,56 @@ class TextIOWrapper(TextIOBase): # forward until it gives us enough decoded characters. saved_state = decoder.getstate() try: + # Fast search for an acceptable start point, close to our + # current pos. + # Rationale: calling decoder.decode() has a large overhead + # regardless of chunk size; we want the number of such calls to + # be O(1) in most situations (common decoders, non-crazy input). + # Actually, it will be exactly 1 for fixed-size codecs (all + # 8-bit codecs, also UTF-16 and UTF-32). + skip_bytes = int(self._b2cratio * chars_to_skip) + skip_back = 1 + assert skip_bytes <= len(next_input) + while skip_bytes > 0: + decoder.setstate((b'', dec_flags)) + # Decode up to temptative start point + n = len(decoder.decode(next_input[:skip_bytes])) + if n <= chars_to_skip: + b, d = decoder.getstate() + if not b: + # Before pos and no bytes buffered in decoder => OK + dec_flags = d + chars_to_skip -= n + break + # Skip back by buffered amount and reset heuristic + skip_bytes -= len(b) + skip_back = 1 + else: + # We're too far ahead, skip back a bit + skip_bytes -= skip_back + skip_back = skip_back * 2 + else: + skip_bytes = 0 + decoder.setstate((b'', dec_flags)) + # Note our initial start point. - decoder.setstate((b'', dec_flags)) - start_pos = position - start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0 - need_eof = 0 + start_pos = position + skip_bytes + start_flags = dec_flags + if chars_to_skip == 0: + # We haven't moved from the start point. + return self._pack_cookie(start_pos, start_flags) # Feed the decoder one byte at a time. As we go, note the # nearest "safe start point" before the current location # (a point where the decoder has nothing buffered, so seek() # can safely start from there and advance to this location). - next_byte = bytearray(1) - for next_byte[0] in next_input: + bytes_fed = 0 + need_eof = 0 + # Chars decoded since `start_pos` + chars_decoded = 0 + for i in range(skip_bytes, len(next_input)): bytes_fed += 1 - chars_decoded += len(decoder.decode(next_byte)) + chars_decoded += len(decoder.decode(next_input[i:i+1])) dec_buffer, dec_flags = decoder.getstate() if not dec_buffer and chars_decoded <= chars_to_skip: # Decoder buffer is empty, so this is a safe start point. |