-rw-r--r--  Lib/io.py            146
-rw-r--r--  Lib/test/test_io.py   38
2 files changed, 69 insertions, 115 deletions
diff --git a/Lib/io.py b/Lib/io.py
index dbff96e..d3c9f85 100644
--- a/Lib/io.py
+++ b/Lib/io.py
@@ -1180,14 +1180,14 @@ class TextIOWrapper(TextIOBase):
self._encoder = None
self._decoder = None
self._decoded_text = "" # buffer for text produced by decoder
- self._decoded_text_offset = 0 # offset to text returned by read()
self._snapshot = None # info for reconstructing decoder state
self._seekable = self._telling = self.buffer.seekable()
# A word about _snapshot. This attribute is either None, or a tuple
- # (decoder_state, next_input) where decoder_state is the second
- # (integer) item of the decoder state, and next_input is the chunk
- # of bytes that comes after the snapshot point in the input.
+ # (decoder_state, next_input, decoded_chars) where decoder_state is
+ # the second (integer) item of the decoder state, next_input is the
+ # chunk of input bytes that comes after the snapshot point, and
+ # decoded_chars is the number of characters the decoder produced from them.
# We use this to reconstruct intermediate decoder states in tell().
# Naming convention:
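As a side note on the snapshot layout described in the comment above, here is a minimal, self-contained sketch (not part of the patch) of how the three-element tuple and the _decoded_text buffer fit together. It assumes a plain UTF-8 incremental decoder with nothing buffered at the snapshot point and a decoder_state of 0; the names mirror the patch but everything below is illustrative only.

import codecs

decoder = codecs.getincrementaldecoder("utf-8")()
input_chunk = "héllo wörld".encode("utf-8")
decoded = decoder.decode(input_chunk)         # what _read_chunk() returns alongside input_chunk
snapshot = (0, input_chunk, len(decoded))     # (decoder_state, next_input, decoded_chars)

# Suppose read(7) has handed out seven characters; the surplus stays buffered.
decoded_text = decoded[7:]

# tell() recovers the consumed count from the snapshot and the buffer:
skip_chars = snapshot[2] - len(decoded_text)
assert skip_chars == 7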
@@ -1271,10 +1271,10 @@ class TextIOWrapper(TextIOBase):
"""
Read and decode the next chunk of data from the BufferedReader.
- The return value is True unless EOF was reached. The decoded string
- is placed in self._decoded_text (replacing its previous value).
- (The entire input chunk is sent to the decoder, though some of it
- may remain buffered in the decoder, yet to be converted.)
+ Return a tuple of two elements: all the bytes that were read, and
+ the decoded string produced by the decoder. (The entire input
+ chunk is sent to the decoder, but some of it may remain buffered
+ in the decoder, yet to be converted.)
"""
if self._decoder is None:
@@ -1283,9 +1283,8 @@ class TextIOWrapper(TextIOBase):
# No one should call tell(), so don't bother taking a snapshot.
input_chunk = self.buffer.read1(self._CHUNK_SIZE)
eof = not input_chunk
- self._decoded_text = self._decoder.decode(input_chunk, eof)
- self._decoded_text_offset = 0
- return not eof
+ decoded = self._decoder.decode(input_chunk, eof)
+ return (input_chunk, decoded)
# The cookie returned by tell() cannot include the contents of
# the decoder's buffer, so we need to snapshot a point in the
@@ -1299,15 +1298,16 @@ class TextIOWrapper(TextIOBase):
input_chunk = self.buffer.read1(self._CHUNK_SIZE)
eof = not input_chunk
- self._decoded_text = self._decoder.decode(input_chunk, eof)
- self._decoded_text_offset = 0
+ decoded = self._decoder.decode(input_chunk, eof)
- # At the snapshot point, len(dec_buffer) bytes ago, the next input
- # to be passed to the decoder is dec_buffer + input_chunk.
- self._snapshot = (dec_flags, dec_buffer + input_chunk)
- return not eof
+ # At the snapshot point len(dec_buffer) bytes ago, the next input
+ # to be passed to the decoder is dec_buffer + input_chunk. Save
+ # len(decoded) so that later, tell() can figure out how much
+ # decoded data has been used up by TextIOWrapper.read().
+ self._snapshot = (dec_flags, dec_buffer + input_chunk, len(decoded))
+ return (input_chunk, decoded)
- def _pack_cookie(self, position, dec_flags=0,
+ def _encode_tell_cookie(self, position, dec_flags=0,
feed_bytes=0, need_eof=0, skip_chars=0):
# The meaning of a tell() cookie is: seek to position, set the
# decoder flags to dec_flags, read feed_bytes bytes, feed them
@@ -1317,7 +1317,7 @@ class TextIOWrapper(TextIOBase):
return (position | (dec_flags<<64) | (feed_bytes<<128) |
(skip_chars<<192) | bool(need_eof)<<256)
- def _unpack_cookie(self, bigint):
+ def _decode_tell_cookie(self, bigint):
rest, position = divmod(bigint, 1<<64)
rest, dec_flags = divmod(rest, 1<<64)
rest, feed_bytes = divmod(rest, 1<<64)
@@ -1339,14 +1339,14 @@ class TextIOWrapper(TextIOBase):
return position
# Skip backward to the snapshot point (see _read_chunk).
- dec_flags, next_input = self._snapshot
+ dec_flags, next_input, decoded_chars = self._snapshot
position -= len(next_input)
- # How many decoded characters have been returned since the snapshot?
- skip_chars = self._decoded_text_offset
+ # How many decoded characters have been consumed since the snapshot?
+ skip_chars = decoded_chars - len(self._decoded_text)
if skip_chars == 0:
# We haven't moved from the snapshot point.
- return self._pack_cookie(position, dec_flags)
+ return self._encode_tell_cookie(position, dec_flags)
# Walk the decoder forward, one byte at a time, to find the minimum
# input necessary to give us the decoded characters we need to skip.
@@ -1373,8 +1373,8 @@ class TextIOWrapper(TextIOBase):
if decoded_chars >= skip_chars:
break
else:
- # We didn't get enough decoded data; signal EOF to get more.
- decoded = decoder.decode(b"", final=True)
+ # We didn't get enough decoded data; send EOF to get more.
+ decoded = decoder.decode(b"", True)
decoded_chars += len(decoded)
need_eof = 1
if decoded_chars < skip_chars:
@@ -1385,7 +1385,7 @@ class TextIOWrapper(TextIOBase):
position += safe_fed_bytes
fed_bytes -= safe_fed_bytes
skip_chars -= safe_decoded_chars
- return self._pack_cookie(
+ return self._encode_tell_cookie(
position, dec_flags, fed_bytes, need_eof, skip_chars)
finally:
decoder.setstate(saved_state)
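The loop above (shown only partially by the hunk) is the heart of tell(): it replays the snapshot bytes through a scratch decoder one byte at a time until the characters already consumed have been reproduced. A simplified, self-contained sketch of that walk, using a hypothetical helper name and a fresh decoder instead of one restored with setstate(), looks like this:

import codecs

def walk_forward(next_input, skip_chars, encoding="utf-8"):
    # Hypothetical stand-in for the byte-at-a-time loop in tell(): count how
    # many bytes must be fed before at least skip_chars characters come out.
    decoder = codecs.getincrementaldecoder(encoding)()
    fed_bytes = decoded_chars = 0
    for i in range(len(next_input)):
        decoded_chars += len(decoder.decode(next_input[i:i+1]))
        fed_bytes += 1
        if decoded_chars >= skip_chars:
            break
    return fed_bytes, decoded_chars

# Three characters of "héllo" span four UTF-8 bytes because of the accent.
assert walk_forward("héllo".encode("utf-8"), 3) == (4, 3)

The real tell() additionally remembers the last point where the character count landed exactly on a boundary (safe_fed_bytes / safe_decoded_chars) and records any remainder in the cookie's skip_chars field, so that seek() can re-skip it character by character.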
@@ -1405,7 +1405,8 @@ class TextIOWrapper(TextIOBase):
raise IOError("can't do nonzero end-relative seeks")
self.flush()
position = self.buffer.seek(0, 2)
- self._clear_decoded_text()
+ self._decoded_text = ""
+ self._snapshot = None
if self._decoder:
self._decoder.reset()
return position
@@ -1418,70 +1419,48 @@ class TextIOWrapper(TextIOBase):
# Seek back to the snapshot point.
position, dec_flags, feed_bytes, need_eof, skip_chars = \
- self._unpack_cookie(cookie)
+ self._decode_tell_cookie(cookie)
self.buffer.seek(position)
- self._clear_decoded_text()
+ self._decoded_text = ""
+ self._snapshot = None
if self._decoder or dec_flags or feed_bytes or need_eof:
# Restore the decoder flags to their values from the snapshot.
self._decoder = self._decoder or self._get_decoder()
self._decoder.setstate((b"", dec_flags))
- self._snapshot = (dec_flags, b'')
if feed_bytes or need_eof:
# Feed feed_bytes bytes to the decoder.
input_chunk = self.buffer.read(feed_bytes)
- self._decoded_text = self._decoder.decode(input_chunk, need_eof)
- if len(self._decoded_text) < skip_chars:
+ decoded = self._decoder.decode(input_chunk, need_eof)
+ if len(decoded) < skip_chars:
raise IOError("can't restore logical file position")
# Skip skip_chars of the decoded characters.
- self._decoded_text_offset = skip_chars
+ self._decoded_text = decoded[skip_chars:]
# Restore the snapshot.
- self._snapshot = (dec_flags, input_chunk)
+ self._snapshot = (dec_flags, input_chunk, len(decoded))
return cookie
- def _clear_decoded_text(self):
- """Reset the _decoded_text buffer."""
- self._decoded_text = ''
- self._decoded_text_offset = 0
- self._snapshot = None
-
- def _emit_decoded_text(self, n=None):
- """Advance into the _decoded_text buffer."""
- offset = self._decoded_text_offset
- if n is None:
- text = self._decoded_text[offset:]
- else:
- text = self._decoded_text[offset:offset + n]
- self._decoded_text_offset += len(text)
- return text
-
- def _unemit_decoded_text(self, n):
- """Rewind the _decoded_text buffer."""
- if self._decoded_text_offset < n:
- raise AssertionError("unemit out of bounds")
- self._decoded_text_offset -= n
-
def read(self, n=None):
if n is None:
n = -1
decoder = self._decoder or self._get_decoder()
+ result = self._decoded_text
if n < 0:
- # Read everything.
- result = (self._emit_decoded_text() +
- decoder.decode(self.buffer.read(), final=True))
- self._clear_decoded_text()
+ result += decoder.decode(self.buffer.read(), True)
+ self._decoded_text = ""
+ self._snapshot = None
return result
else:
- # Keep reading chunks until we have n characters to return.
- eof = False
- result = self._emit_decoded_text(n)
- while len(result) < n and not eof:
- eof = not self._read_chunk()
- result += self._emit_decoded_text(n - len(result))
- return result
+ while len(result) < n:
+ input_chunk, decoded = self._read_chunk()
+ result += decoded
+ if not input_chunk:
+ break
+ self._decoded_text = result[n:]
+ return result[:n]
def __next__(self):
self._telling = False
@@ -1495,20 +1474,21 @@ class TextIOWrapper(TextIOBase):
def readline(self, limit=None):
if limit is None:
limit = -1
+ if limit >= 0:
+ # XXX Hack to support limit argument, for backwards compatibility
+ line = self.readline()
+ if len(line) <= limit:
+ return line
+ line, self._decoded_text = \
+ line[:limit], line[limit:] + self._decoded_text
+ return line
- # Grab all the decoded text (we will rewind any extra bits later).
- line = self._emit_decoded_text()
-
+ line = self._decoded_text
start = 0
decoder = self._decoder or self._get_decoder()
pos = endpos = None
while True:
- if limit >= 0 and len(line) >= limit:
- # Length limit has been reached.
- endpos = limit
- break
-
if self._readtranslate:
# Newlines are already translated, only search for \n
pos = line.find('\n', start)
@@ -1558,18 +1538,20 @@ class TextIOWrapper(TextIOBase):
# No line ending seen yet - get more data
more_line = ''
- while self._read_chunk():
- if self._decoded_text:
+ while True:
+ readahead, pending = self._read_chunk()
+ more_line = pending
+ if more_line or not readahead:
break
- if self._decoded_text:
- line += self._emit_decoded_text()
+ if more_line:
+ line += more_line
else:
# end of file
- self._clear_decoded_text()
+ self._decoded_text = ''
+ self._snapshot = None
return line
- # Rewind _decoded_text to just after the line ending we found.
- self._unemit_decoded_text(len(line) - endpos)
+ self._decoded_text = line[endpos:]
return line[:endpos]
@property
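For reference, the renamed _encode_tell_cookie() / _decode_tell_cookie() pair packs five fields into one arbitrarily large integer in 64-bit slots, with need_eof as a single flag above bit 256. The tail of _decode_tell_cookie() falls outside the hunk shown above, so the round-trip sketch below reconstructs it to match the packing expression and the five-value unpacking done in seek(); treat it as illustrative rather than a verbatim copy.

def encode_tell_cookie(position, dec_flags=0, feed_bytes=0, need_eof=0, skip_chars=0):
    # position: bits 0-63, dec_flags: 64-127, feed_bytes: 128-191,
    # skip_chars: 192-255, need_eof: bit 256.
    return (position | (dec_flags << 64) | (feed_bytes << 128) |
            (skip_chars << 192) | bool(need_eof) << 256)

def decode_tell_cookie(bigint):
    rest, position = divmod(bigint, 1 << 64)
    rest, dec_flags = divmod(rest, 1 << 64)
    rest, feed_bytes = divmod(rest, 1 << 64)
    need_eof, skip_chars = divmod(rest, 1 << 64)
    return position, dec_flags, feed_bytes, need_eof, skip_chars

cookie = encode_tell_cookie(1024, dec_flags=1, feed_bytes=17, need_eof=0, skip_chars=3)
assert decode_tell_cookie(cookie) == (1024, 1, 17, 0, 3)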
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 27814a0..49404e1 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -590,9 +590,7 @@ class StatefulIncrementalDecoderTest(unittest.TestCase):
# I=0, O=3
(b'i.o3.x.xyz.toolong.', False, 'x--.xyz.too.'),
# I=6, O=3
- (b'i.o3.i6.abcdefghijklmnop', True, 'abc.ghi.mno.'),
- # I=5, O=8 with newlines
- (b'i.o8.i5.abc\ndef\nghy\nz', True, 'abc\nd---.ef\ngh---.y\nz-----.')
+ (b'i.o3.i6.abcdefghijklmnop', True, 'abc.ghi.mno.')
]
def testDecoder(self):
@@ -892,8 +890,8 @@ class TextIOWrapperTest(unittest.TestCase):
return codecs.CodecInfo(
name='test_decoder', encode=None, decode=None,
incrementalencoder=None,
- incrementaldecoder=StatefulIncrementalDecoder,
- streamreader=None, streamwriter=None)
+ streamreader=None, streamwriter=None,
+ incrementaldecoder=StatefulIncrementalDecoder)
def testSeekAndTellWithData(data, min_pos=0):
"""Tell/seek to various points within a data stream and ensure
@@ -905,42 +903,16 @@ class TextIOWrapperTest(unittest.TestCase):
decoded = f.read()
f.close()
- # Use read() to move to various positions in the input;
- # then tell, read some more data, and seek back.
- for i in range(min_pos, len(decoded) + 1): # to read before tell
- for j in [1, 5, len(decoded)]: # to read after tell
+ for i in range(min_pos, len(decoded) + 1): # seek positions
+ for j in [1, 5, len(decoded) - i]: # read lengths
f = io.open(test_support.TESTFN, encoding='test_decoder')
self.assertEquals(f.read(i), decoded[:i])
cookie = f.tell()
self.assertEquals(f.read(j), decoded[i:i + j])
f.seek(cookie)
- self.assertEquals(f.tell(), cookie)
self.assertEquals(f.read(), decoded[i:])
f.close()
- lines = len(decoded.split('\n'))
-
- # Use readline() to move to various positions in the input;
- # then tell, read some more data, and seek back.
- for limit in [-1, 4, 128]: # 'limit' argument for readline()
- for j in [1, 5, len(decoded)]: # to read after tell()
- f = io.open(test_support.TESTFN, encoding='test_decoder')
- text = ''
- for k in range(lines): # repeatedly call readline()
- line = f.readline(limit=limit)
- if limit >= 0:
- self.assert_(len(line) <= limit)
- text += line
- i = len(text)
- self.assertEquals(text, decoded[:i])
- cookie = f.tell()
- self.assertEquals(f.read(j), decoded[i:i + j])
- f.seek(cookie)
- self.assertEquals(f.tell(), cookie)
- self.assertEquals(f.read(), decoded[i:])
- f.seek(cookie)
- f.close()
-
# Register a special incremental decoder for testing.
codecs.register(lookupTestDecoder)
self.codecEnabled = 1
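Finally, a short usage sketch of the tell()/seek() round trip that the slimmed-down testSeekAndTellWithData() above still exercises, written against a plain UTF-8 TextIOWrapper rather than the test's custom 'test_decoder' codec (which exists only while the test has it registered):

import io

raw = io.BytesIO("über\nnaïve\nfile\n".encode("utf-8"))
f = io.TextIOWrapper(raw, encoding="utf-8")

head = f.read(5)           # consume a few characters
assert head == "über\n"
cookie = f.tell()          # opaque cookie for this text position
tail = f.read()            # read ahead to the end
f.seek(cookie)             # rewind to the recorded position...
assert f.read() == tail    # ...and the same text comes back
f.close()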