diff options
| author | Ka-Ping Yee <ping@zesty.ca> | 2008-03-20 10:37:32 (GMT) | 
|---|---|---|
| committer | Ka-Ping Yee <ping@zesty.ca> | 2008-03-20 10:37:32 (GMT) | 
| commit | 593cd6b843ea0ccbd3466bbd7e888e65835ec2aa (patch) | |
| tree | c8d957e29c246da1a7043757be72ece7f39dd5ad /Lib/io.py | |
| parent | dbe28e573e5c1227777db9dd630fac7732b81374 (diff) | |
| download | cpython-593cd6b843ea0ccbd3466bbd7e888e65835ec2aa.zip cpython-593cd6b843ea0ccbd3466bbd7e888e65835ec2aa.tar.gz cpython-593cd6b843ea0ccbd3466bbd7e888e65835ec2aa.tar.bz2  | |
Clean up the TextIOWrapper code; pick better names; improve documentation.
Diffstat (limited to 'Lib/io.py')
| -rw-r--r-- | Lib/io.py | 244 | 
1 files changed, 121 insertions, 123 deletions
@@ -1179,20 +1179,19 @@ class TextIOWrapper(TextIOBase):          self._writenl = newline or os.linesep          self._encoder = None          self._decoder = None -        self._decoded_text = ""  # buffer for text produced by decoder -        self._decoded_text_offset = 0  # offset to text returned by read() +        self._decoded_chars = ''  # buffer for text returned from decoder +        self._decoded_chars_used = 0  # offset into _decoded_chars for read()          self._snapshot = None  # info for reconstructing decoder state          self._seekable = self._telling = self.buffer.seekable() -    # A word about _snapshot.  This attribute is either None, or a tuple -    # (decoder_state, next_input) where decoder_state is the second -    # (integer) item of the decoder state, and next_input is the chunk -    # of bytes that comes after the snapshot point in the input. -    # We use this to reconstruct intermediate decoder states in tell(). +    # self._snapshot is either None, or a tuple (dec_flags, next_input) +    # where dec_flags is the second (integer) item of the decoder state +    # and next_input is the chunk of input bytes that comes next after the +    # snapshot point.  We use this to reconstruct decoder states in tell().      # Naming convention: -    #   - integer variables ending in "_bytes" count input bytes -    #   - integer variables ending in "_chars" count decoded characters +    #   - "bytes_..." for integer variables that count input bytes +    #   - "chars_..." for integer variables that count decoded characters      def __repr__(self):          return '<TIOW %x>' % id(self) @@ -1267,62 +1266,79 @@ class TextIOWrapper(TextIOBase):          self._decoder = decoder          return decoder +    # The following three methods implement an ADT for _decoded_chars. +    # Text returned from the decoder is buffered here until the client +    # requests it by calling our read() or readline() method. +    def _set_decoded_chars(self, chars): +        """Set the _decoded_chars buffer.""" +        self._decoded_chars = chars +        self._decoded_chars_used = 0 + +    def _get_decoded_chars(self, n=None): +        """Advance into the _decoded_chars buffer.""" +        offset = self._decoded_chars_used +        if n is None: +            chars = self._decoded_chars[offset:] +        else: +            chars = self._decoded_chars[offset:offset + n] +        self._decoded_chars_used += len(chars) +        return chars + +    def _rewind_decoded_chars(self, n): +        """Rewind the _decoded_chars buffer.""" +        if self._decoded_chars_used < n: +            raise AssertionError("rewind decoded_chars out of bounds") +        self._decoded_chars_used -= n +      def _read_chunk(self):          """          Read and decode the next chunk of data from the BufferedReader.          The return value is True unless EOF was reached.  The decoded string -        is placed in self._decoded_text (replacing its previous value). -        (The entire input chunk is sent to the decoder, though some of it -        may remain buffered in the decoder, yet to be converted.) +        is placed in self._decoded_chars (replacing its previous value). +        The entire input chunk is sent to the decoder, though some of it +        may remain buffered in the decoder, yet to be converted.          """          if self._decoder is None:              raise ValueError("no decoder") -        if not self._telling: -            # No one should call tell(), so don't bother taking a snapshot. -            input_chunk = self.buffer.read1(self._CHUNK_SIZE) -            eof = not input_chunk -            self._decoded_text = self._decoder.decode(input_chunk, eof) -            self._decoded_text_offset = 0 -            return not eof - -        # The cookie returned by tell() cannot include the contents of -        # the decoder's buffer, so we need to snapshot a point in the -        # input where the decoder has nothing in its input buffer. - -        dec_buffer, dec_flags = self._decoder.getstate() -        # The state tuple returned by getstate() contains the decoder's -        # input buffer and an integer representing any other state.  Thus, -        # there is a valid snapshot point len(decoder_buffer) bytes ago in -        # the input, with the state tuple (b'', decoder_state). +        if self._telling: +            # To prepare for tell(), we need to snapshot a point in the +            # file where the decoder's input buffer is empty. + +            dec_buffer, dec_flags = self._decoder.getstate() +            # Given this, we know there was a valid snapshot point +            # len(dec_buffer) bytes ago with decoder state (b'', dec_flags). + +        # Read a chunk, decode it, and put the result in self._decoded_chars.          input_chunk = self.buffer.read1(self._CHUNK_SIZE)          eof = not input_chunk -        self._decoded_text = self._decoder.decode(input_chunk, eof) -        self._decoded_text_offset = 0 +        self._set_decoded_chars(self._decoder.decode(input_chunk, eof)) + +        if self._telling: +            # At the snapshot point, len(dec_buffer) bytes before the read, +            # the next input to be decoded is dec_buffer + input_chunk. +            self._snapshot = (dec_flags, dec_buffer + input_chunk) -        # At the snapshot point, len(dec_buffer) bytes ago, the next input -        # to be passed to the decoder is dec_buffer + input_chunk. -        self._snapshot = (dec_flags, dec_buffer + input_chunk)          return not eof      def _pack_cookie(self, position, dec_flags=0, -                            feed_bytes=0, need_eof=0, skip_chars=0): +                           bytes_to_feed=0, need_eof=0, chars_to_skip=0):          # The meaning of a tell() cookie is: seek to position, set the -        # decoder flags to dec_flags, read feed_bytes bytes, feed them +        # decoder flags to dec_flags, read bytes_to_feed bytes, feed them          # into the decoder with need_eof as the EOF flag, then skip -        # skip_chars characters of the decoded result.  For most simple -        # decoders, this should often just be the position. -        return (position | (dec_flags<<64) | (feed_bytes<<128) | -                (skip_chars<<192) | bool(need_eof)<<256) +        # chars_to_skip characters of the decoded result.  For most simple +        # decoders, tell() will often just give a byte offset in the file. +        return (position | (dec_flags<<64) | (bytes_to_feed<<128) | +               (chars_to_skip<<192) | bool(need_eof)<<256)      def _unpack_cookie(self, bigint):          rest, position = divmod(bigint, 1<<64)          rest, dec_flags = divmod(rest, 1<<64) -        rest, feed_bytes = divmod(rest, 1<<64) -        need_eof, skip_chars = divmod(rest, 1<<64) -        return position, dec_flags, feed_bytes, need_eof, skip_chars +        rest, bytes_to_feed = divmod(rest, 1<<64) +        need_eof, chars_to_skip = divmod(rest, 1<<64) +        return position, dec_flags, bytes_to_feed, need_eof, chars_to_skip      def tell(self):          if not self._seekable: @@ -1333,7 +1349,7 @@ class TextIOWrapper(TextIOBase):          position = self.buffer.tell()          decoder = self._decoder          if decoder is None or self._snapshot is None: -            if self._decoded_text: +            if self._decoded_chars:                  # This should never happen.                  raise AssertionError("pending decoded text")              return position @@ -1342,51 +1358,48 @@ class TextIOWrapper(TextIOBase):          dec_flags, next_input = self._snapshot          position -= len(next_input) -        # How many decoded characters have been returned since the snapshot? -        skip_chars = self._decoded_text_offset -        if skip_chars == 0: +        # How many decoded characters have been used up since the snapshot? +        chars_to_skip = self._decoded_chars_used +        if chars_to_skip == 0:              # We haven't moved from the snapshot point.              return self._pack_cookie(position, dec_flags) -        # Walk the decoder forward, one byte at a time, to find the minimum -        # input necessary to give us the decoded characters we need to skip. -        # As we go, look for the "safe point" nearest to the current location -        # (i.e. a point where the decoder has nothing buffered, so we can -        # safely start from there when trying to return to this location). +        # Starting from the snapshot position, we will walk the decoder +        # forward until it gives us enough decoded characters.          saved_state = decoder.getstate()          try: -            decoder.setstate((b"", dec_flags)) -            fed_bytes = 0 -            decoded_chars = 0 +            # Note our initial start point. +            decoder.setstate((b'', dec_flags)) +            start_pos = position +            start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0              need_eof = 0 -            last_safe_point = (dec_flags, 0, 0) +            # Feed the decoder one byte at a time.  As we go, note the +            # nearest "safe start point" before the current location +            # (a point where the decoder has nothing buffered, so seek() +            # can safely start from there and advance to this location).              next_byte = bytearray(1)              for next_byte[0] in next_input: -                decoded = decoder.decode(next_byte) -                fed_bytes += 1 -                decoded_chars += len(decoded) +                bytes_fed += 1 +                chars_decoded += len(decoder.decode(next_byte))                  dec_buffer, dec_flags = decoder.getstate() -                if not dec_buffer and decoded_chars <= skip_chars: -                    # Decoder buffer is empty, so it's safe to start from here. -                    last_safe_point = (dec_flags, fed_bytes, decoded_chars) -                if decoded_chars >= skip_chars: +                if not dec_buffer and chars_decoded <= chars_to_skip: +                    # Decoder buffer is empty, so this is a safe start point. +                    start_pos += bytes_fed +                    chars_to_skip -= chars_decoded +                    start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0 +                if chars_decoded >= chars_to_skip:                      break              else:                  # We didn't get enough decoded data; signal EOF to get more. -                decoded = decoder.decode(b"", final=True) -                decoded_chars += len(decoded) +                chars_decoded += len(decoder.decode(b'', final=True))                  need_eof = 1 -                if decoded_chars < skip_chars: +                if chars_decoded < chars_to_skip:                      raise IOError("can't reconstruct logical file position") -            # Advance the starting position to the last safe point. -            dec_flags, safe_fed_bytes, safe_decoded_chars = last_safe_point -            position += safe_fed_bytes -            fed_bytes -= safe_fed_bytes -            skip_chars -= safe_decoded_chars +            # The returned cookie corresponds to the last safe start point.              return self._pack_cookie( -                position, dec_flags, fed_bytes, need_eof, skip_chars) +                start_pos, start_flags, bytes_fed, need_eof, chars_to_skip)          finally:              decoder.setstate(saved_state) @@ -1405,7 +1418,8 @@ class TextIOWrapper(TextIOBase):                  raise IOError("can't do nonzero end-relative seeks")              self.flush()              position = self.buffer.seek(0, 2) -            self._clear_decoded_text() +            self._set_decoded_chars('') +            self._snapshot = None              if self._decoder:                  self._decoder.reset()              return position @@ -1416,53 +1430,35 @@ class TextIOWrapper(TextIOBase):              raise ValueError("negative seek position %r" % (cookie,))          self.flush() -        # Seek back to the snapshot point. -        position, dec_flags, feed_bytes, need_eof, skip_chars = \ +        # The strategy of seek() is to go back to the safe start point +        # and replay the effect of read(chars_to_skip) from there. +        start_pos, dec_flags, bytes_to_feed, need_eof, chars_to_skip = \              self._unpack_cookie(cookie) -        self.buffer.seek(position) -        self._clear_decoded_text() -        if self._decoder or dec_flags or feed_bytes or need_eof: -            # Restore the decoder flags to their values from the snapshot. +        # Seek back to the safe start point. +        self.buffer.seek(start_pos) +        self._set_decoded_chars('') +        self._snapshot = None + +        # Restore the decoder to its state from the safe start point. +        if self._decoder or dec_flags or chars_to_skip:              self._decoder = self._decoder or self._get_decoder() -            self._decoder.setstate((b"", dec_flags)) +            self._decoder.setstate((b'', dec_flags))              self._snapshot = (dec_flags, b'') -        if feed_bytes or need_eof: -            # Feed feed_bytes bytes to the decoder. -            input_chunk = self.buffer.read(feed_bytes) -            self._decoded_text = self._decoder.decode(input_chunk, need_eof) -            if len(self._decoded_text) < skip_chars: -                raise IOError("can't restore logical file position") - -            # Skip skip_chars of the decoded characters. -            self._decoded_text_offset = skip_chars - -            # Restore the snapshot. +        if chars_to_skip: +            # Just like _read_chunk, feed the decoder and save a snapshot. +            input_chunk = self.buffer.read(bytes_to_feed) +            self._set_decoded_chars( +                self._decoder.decode(input_chunk, need_eof))              self._snapshot = (dec_flags, input_chunk) -        return cookie - -    def _clear_decoded_text(self): -        """Reset the _decoded_text buffer.""" -        self._decoded_text = '' -        self._decoded_text_offset = 0 -        self._snapshot = None -    def _emit_decoded_text(self, n=None): -        """Advance into the _decoded_text buffer.""" -        offset = self._decoded_text_offset -        if n is None: -            text = self._decoded_text[offset:] -        else: -            text = self._decoded_text[offset:offset + n] -        self._decoded_text_offset += len(text) -        return text +            # Skip chars_to_skip of the decoded characters. +            if len(self._decoded_chars) < chars_to_skip: +                raise IOError("can't restore logical file position") +            self._decoded_chars_used = chars_to_skip -    def _unemit_decoded_text(self, n): -        """Rewind the _decoded_text buffer.""" -        if self._decoded_text_offset < n: -            raise AssertionError("unemit out of bounds") -        self._decoded_text_offset -= n +        return cookie      def read(self, n=None):          if n is None: @@ -1470,17 +1466,18 @@ class TextIOWrapper(TextIOBase):          decoder = self._decoder or self._get_decoder()          if n < 0:              # Read everything. -            result = (self._emit_decoded_text() + +            result = (self._get_decoded_chars() +                        decoder.decode(self.buffer.read(), final=True)) -            self._clear_decoded_text() +            self._set_decoded_chars('') +            self._snapshot = None              return result          else:              # Keep reading chunks until we have n characters to return.              eof = False -            result = self._emit_decoded_text(n) +            result = self._get_decoded_chars(n)              while len(result) < n and not eof:                  eof = not self._read_chunk() -                result += self._emit_decoded_text(n - len(result)) +                result += self._get_decoded_chars(n - len(result))              return result      def __next__(self): @@ -1497,7 +1494,7 @@ class TextIOWrapper(TextIOBase):              limit = -1          # Grab all the decoded text (we will rewind any extra bits later). -        line = self._emit_decoded_text() +        line = self._get_decoded_chars()          start = 0          decoder = self._decoder or self._get_decoder() @@ -1558,20 +1555,21 @@ class TextIOWrapper(TextIOBase):              # No line ending seen yet - get more data              more_line = ''              while self._read_chunk(): -                if self._decoded_text: +                if self._decoded_chars:                      break -            if self._decoded_text: -                line += self._emit_decoded_text() +            if self._decoded_chars: +                line += self._get_decoded_chars()              else:                  # end of file -                self._clear_decoded_text() +                self._set_decoded_chars('') +                self._snapshot = None                  return line          if limit >= 0 and endpos > limit:              endpos = limit  # don't exceed limit -        # Rewind _decoded_text to just after the line ending we found. -        self._unemit_decoded_text(len(line) - endpos) +        # Rewind _decoded_chars to just after the line ending we found. +        self._rewind_decoded_chars(len(line) - endpos)          return line[:endpos]      @property  | 
