diff options
author | Guido van Rossum <guido@python.org> | 2007-04-11 01:09:03 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2007-04-11 01:09:03 (GMT) |
commit | 9b76da6a8f93c0211a97187f000b693d0cdc6638 (patch) | |
tree | 85f42a0bba607d2c273e4425ca4e46cb1fa8e45b | |
parent | 8742977b33f2af9b92265c1b332af0f6798bf2b6 (diff) | |
download | cpython-9b76da6a8f93c0211a97187f000b693d0cdc6638.zip cpython-9b76da6a8f93c0211a97187f000b693d0cdc6638.tar.gz cpython-9b76da6a8f93c0211a97187f000b693d0cdc6638.tar.bz2 |
Checkpoint so I can continue to work on this at a different box.
There is somewhat working (but slow) code supporting seek/tell for text files,
but extensive testing exposes a bug I can't nail down.
-rw-r--r-- | Lib/io.py | 170 | ||||
-rw-r--r-- | Lib/test/test_io.py | 108 |
2 files changed, 233 insertions, 45 deletions
@@ -13,8 +13,9 @@ variable are part of the specification. XXX need to default buffer size to 1 if isatty() XXX need to support 1 meaning line-buffered -XXX change behavior of blocking I/O XXX don't use assert to validate input requirements +XXX whenever an argument is None, use the default value +XXX read/write ops should check readable/writable """ __author__ = ("Guido van Rossum <guido@python.org>, " @@ -29,9 +30,11 @@ __all__ = ["BlockingIOError", "open", "IOBase", "RawIOBase", "FileIO", import os import sys import codecs +import pickle import _fileio import warnings +# XXX Shouldn't we use st_blksize whenever we can? DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes @@ -44,18 +47,22 @@ class BlockingIOError(IOError): self.characters_written = characters_written -def open(file, mode="r", buffering=None, *, encoding=None): +def open(file, mode="r", buffering=None, *, encoding=None, newline=None): """Replacement for the built-in open function. Args: file: string giving the name of the file to be opened; - or integer file descriptor of the file to be wrapped (*) - mode: optional mode string; see below + or integer file descriptor of the file to be wrapped (*). + mode: optional mode string; see below. buffering: optional int >= 0 giving the buffer size; values can be: 0 = unbuffered, 1 = line buffered, - larger = fully buffered - encoding: optional string giving the text encoding (*must* be given - as a keyword argument) + larger = fully buffered. + Keywords (for text modes only; *must* be given as keyword arguments): + encoding: optional string giving the text encoding. + newline: optional newlines specifier; must be None, '\n' or '\r\n'; + specifies the line ending expected on input and written on + output. If None, use universal newlines on input and + use os.linesep on output. (*) If a file descriptor is given, it is closed when the returned I/O object is closed. If you don't want this to happen, use @@ -79,6 +86,7 @@ def open(file, mode="r", buffering=None, *, encoding=None): binary stream, a buffered binary stream, or a buffered text stream, open for reading and/or writing. """ + # XXX Don't use asserts for these checks; raise TypeError or ValueError assert isinstance(file, (basestring, int)), repr(file) assert isinstance(mode, basestring), repr(mode) assert buffering is None or isinstance(buffering, int), repr(buffering) @@ -101,7 +109,9 @@ def open(file, mode="r", buffering=None, *, encoding=None): if not (reading or writing or appending): raise ValueError("must have exactly one of read/write/append mode") if binary and encoding is not None: - raise ValueError("binary mode doesn't take an encoding") + raise ValueError("binary mode doesn't take an encoding argument") + if binary and newline is not None: + raise ValueError("binary mode doesn't take a newline argument") raw = FileIO(file, (reading and "r" or "") + (writing and "w" or "") + @@ -132,9 +142,7 @@ def open(file, mode="r", buffering=None, *, encoding=None): buffer = BufferedReader(raw, buffering) if binary: return buffer - # XXX What about newline conventions? - textio = TextIOWrapper(buffer, encoding) - return textio + return TextIOWrapper(buffer, encoding, newline) class IOBase: @@ -795,6 +803,8 @@ class TextIOBase(IOBase): """Base class for text I/O. This class provides a character and line based interface to stream I/O. + + There is no readinto() method, as character strings are immutable. """ def read(self, n: int = -1) -> str: @@ -805,10 +815,18 @@ class TextIOBase(IOBase): """ self._unsupported("read") - def write(self, s: str): - """write(s: str) -> None. Write string s to stream.""" + def write(self, s: str) -> int: + """write(s: str) -> int. Write string s to stream.""" self._unsupported("write") + def truncate(self, pos: int = None) -> int: + """truncate(pos: int = None) -> int. Truncate size to pos.""" + self.flush() + if pos is None: + pos = self.tell() + self.seek(pos) + return self.buffer.truncate() + def readline(self) -> str: """readline() -> str. Read until newline or EOF. @@ -816,12 +834,12 @@ class TextIOBase(IOBase): """ self._unsupported("readline") - def __iter__(self): + def __iter__(self) -> "TextIOBase": # That's a forward reference """__iter__() -> Iterator. Return line iterator (actually just self). """ return self - def next(self): + def next(self) -> str: """Same as readline() except raises StopIteration on immediate EOF.""" line = self.readline() if not line: @@ -855,11 +873,11 @@ class TextIOWrapper(TextIOBase): Character and line based layer over a BufferedIOBase object. """ - # XXX tell(), seek() + _CHUNK_SIZE = 64 def __init__(self, buffer, encoding=None, newline=None): - if newline not in (None, '\n', '\r\n'): - raise IOError("illegal newline %s" % newline) # XXX: ValueError? + if newline not in (None, "\n", "\r\n"): + raise ValueError("illegal newline value: %r" % (newline,)) if encoding is None: # XXX This is questionable encoding = sys.getfilesystemencoding() or "latin-1" @@ -869,7 +887,20 @@ class TextIOWrapper(TextIOBase): self._newline = newline or os.linesep self._fix_newlines = newline is None self._decoder = None - self._pending = '' + self._decoder_in_rest_pickle = None + self._pending = "" + self._snapshot = None + self._seekable = self.buffer.seekable() + + # A word about _snapshot. This attribute is either None, or a + # tuple (position, decoder_pickle, readahead) where position is a + # position of the underlying buffer, decoder_pickle is a pickled + # decoder state, and readahead is the chunk of bytes that was read + # from that position. We use this to reconstruct intermediate + # decoder states in tell(). + + def _seekable(self): + return self._seekable def flush(self): self.buffer.flush() @@ -886,35 +917,124 @@ class TextIOWrapper(TextIOBase): return self.buffer.fileno() def write(self, s: str): + # XXX What if we were just reading? b = s.encode(self._encoding) if isinstance(b, str): b = bytes(b) n = self.buffer.write(b) if "\n" in s: self.flush() - return n + self._snapshot = self._decoder = None + return len(s) def _get_decoder(self): make_decoder = codecs.getincrementaldecoder(self._encoding) if make_decoder is None: - raise IOError(".readline() not supported for encoding %s" % + raise IOError("Can't find an incremental decoder for encoding %s" % self._encoding) decoder = self._decoder = make_decoder() # XXX: errors if isinstance(decoder, codecs.BufferedIncrementalDecoder): # XXX Hack: make the codec use bytes instead of strings decoder.buffer = b"" + self._decoder_in_rest_pickle = pickle.dumps(decoder, 2) # For tell() return decoder + def _read_chunk(self): + if not self._seekable: + return self.buffer.read(self._CHUNK_SIZE) + assert self._decoder is not None + position = self.buffer.tell() + decoder_state = pickle.dumps(self._decoder, 2) + readahead = self.buffer.read(self._CHUNK_SIZE) + self._snapshot = (position, decoder_state, readahead) + return readahead + + def _encode_decoder_state(self, ds, pos): + if ds == self._decoder_in_rest_pickle: + return pos + x = 0 + for i in bytes(ds): + x = x<<8 | i + return (x<<64) | pos + + def _decode_decoder_state(self, pos): + x, pos = divmod(pos, 1<<64) + if not x: + return None, pos + b = b"" + while x: + b.append(x&0xff) + x >>= 8 + return str(b[::-1]), pos + + def tell(self): + if not self._seekable: + raise IOError("Underlying stream is not seekable") + self.flush() + if self._decoder is None or self._snapshot is None: + assert self._pending == "" + return self.buffer.tell() + position, decoder_state, readahead = self._snapshot + decoder = pickle.loads(decoder_state) + characters = "" + sequence = [] + for i, b in enumerate(readahead): + c = decoder.decode(bytes([b])) + if c: + characters += c + sequence.append((characters, i+1, pickle.dumps(decoder, 2))) + for ch, i, st in sequence: + if ch + self._pending == characters: + return self._encode_decoder_state(st, position + i) + raise IOError("Can't reconstruct logical file position") + + def seek(self, pos, whence=0): + if not self._seekable: + raise IOError("Underlying stream is not seekable") + if whence == 1: + if pos != 0: + raise IOError("Can't do nonzero cur-relative seeks") + return self.tell() + if whence == 2: + if pos != 0: + raise IOError("Can't do nonzero end-relative seeks") + self.flush() + pos = self.buffer.seek(0, 2) + self._snapshot = None + self._pending = "" + self._decoder = None + return pos + if whence != 0: + raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" % + (whence,)) + if pos < 0: + raise ValueError("Negative seek position %r" % (pos,)) + orig_pos = pos + ds, pos = self._decode_decoder_state(pos) + if not ds: + self.buffer.seek(pos) + self._snapshot = None + self._pending = "" + self._decoder = None + return pos + decoder = pickle.loads(ds) + self.buffer.seek(pos) + self._snapshot = (pos, ds, "") + self._pending = "" + self._decoder = None + return orig_pos + def read(self, n: int = -1): decoder = self._decoder or self._get_decoder() res = self._pending if n < 0: res += decoder.decode(self.buffer.read(), True) self._pending = "" + self._snapshot = None return res else: while len(res) < n: - data = self.buffer.read(64) + data = self._read_chunk() res += decoder.decode(data, not data) if not data: break @@ -923,7 +1043,7 @@ class TextIOWrapper(TextIOBase): def readline(self, limit=None): if limit is not None: - # XXX Hack to support limit arg + # XXX Hack to support limit argument, for backwards compatibility line = self.readline() if len(line) <= limit: return line @@ -951,7 +1071,7 @@ class TextIOWrapper(TextIOBase): # We've seen \r - is it standalone, \r\n or \r at end of line? if endpos + 1 < len(line): - if line[endpos+1] == '\n': + if line[endpos+1] == "\n": ending = "\r\n" else: ending = "\r" @@ -963,7 +1083,7 @@ class TextIOWrapper(TextIOBase): # No line ending seen yet - get more data while True: - data = self.buffer.read(64) + data = self._read_chunk() more_line = decoder.decode(data, not data) if more_line or not data: break diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 27fd56f..d19b2a0 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -93,6 +93,32 @@ class IOTest(unittest.TestCase): self.assertEqual(f.truncate(12), 12) self.assertEqual(f.tell(), 12) + def read_ops(self, f, buffered=False): + data = f.read(5) + self.assertEqual(data, b"hello") + self.assertEqual(f.readinto(data), 5) + self.assertEqual(data, b" worl") + self.assertEqual(f.readinto(data), 2) + self.assertEqual(len(data), 5) + self.assertEqual(data[:2], b"d\n") + self.assertEqual(f.seek(0), 0) + self.assertEqual(f.read(20), b"hello world\n") + self.assertEqual(f.read(1), b"") + self.assertEqual(f.readinto(b"x"), 0) + self.assertEqual(f.seek(-6, 2), 6) + self.assertEqual(f.read(5), b"world") + self.assertEqual(f.read(0), b"") + self.assertEqual(f.readinto(b""), 0) + self.assertEqual(f.seek(-6, 1), 5) + self.assertEqual(f.read(5), b" worl") + self.assertEqual(f.tell(), 10) + if buffered: + f.seek(0) + self.assertEqual(f.read(), b"hello world\n") + f.seek(6) + self.assertEqual(f.read(), b"world\n") + self.assertEqual(f.read(), b"") + LARGE = 2**31 def large_file_ops(self, f): @@ -112,24 +138,6 @@ class IOTest(unittest.TestCase): self.assertEqual(f.seek(-1, 2), self.LARGE) self.assertEqual(f.read(2), b"x") - def read_ops(self, f): - data = f.read(5) - self.assertEqual(data, b"hello") - n = f.readinto(data) - self.assertEqual(n, 5) - self.assertEqual(data, b" worl") - n = f.readinto(data) - self.assertEqual(n, 2) - self.assertEqual(len(data), 5) - self.assertEqual(data[:2], b"d\n") - f.seek(0) - self.assertEqual(f.read(20), b"hello world\n") - f.seek(-6, 2) - self.assertEqual(f.read(5), b"world") - f.seek(-6, 1) - self.assertEqual(f.read(5), b" worl") - self.assertEqual(f.tell(), 10) - def test_raw_file_io(self): f = io.open(test_support.TESTFN, "wb", buffering=0) self.assertEqual(f.readable(), False) @@ -155,7 +163,7 @@ class IOTest(unittest.TestCase): self.assertEqual(f.readable(), True) self.assertEqual(f.writable(), False) self.assertEqual(f.seekable(), True) - self.read_ops(f) + self.read_ops(f, True) f.close() def test_raw_bytes_io(self): @@ -164,7 +172,7 @@ class IOTest(unittest.TestCase): data = f.getvalue() self.assertEqual(data, b"hello world\n") f = io.BytesIO(data) - self.read_ops(f) + self.read_ops(f, True) def test_large_file_ops(self): # On Windows and Mac OSX this test comsumes large resources; It takes @@ -445,6 +453,10 @@ class BufferedRandomTest(unittest.TestCase): class TextIOWrapperTest(unittest.TestCase): + +## def tearDown(self): +## test_support.unlink(test_support.TESTFN) + def testNewlines(self): input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ] @@ -486,6 +498,62 @@ class TextIOWrapperTest(unittest.TestCase): self.assertEquals(got_line, exp_line) self.assertEquals(len(got_lines), len(exp_lines)) + # Systematic tests of the text I/O API + + def testBasicIO(self): + for chunksize in (1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65): + for enc in "ascii", "latin1", "utf8" :# , "utf-16-be", "utf-16-le": + f = io.open(test_support.TESTFN, "w+", encoding=enc) + f._CHUNK_SIZE = chunksize + self.assertEquals(f.write("abc"), 3) + f.close() + f = io.open(test_support.TESTFN, "r+", encoding=enc) + f._CHUNK_SIZE = chunksize + self.assertEquals(f.tell(), 0) + self.assertEquals(f.read(), "abc") + cookie = f.tell() + self.assertEquals(f.seek(0), 0) + self.assertEquals(f.read(2), "ab") + self.assertEquals(f.read(1), "c") + self.assertEquals(f.read(1), "") + self.assertEquals(f.read(), "") + self.assertEquals(f.tell(), cookie) + self.assertEquals(f.seek(0), 0) + self.assertEquals(f.seek(0, 2), cookie) + self.assertEquals(f.write("def"), 3) + self.assertEquals(f.seek(cookie), cookie) + self.assertEquals(f.read(), "def") + if enc.startswith("utf"): + self.multi_line_test(f, enc) + f.close() + + def multi_line_test(self, f, enc): + f.seek(0) + f.truncate() + sample = u"s\xff\u0fff\uffff" + wlines = [] + for size in (0, 1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65, + 100, 200, 300, 400, 500, 1000): + chars = [] + for i in xrange(size): + chars.append(sample[i % len(sample)]) + line = u"".join(chars) + "\n" + wlines.append((f.tell(), line)) + f.write(line) + wendpos = f.tell() + f.seek(0) + rlines = [] + while True: + pos = f.tell() + line = f.readline() + if not line: + rendpos = pos + break + rlines.append((pos, line)) + self.assertEquals(rendpos, wendpos) + self.assertEquals(rlines, wlines) + + # XXX Tests for open() def test_main(): |