summaryrefslogtreecommitdiffstats
path: root/Lib
diff options
context:
space:
mode:
authorGuido van Rossum <guido@python.org>2007-04-11 01:09:03 (GMT)
committerGuido van Rossum <guido@python.org>2007-04-11 01:09:03 (GMT)
commit9b76da6a8f93c0211a97187f000b693d0cdc6638 (patch)
tree85f42a0bba607d2c273e4425ca4e46cb1fa8e45b /Lib
parent8742977b33f2af9b92265c1b332af0f6798bf2b6 (diff)
downloadcpython-9b76da6a8f93c0211a97187f000b693d0cdc6638.zip
cpython-9b76da6a8f93c0211a97187f000b693d0cdc6638.tar.gz
cpython-9b76da6a8f93c0211a97187f000b693d0cdc6638.tar.bz2
Checkpoint so I can continue to work on this at a different box.
There is somewhat working (but slow) code supporting seek/tell for text files, but extensive testing exposes a bug I can't nail down.
Diffstat (limited to 'Lib')
-rw-r--r--Lib/io.py170
-rw-r--r--Lib/test/test_io.py108
2 files changed, 233 insertions, 45 deletions
diff --git a/Lib/io.py b/Lib/io.py
index 9c6738a..b2860f4 100644
--- a/Lib/io.py
+++ b/Lib/io.py
@@ -13,8 +13,9 @@ variable are part of the specification.
XXX need to default buffer size to 1 if isatty()
XXX need to support 1 meaning line-buffered
-XXX change behavior of blocking I/O
XXX don't use assert to validate input requirements
+XXX whenever an argument is None, use the default value
+XXX read/write ops should check readable/writable
"""
__author__ = ("Guido van Rossum <guido@python.org>, "
@@ -29,9 +30,11 @@ __all__ = ["BlockingIOError", "open", "IOBase", "RawIOBase", "FileIO",
import os
import sys
import codecs
+import pickle
import _fileio
import warnings
+# XXX Shouldn't we use st_blksize whenever we can?
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
@@ -44,18 +47,22 @@ class BlockingIOError(IOError):
self.characters_written = characters_written
-def open(file, mode="r", buffering=None, *, encoding=None):
+def open(file, mode="r", buffering=None, *, encoding=None, newline=None):
"""Replacement for the built-in open function.
Args:
file: string giving the name of the file to be opened;
- or integer file descriptor of the file to be wrapped (*)
- mode: optional mode string; see below
+ or integer file descriptor of the file to be wrapped (*).
+ mode: optional mode string; see below.
buffering: optional int >= 0 giving the buffer size; values
can be: 0 = unbuffered, 1 = line buffered,
- larger = fully buffered
- encoding: optional string giving the text encoding (*must* be given
- as a keyword argument)
+ larger = fully buffered.
+ Keywords (for text modes only; *must* be given as keyword arguments):
+ encoding: optional string giving the text encoding.
+ newline: optional newlines specifier; must be None, '\n' or '\r\n';
+ specifies the line ending expected on input and written on
+ output. If None, use universal newlines on input and
+ use os.linesep on output.
(*) If a file descriptor is given, it is closed when the returned
I/O object is closed. If you don't want this to happen, use
@@ -79,6 +86,7 @@ def open(file, mode="r", buffering=None, *, encoding=None):
binary stream, a buffered binary stream, or a buffered text
stream, open for reading and/or writing.
"""
+ # XXX Don't use asserts for these checks; raise TypeError or ValueError
assert isinstance(file, (basestring, int)), repr(file)
assert isinstance(mode, basestring), repr(mode)
assert buffering is None or isinstance(buffering, int), repr(buffering)
@@ -101,7 +109,9 @@ def open(file, mode="r", buffering=None, *, encoding=None):
if not (reading or writing or appending):
raise ValueError("must have exactly one of read/write/append mode")
if binary and encoding is not None:
- raise ValueError("binary mode doesn't take an encoding")
+ raise ValueError("binary mode doesn't take an encoding argument")
+ if binary and newline is not None:
+ raise ValueError("binary mode doesn't take a newline argument")
raw = FileIO(file,
(reading and "r" or "") +
(writing and "w" or "") +
@@ -132,9 +142,7 @@ def open(file, mode="r", buffering=None, *, encoding=None):
buffer = BufferedReader(raw, buffering)
if binary:
return buffer
- # XXX What about newline conventions?
- textio = TextIOWrapper(buffer, encoding)
- return textio
+ return TextIOWrapper(buffer, encoding, newline)
class IOBase:
@@ -795,6 +803,8 @@ class TextIOBase(IOBase):
"""Base class for text I/O.
This class provides a character and line based interface to stream I/O.
+
+ There is no readinto() method, as character strings are immutable.
"""
def read(self, n: int = -1) -> str:
@@ -805,10 +815,18 @@ class TextIOBase(IOBase):
"""
self._unsupported("read")
- def write(self, s: str):
- """write(s: str) -> None. Write string s to stream."""
+ def write(self, s: str) -> int:
+ """write(s: str) -> int. Write string s to stream."""
self._unsupported("write")
+ def truncate(self, pos: int = None) -> int:
+ """truncate(pos: int = None) -> int. Truncate size to pos."""
+ self.flush()
+ if pos is None:
+ pos = self.tell()
+ self.seek(pos)
+ return self.buffer.truncate()
+
def readline(self) -> str:
"""readline() -> str. Read until newline or EOF.
@@ -816,12 +834,12 @@ class TextIOBase(IOBase):
"""
self._unsupported("readline")
- def __iter__(self):
+ def __iter__(self) -> "TextIOBase": # That's a forward reference
"""__iter__() -> Iterator. Return line iterator (actually just self).
"""
return self
- def next(self):
+ def next(self) -> str:
"""Same as readline() except raises StopIteration on immediate EOF."""
line = self.readline()
if not line:
@@ -855,11 +873,11 @@ class TextIOWrapper(TextIOBase):
Character and line based layer over a BufferedIOBase object.
"""
- # XXX tell(), seek()
+ _CHUNK_SIZE = 64
def __init__(self, buffer, encoding=None, newline=None):
- if newline not in (None, '\n', '\r\n'):
- raise IOError("illegal newline %s" % newline) # XXX: ValueError?
+ if newline not in (None, "\n", "\r\n"):
+ raise ValueError("illegal newline value: %r" % (newline,))
if encoding is None:
# XXX This is questionable
encoding = sys.getfilesystemencoding() or "latin-1"
@@ -869,7 +887,20 @@ class TextIOWrapper(TextIOBase):
self._newline = newline or os.linesep
self._fix_newlines = newline is None
self._decoder = None
- self._pending = ''
+ self._decoder_in_rest_pickle = None
+ self._pending = ""
+ self._snapshot = None
+ self._seekable = self.buffer.seekable()
+
+ # A word about _snapshot. This attribute is either None, or a
+ # tuple (position, decoder_pickle, readahead) where position is a
+ # position of the underlying buffer, decoder_pickle is a pickled
+ # decoder state, and readahead is the chunk of bytes that was read
+ # from that position. We use this to reconstruct intermediate
+ # decoder states in tell().
+
+ def _seekable(self):
+ return self._seekable
def flush(self):
self.buffer.flush()
@@ -886,35 +917,124 @@ class TextIOWrapper(TextIOBase):
return self.buffer.fileno()
def write(self, s: str):
+ # XXX What if we were just reading?
b = s.encode(self._encoding)
if isinstance(b, str):
b = bytes(b)
n = self.buffer.write(b)
if "\n" in s:
self.flush()
- return n
+ self._snapshot = self._decoder = None
+ return len(s)
def _get_decoder(self):
make_decoder = codecs.getincrementaldecoder(self._encoding)
if make_decoder is None:
- raise IOError(".readline() not supported for encoding %s" %
+ raise IOError("Can't find an incremental decoder for encoding %s" %
self._encoding)
decoder = self._decoder = make_decoder() # XXX: errors
if isinstance(decoder, codecs.BufferedIncrementalDecoder):
# XXX Hack: make the codec use bytes instead of strings
decoder.buffer = b""
+ self._decoder_in_rest_pickle = pickle.dumps(decoder, 2) # For tell()
return decoder
+ def _read_chunk(self):
+ if not self._seekable:
+ return self.buffer.read(self._CHUNK_SIZE)
+ assert self._decoder is not None
+ position = self.buffer.tell()
+ decoder_state = pickle.dumps(self._decoder, 2)
+ readahead = self.buffer.read(self._CHUNK_SIZE)
+ self._snapshot = (position, decoder_state, readahead)
+ return readahead
+
+ def _encode_decoder_state(self, ds, pos):
+ if ds == self._decoder_in_rest_pickle:
+ return pos
+ x = 0
+ for i in bytes(ds):
+ x = x<<8 | i
+ return (x<<64) | pos
+
+ def _decode_decoder_state(self, pos):
+ x, pos = divmod(pos, 1<<64)
+ if not x:
+ return None, pos
+ b = b""
+ while x:
+ b.append(x&0xff)
+ x >>= 8
+ return str(b[::-1]), pos
+
+ def tell(self):
+ if not self._seekable:
+ raise IOError("Underlying stream is not seekable")
+ self.flush()
+ if self._decoder is None or self._snapshot is None:
+ assert self._pending == ""
+ return self.buffer.tell()
+ position, decoder_state, readahead = self._snapshot
+ decoder = pickle.loads(decoder_state)
+ characters = ""
+ sequence = []
+ for i, b in enumerate(readahead):
+ c = decoder.decode(bytes([b]))
+ if c:
+ characters += c
+ sequence.append((characters, i+1, pickle.dumps(decoder, 2)))
+ for ch, i, st in sequence:
+ if ch + self._pending == characters:
+ return self._encode_decoder_state(st, position + i)
+ raise IOError("Can't reconstruct logical file position")
+
+ def seek(self, pos, whence=0):
+ if not self._seekable:
+ raise IOError("Underlying stream is not seekable")
+ if whence == 1:
+ if pos != 0:
+ raise IOError("Can't do nonzero cur-relative seeks")
+ return self.tell()
+ if whence == 2:
+ if pos != 0:
+ raise IOError("Can't do nonzero end-relative seeks")
+ self.flush()
+ pos = self.buffer.seek(0, 2)
+ self._snapshot = None
+ self._pending = ""
+ self._decoder = None
+ return pos
+ if whence != 0:
+ raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
+ (whence,))
+ if pos < 0:
+ raise ValueError("Negative seek position %r" % (pos,))
+ orig_pos = pos
+ ds, pos = self._decode_decoder_state(pos)
+ if not ds:
+ self.buffer.seek(pos)
+ self._snapshot = None
+ self._pending = ""
+ self._decoder = None
+ return pos
+ decoder = pickle.loads(ds)
+ self.buffer.seek(pos)
+ self._snapshot = (pos, ds, "")
+ self._pending = ""
+ self._decoder = None
+ return orig_pos
+
def read(self, n: int = -1):
decoder = self._decoder or self._get_decoder()
res = self._pending
if n < 0:
res += decoder.decode(self.buffer.read(), True)
self._pending = ""
+ self._snapshot = None
return res
else:
while len(res) < n:
- data = self.buffer.read(64)
+ data = self._read_chunk()
res += decoder.decode(data, not data)
if not data:
break
@@ -923,7 +1043,7 @@ class TextIOWrapper(TextIOBase):
def readline(self, limit=None):
if limit is not None:
- # XXX Hack to support limit arg
+ # XXX Hack to support limit argument, for backwards compatibility
line = self.readline()
if len(line) <= limit:
return line
@@ -951,7 +1071,7 @@ class TextIOWrapper(TextIOBase):
# We've seen \r - is it standalone, \r\n or \r at end of line?
if endpos + 1 < len(line):
- if line[endpos+1] == '\n':
+ if line[endpos+1] == "\n":
ending = "\r\n"
else:
ending = "\r"
@@ -963,7 +1083,7 @@ class TextIOWrapper(TextIOBase):
# No line ending seen yet - get more data
while True:
- data = self.buffer.read(64)
+ data = self._read_chunk()
more_line = decoder.decode(data, not data)
if more_line or not data:
break
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 27fd56f..d19b2a0 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -93,6 +93,32 @@ class IOTest(unittest.TestCase):
self.assertEqual(f.truncate(12), 12)
self.assertEqual(f.tell(), 12)
+ def read_ops(self, f, buffered=False):
+ data = f.read(5)
+ self.assertEqual(data, b"hello")
+ self.assertEqual(f.readinto(data), 5)
+ self.assertEqual(data, b" worl")
+ self.assertEqual(f.readinto(data), 2)
+ self.assertEqual(len(data), 5)
+ self.assertEqual(data[:2], b"d\n")
+ self.assertEqual(f.seek(0), 0)
+ self.assertEqual(f.read(20), b"hello world\n")
+ self.assertEqual(f.read(1), b"")
+ self.assertEqual(f.readinto(b"x"), 0)
+ self.assertEqual(f.seek(-6, 2), 6)
+ self.assertEqual(f.read(5), b"world")
+ self.assertEqual(f.read(0), b"")
+ self.assertEqual(f.readinto(b""), 0)
+ self.assertEqual(f.seek(-6, 1), 5)
+ self.assertEqual(f.read(5), b" worl")
+ self.assertEqual(f.tell(), 10)
+ if buffered:
+ f.seek(0)
+ self.assertEqual(f.read(), b"hello world\n")
+ f.seek(6)
+ self.assertEqual(f.read(), b"world\n")
+ self.assertEqual(f.read(), b"")
+
LARGE = 2**31
def large_file_ops(self, f):
@@ -112,24 +138,6 @@ class IOTest(unittest.TestCase):
self.assertEqual(f.seek(-1, 2), self.LARGE)
self.assertEqual(f.read(2), b"x")
- def read_ops(self, f):
- data = f.read(5)
- self.assertEqual(data, b"hello")
- n = f.readinto(data)
- self.assertEqual(n, 5)
- self.assertEqual(data, b" worl")
- n = f.readinto(data)
- self.assertEqual(n, 2)
- self.assertEqual(len(data), 5)
- self.assertEqual(data[:2], b"d\n")
- f.seek(0)
- self.assertEqual(f.read(20), b"hello world\n")
- f.seek(-6, 2)
- self.assertEqual(f.read(5), b"world")
- f.seek(-6, 1)
- self.assertEqual(f.read(5), b" worl")
- self.assertEqual(f.tell(), 10)
-
def test_raw_file_io(self):
f = io.open(test_support.TESTFN, "wb", buffering=0)
self.assertEqual(f.readable(), False)
@@ -155,7 +163,7 @@ class IOTest(unittest.TestCase):
self.assertEqual(f.readable(), True)
self.assertEqual(f.writable(), False)
self.assertEqual(f.seekable(), True)
- self.read_ops(f)
+ self.read_ops(f, True)
f.close()
def test_raw_bytes_io(self):
@@ -164,7 +172,7 @@ class IOTest(unittest.TestCase):
data = f.getvalue()
self.assertEqual(data, b"hello world\n")
f = io.BytesIO(data)
- self.read_ops(f)
+ self.read_ops(f, True)
def test_large_file_ops(self):
# On Windows and Mac OSX this test consumes large resources; it takes
@@ -445,6 +453,10 @@ class BufferedRandomTest(unittest.TestCase):
class TextIOWrapperTest(unittest.TestCase):
+
+## def tearDown(self):
+## test_support.unlink(test_support.TESTFN)
+
def testNewlines(self):
input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ]
@@ -486,6 +498,62 @@ class TextIOWrapperTest(unittest.TestCase):
self.assertEquals(got_line, exp_line)
self.assertEquals(len(got_lines), len(exp_lines))
+ # Systematic tests of the text I/O API
+
+ def testBasicIO(self):
+ for chunksize in (1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65):
+ for enc in "ascii", "latin1", "utf8":  # , "utf-16-be", "utf-16-le":
+ f = io.open(test_support.TESTFN, "w+", encoding=enc)
+ f._CHUNK_SIZE = chunksize
+ self.assertEquals(f.write("abc"), 3)
+ f.close()
+ f = io.open(test_support.TESTFN, "r+", encoding=enc)
+ f._CHUNK_SIZE = chunksize
+ self.assertEquals(f.tell(), 0)
+ self.assertEquals(f.read(), "abc")
+ cookie = f.tell()
+ self.assertEquals(f.seek(0), 0)
+ self.assertEquals(f.read(2), "ab")
+ self.assertEquals(f.read(1), "c")
+ self.assertEquals(f.read(1), "")
+ self.assertEquals(f.read(), "")
+ self.assertEquals(f.tell(), cookie)
+ self.assertEquals(f.seek(0), 0)
+ self.assertEquals(f.seek(0, 2), cookie)
+ self.assertEquals(f.write("def"), 3)
+ self.assertEquals(f.seek(cookie), cookie)
+ self.assertEquals(f.read(), "def")
+ if enc.startswith("utf"):
+ self.multi_line_test(f, enc)
+ f.close()
+
+ def multi_line_test(self, f, enc):
+ f.seek(0)
+ f.truncate()
+ sample = u"s\xff\u0fff\uffff"
+ wlines = []
+ for size in (0, 1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65,
+ 100, 200, 300, 400, 500, 1000):
+ chars = []
+ for i in xrange(size):
+ chars.append(sample[i % len(sample)])
+ line = u"".join(chars) + "\n"
+ wlines.append((f.tell(), line))
+ f.write(line)
+ wendpos = f.tell()
+ f.seek(0)
+ rlines = []
+ while True:
+ pos = f.tell()
+ line = f.readline()
+ if not line:
+ rendpos = pos
+ break
+ rlines.append((pos, line))
+ self.assertEquals(rendpos, wendpos)
+ self.assertEquals(rlines, wlines)
+
+
# XXX Tests for open()
def test_main():