summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Guido van Rossum <guido@python.org> 2007-04-06 17:31:18 (GMT)
committer Guido van Rossum <guido@python.org> 2007-04-06 17:31:18 (GMT)
commit 78892e46131d01c6f6e6dd7276143e50ffac442d (patch)
tree 6fd83de5a189dd3eeb5b3131788273520993b780
parent 0e074483e7b1f99488c57296b34452fec718a895 (diff)
download cpython-78892e46131d01c6f6e6dd7276143e50ffac442d.zip
cpython-78892e46131d01c6f6e6dd7276143e50ffac442d.tar.gz
cpython-78892e46131d01c6f6e6dd7276143e50ffac442d.tar.bz2
Added a working Text I/O layer, by Mark Russell.
This is essentially a checkpoint.
-rw-r--r-- Lib/io.py 283
-rw-r--r-- Lib/test/test_io.py 130
2 files changed, 366 insertions, 47 deletions
diff --git a/Lib/io.py b/Lib/io.py
index 4e91691..23e29ba 100644
--- a/Lib/io.py
+++ b/Lib/io.py
@@ -3,7 +3,7 @@
This is an early prototype; eventually some of this will be
reimplemented in C and the rest may be turned into a package.
-See PEP XXX; for now: http://docs.google.com/Doc?id=dfksfvqd_1cn5g5m
+See PEP 3116.
XXX need to default buffer size to 1 if isatty()
XXX need to support 1 meaning line-buffered
@@ -11,20 +11,24 @@ XXX change behavior of blocking I/O
"""
__author__ = ("Guido van Rossum <guido@python.org>, "
- "Mike Verdone <mike.verdone@gmail.com>")
+ "Mike Verdone <mike.verdone@gmail.com>, "
+ "Mark Russell <mark.russell@zen.co.uk>")
__all__ = ["open", "RawIOBase", "FileIO", "SocketIO", "BytesIO",
"BufferedReader", "BufferedWriter", "BufferedRWPair",
- "BufferedRandom", "EOF"]
+ "BufferedRandom"]
import os
+import sys
+import codecs
+import warnings
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
DEFAULT_MAX_BUFFER_SIZE = 16 * 1024 # bytes
-EOF = b'' # XXX This is wrong because it's mutable
class BlockingIO(IOError):
+
def __init__(self, errno, strerror, characters_written):
IOError.__init__(self, errno, strerror)
self.characters_written = characters_written
@@ -125,14 +129,12 @@ class RawIOBase:
seeked.
The read() method is implemented by calling readinto(); derived
- classes that want to support readon only need to implement
+ classes that want to support read() only need to implement
readinto() as a primitive operation.
"""
- # XXX Add individual method docstrings
-
def read(self, n):
- """Read and return up to n bytes.
+ """read(n: int) -> bytes. Read and return up to n bytes.
Returns an empty bytes array on EOF, or None if the object is
set not to block and has no data to read.
@@ -143,43 +145,80 @@ class RawIOBase:
return b
def readinto(self, b):
+ """readinto(b: bytes) -> int. Read up to len(b) bytes into b.
+
+ Returns number of bytes read (0 for EOF), or None if the object
+ is set not to block and has no data to read.
+ """
raise IOError(".readinto() not supported")
def write(self, b):
- """Write the given buffer to the IO stream.
+ """write(b: bytes) -> int. Write the given buffer to the IO stream.
- Returns the number of bytes written.
+ Returns the number of bytes written, which may be less than len(b).
"""
raise IOError(".write() not supported")
def seek(self, pos, whence=0):
+ """seek(pos: int, whence: int = 0) -> None. Change stream position.
+
+ Seek to byte offset pos relative to position indicated by whence:
+ 0 Start of stream (the default). pos should be >= 0;
+ 1 Current position - pos may be negative;
+ 2 End of stream - pos usually negative.
+ """
raise IOError(".seek() not supported")
def tell(self):
+ """tell() -> int. Return current stream position."""
raise IOError(".tell() not supported")
def truncate(self, pos=None):
+ """truncate(pos: int = None) -> None. Truncate file to pos bytes.
+
+ pos defaults to the current IO position as reported by tell().
+ """
raise IOError(".truncate() not supported")
def close(self):
+ """close() -> None. Close IO object."""
pass
def seekable(self):
+ """seekable() -> bool. Return whether object supports random access.
+
+ If False, seek(), tell() and truncate() will raise IOError.
+ This method may need to do a test seek().
+ """
return False
def readable(self):
+ """readable() -> bool. Return whether object was opened for reading.
+
+ If False, read() will raise IOError.
+ """
return False
def writable(self):
+ """writable() -> bool. Return whether object was opened for writing.
+
+ If False, write() and truncate() will raise IOError.
+ """
return False
def __enter__(self):
+ """Context management protocol. Returns self."""
return self
def __exit__(self, *args):
+ """Context management protocol. Same as close()"""
self.close()
def fileno(self):
+ """fileno() -> int. Return underlying file descriptor if there is one.
+
+ Raises IOError if the IO object does not use a file descriptor.
+ """
raise IOError(".fileno() not supported")
@@ -252,6 +291,8 @@ try:
import _fileio
except ImportError:
# Let's use the Python version
+ warnings.warn("Can't import _fileio, using slower Python lookalike",
+ RuntimeWarning)
FileIO = _PyFileIO
else:
# Create a trivial subclass with the proper inheritance structure
@@ -295,17 +336,13 @@ class BufferedIOBase(RawIOBase):
"""XXX Docstring."""
-class BytesIO(BufferedIOBase):
+class _MemoryBufferMixin:
- """Buffered I/O implementation using a bytes buffer, like StringIO."""
+ # XXX docstring
- # XXX More docs
-
- def __init__(self, inital_bytes=None):
- self._buffer = b""
+ def __init__(self, buffer):
+ self._buffer = buffer
self._pos = 0
- if inital_bytes is not None:
- self._buffer += inital_bytes
def getvalue(self):
return self._buffer
@@ -362,6 +399,35 @@ class BytesIO(BufferedIOBase):
return True
+class BytesIO(_MemoryBufferMixin, BufferedIOBase):
+
+ """Buffered I/O implementation using a bytes buffer, like StringIO."""
+
+ # XXX More docs
+
+ def __init__(self, inital_bytes=None):
+ buffer = b""
+ if inital_bytes is not None:
+ buffer += inital_bytes
+ _MemoryBufferMixin.__init__(self, buffer)
+
+
+class StringIO(_MemoryBufferMixin, BufferedIOBase):
+
+ """Buffered I/O implementation using a string buffer, like StringIO."""
+
+ # XXX More docs
+
+ # XXX Reuses the same code as BytesIO, just with a string rather
+ # than bytes as the _buffer value. That won't work in C of course.
+
+ def __init__(self, inital_string=None):
+ buffer = ""
+ if inital_string is not None:
+ buffer += inital_string
+ _MemoryBufferMixin.__init__(self, buffer)
+
+
class BufferedIOBase(RawIOBase):
"""Base class for buffered IO objects."""
@@ -375,15 +441,17 @@ class BufferedReader(BufferedIOBase):
"""Buffer for a readable sequential RawIO object.
- Does not allow random access (seek, tell).
+ Does not allow random access (seek, tell). (Use BufferedRandom
+ for that.)
"""
- def __init__(self, raw, unused_buffer_size=None):
+ def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE):
"""Create a new buffered reader using the given readable raw IO object.
"""
assert raw.readable()
self.raw = raw
self._read_buf = b""
+ self.buffer_size = buffer_size
if hasattr(raw, 'fileno'):
self.fileno = raw.fileno
@@ -395,11 +463,13 @@ class BufferedReader(BufferedIOBase):
mode. If n is None, read until EOF or until read() would
block.
"""
- assert n is None or n > 0
- nodata_val = EOF
- while (len(self._read_buf) < n) if (n is not None) else True:
- current = self.raw.read(n)
- if current in (EOF, None):
+ assert n is None or n > 0, '.read(): Bad read size %r' % n
+ nodata_val = b""
+ while n is None or len(self._read_buf) < n:
+ to_read = None if n is None else max(n, self.buffer_size)
+ current = self.raw.read(to_read)
+
+ if current in (b"", None):
nodata_val = current
break
self._read_buf += current
@@ -428,6 +498,8 @@ class BufferedReader(BufferedIOBase):
class BufferedWriter(BufferedIOBase):
+ # XXX docstring
+
def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE,
max_buffer_size=DEFAULT_MAX_BUFFER_SIZE):
assert raw.writable()
@@ -488,6 +560,8 @@ class BufferedRWPair(BufferedReader, BufferedWriter):
A buffered reader object and buffered writer object put together to
form a sequential IO object that can read and write.
+
+ This is typically used with a socket or two-way pipe.
"""
def __init__(self, reader, writer, buffer_size=DEFAULT_BUFFER_SIZE,
@@ -528,6 +602,8 @@ class BufferedRWPair(BufferedReader, BufferedWriter):
class BufferedRandom(BufferedReader, BufferedWriter):
+ # XXX docstring
+
def __init__(self, raw, buffer_size=DEFAULT_BUFFER_SIZE,
max_buffer_size=DEFAULT_MAX_BUFFER_SIZE):
assert raw.seekable()
@@ -561,7 +637,9 @@ class BufferedRandom(BufferedReader, BufferedWriter):
return BufferedReader.read(self, n)
def write(self, b):
- self._read_buf = b""
+ if self._read_buf:
+ self.raw.seek(-len(self._read_buf), 1) # Undo readahead
+ self._read_buf = b""
return BufferedWriter.write(self, b)
def flush(self):
@@ -569,3 +647,156 @@ class BufferedRandom(BufferedReader, BufferedWriter):
def close(self):
self.raw.close()
+
+
+class TextIOBase(BufferedIOBase):
+
+ """Base class for text I/O.
+
+ This class provides a character and line based interface to stream I/O.
+ """
+
+ def read(self, n: int = -1) -> str:
+ """read(n: int = -1) -> str. Read at most n characters from stream.
+
+ Read from underlying buffer until we have n characters or we hit EOF.
+ If n is negative or omitted, read until EOF.
+ """
+ raise IOError(".read() not supported")
+
+ def write(self, s: str):
+ """write(s: str) -> None. Write string s to stream.
+ """
+ raise IOError(".write() not supported")
+
+ def readline(self) -> str:
+ """readline() -> str. Read until newline or EOF.
+
+ Returns an empty string if EOF is hit immediately.
+ """
+ raise IOError(".readline() not supported")
+
+ def __iter__(self):
+ """__iter__() -> Iterator. Return line iterator (actually just self).
+ """
+ return self
+
+ def next(self):
+ """Same as readline() except raises StopIteration on immediate EOF.
+ """
+ line = self.readline()
+ if line == '':
+ raise StopIteration
+ return line
+
+
+class TextIOWrapper(TextIOBase):
+
+ """Buffered text stream.
+
+ Character and line based layer over a BufferedIOBase object.
+ """
+
+ # XXX tell(), seek()
+
+ def __init__(self, buffer, encoding=None, newline=None):
+ if newline not in (None, '\n', '\r\n'):
+ raise IOError("illegal newline %s" % newline) # XXX: ValueError?
+ if encoding is None:
+ # XXX This is questionable
+ encoding = sys.getfilesystemencoding()
+ if encoding is None:
+ encoding = "latin-1" # XXX, but this is best for transparency
+
+ self.buffer = buffer
+ self._encoding = encoding
+ self._newline = newline or os.linesep
+ self._fix_newlines = newline is None
+ self._decoder = None
+ self._pending = ''
+
+ def write(self, s: str):
+ return self.buffer.write(s.encode(self._encoding))
+
+ def _get_decoder(self):
+ make_decoder = codecs.getincrementaldecoder(self._encoding)
+ if make_decoder is None:
+ raise IOError(".readline() not supported for encoding %s" %
+ self._encoding)
+ decoder = self._decoder = make_decoder() # XXX: errors
+ if isinstance(decoder, codecs.BufferedIncrementalDecoder):
+ # XXX Hack: make the codec use bytes instead of strings
+ decoder.buffer = b""
+ return decoder
+
+ def read(self, n: int = -1):
+ decoder = self._decoder or self._get_decoder()
+ res = self._pending
+ if n < 0:
+ res += decoder.decode(self.buffer.read(), True)
+ self._pending = ''
+ return res
+ else:
+ while len(res) < n:
+ data = self.buffer.read(64)
+ res += decoder.decode(data, not data)
+ if not data:
+ break
+ self._pending = res[n:]
+ return res[:n]
+
+ def readline(self):
+ line = self._pending
+ start = 0
+ decoder = self._decoder or self._get_decoder()
+
+ while True:
+ # In C we'd look for these in parallel of course.
+ nlpos = line.find("\n", start)
+ crpos = line.find("\r", start)
+ if nlpos >= 0 and crpos >= 0:
+ endpos = min(nlpos, crpos)
+ else:
+ endpos = nlpos if nlpos >= 0 else crpos
+
+ if endpos != -1:
+ endc = line[endpos]
+ if endc == "\n":
+ ending = "\n"
+ break
+
+ # We've seen \r - is it standalone, \r\n or \r at end of line?
+ if endpos + 1 < len(line):
+ if line[endpos+1] == '\n':
+ ending = "\r\n"
+ else:
+ ending = "\r"
+ break
+ # There might be a following \n in the next block of data ...
+ start = endpos
+ else:
+ start = len(line)
+
+ # No line ending seen yet - get more data
+ while True:
+ data = self.buffer.read(64)
+ more_line = decoder.decode(data, not data)
+ if more_line != "" or not data:
+ break
+
+ if more_line == "":
+ ending = ''
+ endpos = len(line)
+ break
+
+ line += more_line
+
+ nextpos = endpos + len(ending)
+ self._pending = line[nextpos:]
+
+ # XXX Update self.newlines here if we want to support that
+
+ if self._fix_newlines and ending != "\n" and ending != '':
+ return line[:endpos] + "\n"
+ else:
+ return line[:nextpos]
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 956a502..53419f4 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -2,7 +2,7 @@
import unittest
from test import test_support
-
+from itertools import chain
import io
@@ -16,7 +16,7 @@ class MockIO(io.RawIOBase):
try:
return self._readStack.pop(0)
except:
- return io.EOF
+ return b""
def write(self, b):
self._writeStack.append(b)
@@ -41,6 +41,18 @@ class MockIO(io.RawIOBase):
return 42
+class MockFileIO(io.BytesIO):
+
+ def __init__(self, data):
+ self.read_history = []
+ io.BytesIO.__init__(self, data)
+
+ def read(self, n=None):
+ res = io.BytesIO.read(self, n)
+ self.read_history.append(None if res is None else len(res))
+ return res
+
+
class MockNonBlockWriterIO(io.RawIOBase):
def __init__(self, blockingScript):
@@ -147,31 +159,31 @@ class IOTest(unittest.TestCase):
f.close()
-class BytesIOTest(unittest.TestCase):
+class MemorySeekTest(unittest.TestCase):
def testInit(self):
- buf = b"1234567890"
- bytesIo = io.BytesIO(buf)
+ buf = self.buftype("1234567890")
+ bytesIo = self.ioclass(buf)
def testRead(self):
- buf = b"1234567890"
- bytesIo = io.BytesIO(buf)
+ buf = self.buftype("1234567890")
+ bytesIo = self.ioclass(buf)
self.assertEquals(buf[:1], bytesIo.read(1))
self.assertEquals(buf[1:5], bytesIo.read(4))
self.assertEquals(buf[5:], bytesIo.read(900))
- self.assertEquals(io.EOF, bytesIo.read())
+ self.assertEquals(self.EOF, bytesIo.read())
def testReadNoArgs(self):
- buf = b"1234567890"
- bytesIo = io.BytesIO(buf)
+ buf = self.buftype("1234567890")
+ bytesIo = self.ioclass(buf)
self.assertEquals(buf, bytesIo.read())
- self.assertEquals(io.EOF, bytesIo.read())
+ self.assertEquals(self.EOF, bytesIo.read())
def testSeek(self):
- buf = b"1234567890"
- bytesIo = io.BytesIO(buf)
+ buf = self.buftype("1234567890")
+ bytesIo = self.ioclass(buf)
bytesIo.read(5)
bytesIo.seek(0)
@@ -181,8 +193,8 @@ class BytesIOTest(unittest.TestCase):
self.assertEquals(buf[3:], bytesIo.read())
def testTell(self):
- buf = b"1234567890"
- bytesIo = io.BytesIO(buf)
+ buf = self.buftype("1234567890")
+ bytesIo = self.ioclass(buf)
self.assertEquals(0, bytesIo.tell())
bytesIo.seek(5)
@@ -191,6 +203,18 @@ class BytesIOTest(unittest.TestCase):
self.assertEquals(10000, bytesIo.tell())
+class BytesIOTest(MemorySeekTest):
+ buftype = bytes
+ ioclass = io.BytesIO
+ EOF = b""
+
+
+class StringIOTest(MemorySeekTest):
+ buftype = str
+ ioclass = io.StringIO
+ EOF = ""
+
+
class BufferedReaderTest(unittest.TestCase):
def testRead(self):
@@ -199,6 +223,25 @@ class BufferedReaderTest(unittest.TestCase):
self.assertEquals(b"abcdef", bufIo.read(6))
+ def testBuffering(self):
+ data = b"abcdefghi"
+ dlen = len(data)
+
+ tests = [
+ [ 100, [ 3, 1, 4, 8 ], [ dlen, 0 ] ],
+ [ 100, [ 3, 3, 3], [ dlen ] ],
+ [ 4, [ 1, 2, 4, 2 ], [ 4, 4, 1 ] ],
+ ]
+
+ for bufsize, buf_read_sizes, raw_read_sizes in tests:
+ rawIo = MockFileIO(data)
+ bufIo = io.BufferedReader(rawIo, buffer_size=bufsize)
+ pos = 0
+ for nbytes in buf_read_sizes:
+ self.assertEquals(bufIo.read(nbytes), data[pos:pos+nbytes])
+ pos += nbytes
+ self.assertEquals(rawIo.read_history, raw_read_sizes)
+
def testReadNonBlocking(self):
# Inject some None's in there to simulate EWOULDBLOCK
rawIo = MockIO((b"abc", b"d", None, b"efg", None, None))
@@ -208,7 +251,7 @@ class BufferedReaderTest(unittest.TestCase):
self.assertEquals(b"e", bufIo.read(1))
self.assertEquals(b"fg", bufIo.read())
self.assert_(None is bufIo.read())
- self.assertEquals(io.EOF, bufIo.read())
+ self.assertEquals(b"", bufIo.read())
def testReadToEof(self):
rawIo = MockIO((b"abc", b"d", b"efg"))
@@ -270,8 +313,9 @@ class BufferedWriterTest(unittest.TestCase):
bufIo.write(b"asdfasdfasdf")
- # XXX I don't like this test. It relies too heavily on how the algorithm
- # actually works, which we might change. Refactor later.
+ # XXX I don't like this test. It relies too heavily on how the
+ # algorithm actually works, which we might change. Refactor
+ # later.
def testFileno(self):
rawIo = MockIO((b"abc", b"d", b"efg"))
@@ -299,7 +343,7 @@ class BufferedRWPairTest(unittest.TestCase):
# XXX need implementation
-class BufferedRandom(unittest.TestCase):
+class BufferedRandomTest(unittest.TestCase):
def testReadAndWrite(self):
raw = MockIO((b"asdf", b"ghjk"))
@@ -331,12 +375,56 @@ class BufferedRandom(unittest.TestCase):
self.assertEquals(7, rw.tell())
self.assertEquals(b"fl", rw.read(11))
+
+class TextIOWrapperTest(unittest.TestCase):
+ def testNewlines(self):
+ input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ]
+
+ tests = [
+ [ None, [ 'unix\n', 'windows\n', 'os9\n', 'last\n', 'nonl' ] ],
+ [ '\n', input_lines ],
+ [ '\r\n', input_lines ],
+ ]
+
+ encodings = ('utf-8', 'bz2')
+
+ # Try a range of pad sizes to test the case where \r is the last
+ # character in TextIOWrapper._pending.
+ for encoding in encodings:
+ for do_reads in (False, True):
+ for padlen in chain(range(10), range(50, 60)):
+ pad = '.' * padlen
+ data_lines = [ pad + line for line in input_lines ]
+ # XXX: str.encode() should return bytes
+ data = bytes(''.join(data_lines).encode(encoding))
+
+ for newline, exp_line_ends in tests:
+ exp_lines = [ pad + line for line in exp_line_ends ]
+ bufIo = io.BufferedReader(io.BytesIO(data))
+ textIo = io.TextIOWrapper(bufIo, newline=newline,
+ encoding=encoding)
+ if do_reads:
+ got_lines = []
+ while True:
+ c2 = textIo.read(2)
+ if c2 == '':
+ break
+ self.assertEquals(len(c2), 2)
+ got_lines.append(c2 + textIo.readline())
+ else:
+ got_lines = list(textIo)
+
+ for got_line, exp_line in zip(got_lines, exp_lines):
+ self.assertEquals(got_line, exp_line)
+ self.assertEquals(len(got_lines), len(exp_lines))
+
# XXX Tests for open()
def test_main():
- test_support.run_unittest(IOTest, BytesIOTest, BufferedReaderTest,
+ test_support.run_unittest(IOTest, BytesIOTest, StringIOTest,
+ BufferedReaderTest,
BufferedWriterTest, BufferedRWPairTest,
- BufferedRandom)
+ BufferedRandomTest, TextIOWrapperTest)
if __name__ == "__main__":
test_main()