diff options
-rw-r--r-- | Lib/io.py | 288 | ||||
-rw-r--r-- | Lib/test/test_memoryio.py | 10 | ||||
-rw-r--r-- | Lib/test/test_minidom.py | 3 | ||||
-rw-r--r-- | Lib/test/test_uu.py | 34 | ||||
-rw-r--r-- | Lib/xml/dom/minidom.py | 7 | ||||
-rw-r--r-- | Misc/NEWS | 2 | ||||
-rw-r--r-- | Modules/_stringio.c | 379 | ||||
-rw-r--r-- | setup.py | 1 |
8 files changed, 702 insertions, 22 deletions
@@ -1769,20 +1769,20 @@ class TextIOWrapper(TextIOBase): def newlines(self): return self._decoder.newlines if self._decoder else None -class StringIO(TextIOWrapper): - """An in-memory stream for text. The initial_value argument sets the - value of object. The other arguments are like those of TextIOWrapper's - constructor. +class _StringIO(TextIOWrapper): + """Text I/O implementation using an in-memory buffer. + + The initial_value argument sets the value of object. The newline + argument is like the one of TextIOWrapper's constructor. """ # XXX This is really slow, but fully functional - def __init__(self, initial_value="", encoding="utf-8", - errors="strict", newline="\n"): - super(StringIO, self).__init__(BytesIO(), - encoding=encoding, - errors=errors, - newline=newline) + def __init__(self, initial_value="", newline="\n"): + super(_StringIO, self).__init__(BytesIO(), + encoding="utf-8", + errors="strict", + newline=newline) if initial_value: if not isinstance(initial_value, str): initial_value = str(initial_value) @@ -1792,3 +1792,271 @@ class StringIO(TextIOWrapper): def getvalue(self): self.flush() return self.buffer.getvalue().decode(self._encoding, self._errors) + +try: + import _stringio + + # This subclass is a reimplementation of the TextIOWrapper + # interface without any of its text decoding facilities. All the + # stored data is manipulated with the efficient + # _stringio._StringIO extension type. Also, the newline decoding + # mechanism of IncrementalNewlineDecoder is reimplemented here for + # efficiency. Doing otherwise, would require us to implement a + # fake decoder which would add an additional and unnecessary layer + # on top of the _StringIO methods. + + class StringIO(_stringio._StringIO, TextIOBase): + """Text I/O implementation using an in-memory buffer. + + The initial_value argument sets the value of object. The newline + argument is like the one of TextIOWrapper's constructor. + """ + + _CHUNK_SIZE = 4096 + + def __init__(self, initial_value="", newline="\n"): + if newline not in (None, "", "\n", "\r", "\r\n"): + raise ValueError("illegal newline value: %r" % (newline,)) + + self._readuniversal = not newline + self._readtranslate = newline is None + self._readnl = newline + self._writetranslate = newline != "" + self._writenl = newline or os.linesep + self._pending = "" + self._seennl = 0 + + # Reset the buffer first, in case __init__ is called + # multiple times. + self.truncate(0) + if initial_value is None: + initial_value = "" + self.write(initial_value) + self.seek(0) + + @property + def buffer(self): + raise UnsupportedOperation("%s.buffer attribute is unsupported" % + self.__class__.__name__) + + def _decode_newlines(self, input, final=False): + # decode input (with the eventual \r from a previous pass) + if self._pending: + input = self._pending + input + + # retain last \r even when not translating data: + # then readline() is sure to get \r\n in one pass + if input.endswith("\r") and not final: + input = input[:-1] + self._pending = "\r" + else: + self._pending = "" + + # Record which newlines are read + crlf = input.count('\r\n') + cr = input.count('\r') - crlf + lf = input.count('\n') - crlf + self._seennl |= (lf and self._LF) | (cr and self._CR) \ + | (crlf and self._CRLF) + + if self._readtranslate: + if crlf: + output = input.replace("\r\n", "\n") + if cr: + output = input.replace("\r", "\n") + else: + output = input + + return output + + def writable(self): + return True + + def readable(self): + return True + + def seekable(self): + return True + + _read = _stringio._StringIO.read + _write = _stringio._StringIO.write + _tell = _stringio._StringIO.tell + _seek = _stringio._StringIO.seek + _truncate = _stringio._StringIO.truncate + _getvalue = _stringio._StringIO.getvalue + + def getvalue(self) -> str: + """Retrieve the entire contents of the object.""" + if self.closed: + raise ValueError("read on closed file") + return self._getvalue() + + def write(self, s: str) -> int: + """Write string s to file. + + Returns the number of characters written. + """ + if self.closed: + raise ValueError("write to closed file") + if not isinstance(s, str): + raise TypeError("can't write %s to text stream" % + s.__class__.__name__) + length = len(s) + if self._writetranslate and self._writenl != "\n": + s = s.replace("\n", self._writenl) + self._pending = "" + self._write(s) + return length + + def read(self, n: int = None) -> str: + """Read at most n characters, returned as a string. + + If the argument is negative or omitted, read until EOF + is reached. Return an empty string at EOF. + """ + if self.closed: + raise ValueError("read to closed file") + if n is None: + n = -1 + res = self._pending + if n < 0: + res += self._decode_newlines(self._read(), True) + self._pending = "" + return res + else: + res = self._decode_newlines(self._read(n), True) + self._pending = res[n:] + return res[:n] + + def tell(self) -> int: + """Tell the current file position.""" + if self.closed: + raise ValueError("tell from closed file") + if self._pending: + return self._tell() - len(self._pending) + else: + return self._tell() + + def seek(self, pos: int = None, whence: int = 0) -> int: + """Change stream position. + + Seek to character offset pos relative to position indicated by whence: + 0 Start of stream (the default). pos should be >= 0; + 1 Current position - pos must be 0; + 2 End of stream - pos must be 0. + Returns the new absolute position. + """ + if self.closed: + raise ValueError("seek from closed file") + self._pending = "" + return self._seek(pos, whence) + + def truncate(self, pos: int = None) -> int: + """Truncate size to pos. + + The pos argument defaults to the current file position, as + returned by tell(). Imply an absolute seek to pos. + Returns the new absolute position. + """ + if self.closed: + raise ValueError("truncate from closed file") + self._pending = "" + return self._truncate(pos) + + def readline(self, limit: int = None) -> str: + if self.closed: + raise ValueError("read from closed file") + if limit is None: + limit = -1 + if limit >= 0: + # XXX: Hack to support limit argument, for backwards + # XXX compatibility + line = self.readline() + if len(line) <= limit: + return line + line, self._pending = line[:limit], line[limit:] + self._pending + return line + + line = self._pending + self._pending = "" + + start = 0 + pos = endpos = None + while True: + if self._readtranslate: + # Newlines are already translated, only search for \n + pos = line.find('\n', start) + if pos >= 0: + endpos = pos + 1 + break + else: + start = len(line) + + elif self._readuniversal: + # Universal newline search. Find any of \r, \r\n, \n + # The decoder ensures that \r\n are not split in two pieces + + # In C we'd look for these in parallel of course. + nlpos = line.find("\n", start) + crpos = line.find("\r", start) + if crpos == -1: + if nlpos == -1: + # Nothing found + start = len(line) + else: + # Found \n + endpos = nlpos + 1 + break + elif nlpos == -1: + # Found lone \r + endpos = crpos + 1 + break + elif nlpos < crpos: + # Found \n + endpos = nlpos + 1 + break + elif nlpos == crpos + 1: + # Found \r\n + endpos = crpos + 2 + break + else: + # Found \r + endpos = crpos + 1 + break + else: + # non-universal + pos = line.find(self._readnl) + if pos >= 0: + endpos = pos + len(self._readnl) + break + + # No line ending seen yet - get more data + more_line = self.read(self._CHUNK_SIZE) + if more_line: + line += more_line + else: + # end of file + return line + + self._pending = line[endpos:] + return line[:endpos] + + _LF = 1 + _CR = 2 + _CRLF = 4 + + @property + def newlines(self): + return (None, + "\n", + "\r", + ("\r", "\n"), + "\r\n", + ("\n", "\r\n"), + ("\r", "\r\n"), + ("\r", "\n", "\r\n") + )[self._seennl] + + +except ImportError: + StringIO = _StringIO diff --git a/Lib/test/test_memoryio.py b/Lib/test/test_memoryio.py index 2d91cbd..d1745bc 100644 --- a/Lib/test/test_memoryio.py +++ b/Lib/test/test_memoryio.py @@ -10,7 +10,7 @@ import io import sys try: - import _bytesio + import _bytesio, _stringio has_c_implementation = True except ImportError: has_c_implementation = False @@ -373,7 +373,7 @@ class PyBytesIOTest(MemoryTestMixin, unittest.TestCase): class PyStringIOTest(MemoryTestMixin, unittest.TestCase): buftype = str - ioclass = io.StringIO + ioclass = io._StringIO EOF = "" def test_relative_seek(self): @@ -404,10 +404,14 @@ if has_c_implementation: class CBytesIOTest(PyBytesIOTest): ioclass = io.BytesIO + class CStringIOTest(PyStringIOTest): + ioclass = io.StringIO + + def test_main(): tests = [PyBytesIOTest, PyStringIOTest] if has_c_implementation: - tests.extend([CBytesIOTest]) + tests.extend([CBytesIOTest, CStringIOTest]) support.run_unittest(*tests) if __name__ == '__main__': diff --git a/Lib/test/test_minidom.py b/Lib/test/test_minidom.py index ca1f836..c4c568f 100644 --- a/Lib/test/test_minidom.py +++ b/Lib/test/test_minidom.py @@ -3,7 +3,6 @@ import os import sys import pickle -from io import StringIO from test.support import verbose, run_unittest, TestSkipped import unittest @@ -80,7 +79,7 @@ class MinidomTest(unittest.TestCase): self.confirm(t == s, "looking for %s, found %s" % (repr(s), repr(t))) def testParseFromFile(self): - dom = parse(StringIO(open(tstfile).read())) + dom = parse(open(tstfile)) dom.unlink() self.confirm(isinstance(dom, Document)) diff --git a/Lib/test/test_uu.py b/Lib/test/test_uu.py index 02d0171..d2b6e73 100644 --- a/Lib/test/test_uu.py +++ b/Lib/test/test_uu.py @@ -17,6 +17,32 @@ encodedtext = b"""\ M5&AE('-M;V]T:\"US8V%L960@<'ET:&]N(&-R97!T(&]V97(@=&AE('-L965P (:6YG(&1O9PH """ +# Stolen from io.py +class FakeIO(io.TextIOWrapper): + """Text I/O implementation using an in-memory buffer. + + Can be a used as a drop-in replacement for sys.stdin and sys.stdout. + """ + + # XXX This is really slow, but fully functional + + def __init__(self, initial_value="", encoding="utf-8", + errors="strict", newline="\n"): + super(FakeIO, self).__init__(io.BytesIO(), + encoding=encoding, + errors=errors, + newline=newline) + if initial_value: + if not isinstance(initial_value, str): + initial_value = str(initial_value) + self.write(initial_value) + self.seek(0) + + def getvalue(self): + self.flush() + return self.buffer.getvalue().decode(self._encoding, self._errors) + + def encodedtextwrapped(mode, filename): return (bytes("begin %03o %s\n" % (mode, filename), "ascii") + encodedtext + b"\n \nend\n") @@ -76,15 +102,15 @@ class UUStdIOTest(unittest.TestCase): sys.stdout = self.stdout def test_encode(self): - sys.stdin = io.StringIO(plaintext.decode("ascii")) - sys.stdout = io.StringIO() + sys.stdin = FakeIO(plaintext.decode("ascii")) + sys.stdout = FakeIO() uu.encode("-", "-", "t1", 0o666) self.assertEqual(sys.stdout.getvalue(), encodedtextwrapped(0o666, "t1").decode("ascii")) def test_decode(self): - sys.stdin = io.StringIO(encodedtextwrapped(0o666, "t1").decode("ascii")) - sys.stdout = io.StringIO() + sys.stdin = FakeIO(encodedtextwrapped(0o666, "t1").decode("ascii")) + sys.stdout = FakeIO() uu.decode("-", "-") stdout = sys.stdout sys.stdout = self.stdout diff --git a/Lib/xml/dom/minidom.py b/Lib/xml/dom/minidom.py index f229369..3025ed7 100644 --- a/Lib/xml/dom/minidom.py +++ b/Lib/xml/dom/minidom.py @@ -14,6 +14,7 @@ Todo: * SAX 2 namespaces """ +import codecs import io import xml.dom @@ -49,16 +50,16 @@ class Node(xml.dom.Node): # indent = the indentation string to prepend, per level # newl = the newline string to append use_encoding = "utf-8" if encoding is None else encoding - writer = io.StringIO(encoding=use_encoding) + writer = codecs.getwriter(use_encoding)(io.BytesIO()) if self.nodeType == Node.DOCUMENT_NODE: # Can pass encoding only to document, to put it into XML header self.writexml(writer, "", indent, newl, encoding) else: self.writexml(writer, "", indent, newl) if encoding is None: - return writer.getvalue() + return writer.stream.getvalue().decode(use_encoding) else: - return writer.buffer.getvalue() + return writer.stream.getvalue() def hasChildNodes(self): if self.childNodes: @@ -78,6 +78,8 @@ Extension Modules Library ------- +- Added C optimized implementation of io.StringIO. + - The ``pickle`` module is now automatically use an optimized C implementation of Pickler and Unpickler when available. The ``cPickle`` module is no longer needed. diff --git a/Modules/_stringio.c b/Modules/_stringio.c new file mode 100644 index 0000000..83fc79e --- /dev/null +++ b/Modules/_stringio.c @@ -0,0 +1,379 @@ +#include "Python.h" + +/* This module is a stripped down version of _bytesio.c with a Py_UNICODE + buffer. Most of the functionality is provided by subclassing _StringIO. */ + + +typedef struct { + PyObject_HEAD + Py_UNICODE *buf; + Py_ssize_t pos; + Py_ssize_t string_size; + size_t buf_size; +} StringIOObject; + + +/* Internal routine for changing the size, in terms of characters, of the + buffer of StringIO objects. The caller should ensure that the 'size' + argument is non-negative. Returns 0 on success, -1 otherwise. */ +static int +resize_buffer(StringIOObject *self, size_t size) +{ + /* Here, unsigned types are used to avoid dealing with signed integer + overflow, which is undefined in C. */ + size_t alloc = self->buf_size; + Py_UNICODE *new_buf = NULL; + + assert(self->buf != NULL); + + /* For simplicity, stay in the range of the signed type. Anyway, Python + doesn't allow strings to be longer than this. */ + if (size > PY_SSIZE_T_MAX) + goto overflow; + + if (size < alloc / 2) { + /* Major downsize; resize down to exact size. */ + alloc = size + 1; + } + else if (size < alloc) { + /* Within allocated size; quick exit */ + return 0; + } + else if (size <= alloc * 1.125) { + /* Moderate upsize; overallocate similar to list_resize() */ + alloc = size + (size >> 3) + (size < 9 ? 3 : 6); + } + else { + /* Major upsize; resize up to exact size */ + alloc = size + 1; + } + + if (alloc > ((size_t)-1) / sizeof(Py_UNICODE)) + goto overflow; + new_buf = (Py_UNICODE *)PyMem_Realloc(self->buf, + alloc * sizeof(Py_UNICODE)); + if (new_buf == NULL) { + PyErr_NoMemory(); + return -1; + } + self->buf_size = alloc; + self->buf = new_buf; + + return 0; + + overflow: + PyErr_SetString(PyExc_OverflowError, + "new buffer size too large"); + return -1; +} + +/* Internal routine for writing a string of characters to the buffer of a + StringIO object. Returns the number of bytes wrote, or -1 on error. */ +static Py_ssize_t +write_str(StringIOObject *self, const Py_UNICODE *str, Py_ssize_t len) +{ + assert(self->buf != NULL); + assert(self->pos >= 0); + assert(len >= 0); + + /* This overflow check is not strictly necessary. However, it avoids us to + deal with funky things like comparing an unsigned and a signed + integer. */ + if (self->pos > PY_SSIZE_T_MAX - len) { + PyErr_SetString(PyExc_OverflowError, + "new position too large"); + return -1; + } + if (self->pos + len > self->string_size) { + if (resize_buffer(self, self->pos + len) < 0) + return -1; + } + + if (self->pos > self->string_size) { + /* In case of overseek, pad with null bytes the buffer region between + the end of stream and the current position. + + 0 lo string_size hi + | |<---used--->|<----------available----------->| + | | <--to pad-->|<---to write---> | + 0 buf positon + + */ + memset(self->buf + self->string_size, '\0', + (self->pos - self->string_size) * sizeof(Py_UNICODE)); + } + + /* Copy the data to the internal buffer, overwriting some of the + existing data if self->pos < self->string_size. */ + memcpy(self->buf + self->pos, str, len * sizeof(Py_UNICODE)); + self->pos += len; + + /* Set the new length of the internal string if it has changed */ + if (self->string_size < self->pos) { + self->string_size = self->pos; + } + + return len; +} + +static PyObject * +stringio_getvalue(StringIOObject *self) +{ + return PyUnicode_FromUnicode(self->buf, self->string_size); +} + +static PyObject * +stringio_tell(StringIOObject *self) +{ + return PyLong_FromSsize_t(self->pos); +} + +static PyObject * +stringio_read(StringIOObject *self, PyObject *args) +{ + Py_ssize_t size, n; + Py_UNICODE *output; + PyObject *arg = Py_None; + + if (!PyArg_ParseTuple(args, "|O:read", &arg)) + return NULL; + + if (PyLong_Check(arg)) { + size = PyLong_AsSsize_t(arg); + } + else if (arg == Py_None) { + /* Read until EOF is reached, by default. */ + size = -1; + } + else { + PyErr_Format(PyExc_TypeError, "integer argument expected, got '%s'", + Py_TYPE(arg)->tp_name); + return NULL; + } + + /* adjust invalid sizes */ + n = self->string_size - self->pos; + if (size < 0 || size > n) { + size = n; + if (size < 0) + size = 0; + } + + assert(self->buf != NULL); + output = self->buf + self->pos; + self->pos += size; + + return PyUnicode_FromUnicode(output, size); +} + +static PyObject * +stringio_truncate(StringIOObject *self, PyObject *args) +{ + Py_ssize_t size; + PyObject *arg = Py_None; + + if (!PyArg_ParseTuple(args, "|O:truncate", &arg)) + return NULL; + + if (PyLong_Check(arg)) { + size = PyLong_AsSsize_t(arg); + } + else if (arg == Py_None) { + /* Truncate to current position if no argument is passed. */ + size = self->pos; + } + else { + PyErr_Format(PyExc_TypeError, "integer argument expected, got '%s'", + Py_TYPE(arg)->tp_name); + return NULL; + } + + if (size < 0) { + PyErr_Format(PyExc_ValueError, + "Negative size value %zd", size); + return NULL; + } + + if (size < self->string_size) { + self->string_size = size; + if (resize_buffer(self, size) < 0) + return NULL; + } + self->pos = size; + + return PyLong_FromSsize_t(size); +} + +static PyObject * +stringio_seek(StringIOObject *self, PyObject *args) +{ + Py_ssize_t pos; + int mode = 0; + + if (!PyArg_ParseTuple(args, "n|i:seek", &pos, &mode)) + return NULL; + + if (mode != 0 && mode != 1 && mode != 2) { + PyErr_Format(PyExc_ValueError, + "Invalid whence (%i, should be 0, 1 or 2)", mode); + return NULL; + } + else if (pos < 0 && mode == 0) { + PyErr_Format(PyExc_ValueError, + "Negative seek position %zd", pos); + return NULL; + } + else if (mode != 0 && pos != 0) { + PyErr_SetString(PyExc_IOError, + "Can't do nonzero cur-relative seeks"); + return NULL; + } + + /* mode 0: offset relative to beginning of the string. + mode 1: no change to current position. + mode 2: change position to end of file. */ + if (mode == 1) { + pos = self->pos; + } + else if (mode == 2) { + pos = self->string_size; + } + + self->pos = pos; + + return PyLong_FromSsize_t(self->pos); +} + +static PyObject * +stringio_write(StringIOObject *self, PyObject *obj) +{ + const Py_UNICODE *str; + Py_ssize_t size; + Py_ssize_t n = 0; + + if (PyUnicode_Check(obj)) { + str = PyUnicode_AsUnicode(obj); + size = PyUnicode_GetSize(obj); + } + else { + PyErr_Format(PyExc_TypeError, "string argument expected, got '%s'", + Py_TYPE(obj)->tp_name); + return NULL; + } + + if (size != 0) { + n = write_str(self, str, size); + if (n < 0) + return NULL; + } + + return PyLong_FromSsize_t(n); +} + +static void +stringio_dealloc(StringIOObject *self) +{ + PyMem_Free(self->buf); + Py_TYPE(self)->tp_free(self); +} + +static PyObject * +stringio_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + StringIOObject *self; + + assert(type != NULL && type->tp_alloc != NULL); + self = (StringIOObject *)type->tp_alloc(type, 0); + if (self == NULL) + return NULL; + + self->string_size = 0; + self->pos = 0; + self->buf_size = 0; + self->buf = (Py_UNICODE *)PyMem_Malloc(0); + if (self->buf == NULL) { + Py_DECREF(self); + return PyErr_NoMemory(); + } + + return (PyObject *)self; +} + +static struct PyMethodDef stringio_methods[] = { + {"getvalue", (PyCFunction)stringio_getvalue, METH_VARARGS, NULL}, + {"read", (PyCFunction)stringio_read, METH_VARARGS, NULL}, + {"tell", (PyCFunction)stringio_tell, METH_NOARGS, NULL}, + {"truncate", (PyCFunction)stringio_truncate, METH_VARARGS, NULL}, + {"seek", (PyCFunction)stringio_seek, METH_VARARGS, NULL}, + {"write", (PyCFunction)stringio_write, METH_O, NULL}, + {NULL, NULL} /* sentinel */ +}; + +static PyTypeObject StringIO_Type = { + PyVarObject_HEAD_INIT(NULL, 0) + "_stringio._StringIO", /*tp_name*/ + sizeof(StringIOObject), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor)stringio_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + 0, /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + stringio_methods, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + stringio_new, /*tp_new*/ +}; + +static struct PyModuleDef _stringiomodule = { + PyModuleDef_HEAD_INIT, + "_stringio", + NULL, + -1, + NULL, + NULL, + NULL, + NULL, + NULL +}; + +PyMODINIT_FUNC +PyInit__stringio(void) +{ + PyObject *m; + + if (PyType_Ready(&StringIO_Type) < 0) + return NULL; + m = PyModule_Create(&_stringiomodule); + if (m == NULL) + return NULL; + Py_INCREF(&StringIO_Type); + if (PyModule_AddObject(m, "_StringIO", (PyObject *)&StringIO_Type) < 0) + return NULL; + return m; +} @@ -422,6 +422,7 @@ class PyBuildExt(build_ext): exts.append( Extension("_functools", ["_functoolsmodule.c"]) ) # Memory-based IO accelerator modules exts.append( Extension("_bytesio", ["_bytesio.c"]) ) + exts.append( Extension("_stringio", ["_stringio.c"]) ) # C-optimized pickle replacement exts.append( Extension("_pickle", ["_pickle.c"]) ) # atexit |