summaryrefslogtreecommitdiffstats
path: root/Modules/_io/stringio.c
diff options
context:
space:
mode:
Diffstat (limited to 'Modules/_io/stringio.c')
-rw-r--r--Modules/_io/stringio.c185
1 files changed, 153 insertions, 32 deletions
diff --git a/Modules/_io/stringio.c b/Modules/_io/stringio.c
index 1f1fa40..9d73884 100644
--- a/Modules/_io/stringio.c
+++ b/Modules/_io/stringio.c
@@ -1,19 +1,32 @@
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "structmember.h"
+#include "accu.h"
#include "_iomodule.h"
/* Implementation note: the buffer is always at least one character longer
than the enclosed string, for proper functioning of _PyIO_find_line_ending.
*/
+#define STATE_REALIZED 1
+#define STATE_ACCUMULATING 2
+
typedef struct {
PyObject_HEAD
- Py_UNICODE *buf;
+ Py_UCS4 *buf;
Py_ssize_t pos;
Py_ssize_t string_size;
size_t buf_size;
+ /* The stringio object can be in two states: accumulating or realized.
+ In accumulating state, the internal buffer contains nothing and
+ the contents are given by the embedded _PyAccu structure.
+ In realized state, the internal buffer is meaningful and the
+ _PyAccu is destroyed.
+ */
+ int state;
+ _PyAccu accu;
+
char ok; /* initialized? */
char closed;
char readuniversal;
@@ -21,7 +34,7 @@ typedef struct {
PyObject *decoder;
PyObject *readnl;
PyObject *writenl;
-
+
PyObject *dict;
PyObject *weakreflist;
} stringio;
@@ -40,6 +53,11 @@ typedef struct {
return NULL; \
}
+#define ENSURE_REALIZED(self) \
+ if (realize(self) < 0) { \
+ return NULL; \
+ }
+
PyDoc_STRVAR(stringio_doc,
"Text I/O implementation using an in-memory buffer.\n"
"\n"
@@ -56,7 +74,7 @@ resize_buffer(stringio *self, size_t size)
/* Here, unsigned types are used to avoid dealing with signed integer
overflow, which is undefined in C. */
size_t alloc = self->buf_size;
- Py_UNICODE *new_buf = NULL;
+ Py_UCS4 *new_buf = NULL;
assert(self->buf != NULL);
@@ -84,10 +102,9 @@ resize_buffer(stringio *self, size_t size)
alloc = size + 1;
}
- if (alloc > ((size_t)-1) / sizeof(Py_UNICODE))
+ if (alloc > PY_SIZE_MAX / sizeof(Py_UCS4))
goto overflow;
- new_buf = (Py_UNICODE *)PyMem_Realloc(self->buf,
- alloc * sizeof(Py_UNICODE));
+ new_buf = (Py_UCS4 *)PyMem_Realloc(self->buf, alloc * sizeof(Py_UCS4));
if (new_buf == NULL) {
PyErr_NoMemory();
return -1;
@@ -103,14 +120,62 @@ resize_buffer(stringio *self, size_t size)
return -1;
}
+static PyObject *
+make_intermediate(stringio *self)
+{
+ PyObject *intermediate = _PyAccu_Finish(&self->accu);
+ self->state = STATE_REALIZED;
+ if (intermediate == NULL)
+ return NULL;
+ if (_PyAccu_Init(&self->accu) ||
+ _PyAccu_Accumulate(&self->accu, intermediate)) {
+ Py_DECREF(intermediate);
+ return NULL;
+ }
+ self->state = STATE_ACCUMULATING;
+ return intermediate;
+}
+
+static int
+realize(stringio *self)
+{
+ Py_ssize_t len;
+ PyObject *intermediate;
+
+ if (self->state == STATE_REALIZED)
+ return 0;
+ assert(self->state == STATE_ACCUMULATING);
+ self->state = STATE_REALIZED;
+
+ intermediate = _PyAccu_Finish(&self->accu);
+ if (intermediate == NULL)
+ return -1;
+
+ /* Append the intermediate string to the internal buffer.
+ The length should be equal to the current cursor position.
+ */
+ len = PyUnicode_GET_LENGTH(intermediate);
+ if (resize_buffer(self, len) < 0) {
+ Py_DECREF(intermediate);
+ return -1;
+ }
+ if (!PyUnicode_AsUCS4(intermediate, self->buf, len, 0)) {
+ Py_DECREF(intermediate);
+ return -1;
+ }
+
+ Py_DECREF(intermediate);
+ return 0;
+}
+
/* Internal routine for writing a whole PyUnicode object to the buffer of a
StringIO object. Returns 0 on success, or -1 on error. */
static Py_ssize_t
write_str(stringio *self, PyObject *obj)
{
- Py_UNICODE *str;
Py_ssize_t len;
PyObject *decoded = NULL;
+
assert(self->buf != NULL);
assert(self->pos >= 0);
@@ -132,9 +197,11 @@ write_str(stringio *self, PyObject *obj)
return -1;
assert(PyUnicode_Check(decoded));
- str = PyUnicode_AS_UNICODE(decoded);
- len = PyUnicode_GET_SIZE(decoded);
-
+ if (PyUnicode_READY(decoded)) {
+ Py_DECREF(decoded);
+ return -1;
+ }
+ len = PyUnicode_GET_LENGTH(decoded);
assert(len >= 0);
/* This overflow check is not strictly necessary. However, it avoids us to
@@ -145,6 +212,17 @@ write_str(stringio *self, PyObject *obj)
"new position too large");
goto fail;
}
+
+ if (self->state == STATE_ACCUMULATING) {
+ if (self->string_size == self->pos) {
+ if (_PyAccu_Accumulate(&self->accu, decoded))
+ goto fail;
+ goto success;
+ }
+ if (realize(self))
+ goto fail;
+ }
+
if (self->pos + len > self->string_size) {
if (resize_buffer(self, self->pos + len) < 0)
goto fail;
@@ -161,18 +239,22 @@ write_str(stringio *self, PyObject *obj)
*/
memset(self->buf + self->string_size, '\0',
- (self->pos - self->string_size) * sizeof(Py_UNICODE));
+ (self->pos - self->string_size) * sizeof(Py_UCS4));
}
/* Copy the data to the internal buffer, overwriting some of the
existing data if self->pos < self->string_size. */
- memcpy(self->buf + self->pos, str, len * sizeof(Py_UNICODE));
- self->pos += len;
+ if (!PyUnicode_AsUCS4(decoded,
+ self->buf + self->pos,
+ self->buf_size - self->pos,
+ 0))
+ goto fail;
+success:
/* Set the new length of the internal string if it has changed. */
- if (self->string_size < self->pos) {
+ self->pos += len;
+ if (self->string_size < self->pos)
self->string_size = self->pos;
- }
Py_DECREF(decoded);
return 0;
@@ -190,7 +272,10 @@ stringio_getvalue(stringio *self)
{
CHECK_INITIALIZED(self);
CHECK_CLOSED(self);
- return PyUnicode_FromUnicode(self->buf, self->string_size);
+ if (self->state == STATE_ACCUMULATING)
+ return make_intermediate(self);
+ return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, self->buf,
+ self->string_size);
}
PyDoc_STRVAR(stringio_tell_doc,
@@ -214,7 +299,7 @@ static PyObject *
stringio_read(stringio *self, PyObject *args)
{
Py_ssize_t size, n;
- Py_UNICODE *output;
+ Py_UCS4 *output;
PyObject *arg = Py_None;
CHECK_INITIALIZED(self);
@@ -245,21 +330,29 @@ stringio_read(stringio *self, PyObject *args)
size = 0;
}
+ /* Optimization for seek(0); read() */
+ if (self->state == STATE_ACCUMULATING && self->pos == 0 && size == n) {
+ PyObject *result = make_intermediate(self);
+ self->pos = self->string_size;
+ return result;
+ }
+
+ ENSURE_REALIZED(self);
output = self->buf + self->pos;
self->pos += size;
- return PyUnicode_FromUnicode(output, size);
+ return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, size);
}
/* Internal helper, used by stringio_readline and stringio_iternext */
static PyObject *
_stringio_readline(stringio *self, Py_ssize_t limit)
{
- Py_UNICODE *start, *end, old_char;
+ Py_UCS4 *start, *end, old_char;
Py_ssize_t len, consumed;
/* In case of overseek, return the empty string */
if (self->pos >= self->string_size)
- return PyUnicode_FromString("");
+ return PyUnicode_New(0, 0);
start = self->buf + self->pos;
if (limit < 0 || limit > self->string_size - self->pos)
@@ -270,14 +363,14 @@ _stringio_readline(stringio *self, Py_ssize_t limit)
*end = '\0';
len = _PyIO_find_line_ending(
self->readtranslate, self->readuniversal, self->readnl,
- start, end, &consumed);
+ PyUnicode_4BYTE_KIND, (char*)start, (char*)end, &consumed);
*end = old_char;
/* If we haven't found any line ending, we just return everything
(`consumed` is ignored). */
if (len < 0)
len = limit;
self->pos += len;
- return PyUnicode_FromUnicode(start, len);
+ return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, start, len);
}
PyDoc_STRVAR(stringio_readline_doc,
@@ -295,6 +388,7 @@ stringio_readline(stringio *self, PyObject *args)
if (!PyArg_ParseTuple(args, "|O:readline", &arg))
return NULL;
CHECK_CLOSED(self);
+ ENSURE_REALIZED(self);
if (PyNumber_Check(arg)) {
limit = PyNumber_AsSsize_t(arg, PyExc_OverflowError);
@@ -316,6 +410,7 @@ stringio_iternext(stringio *self)
CHECK_INITIALIZED(self);
CHECK_CLOSED(self);
+ ENSURE_REALIZED(self);
if (Py_TYPE(self) == &PyStringIO_Type) {
/* Skip method call overhead for speed */
@@ -337,7 +432,7 @@ stringio_iternext(stringio *self)
if (line == NULL)
return NULL;
- if (PyUnicode_GET_SIZE(line) == 0) {
+ if (PyUnicode_GET_LENGTH(line) == 0) {
/* Reached EOF */
Py_DECREF(line);
return NULL;
@@ -386,6 +481,7 @@ stringio_truncate(stringio *self, PyObject *args)
}
if (size < self->string_size) {
+ ENSURE_REALIZED(self);
if (resize_buffer(self, size) < 0)
return NULL;
self->string_size = size;
@@ -462,8 +558,10 @@ stringio_write(stringio *self, PyObject *obj)
Py_TYPE(obj)->tp_name);
return NULL;
}
+ if (PyUnicode_READY(obj))
+ return NULL;
CHECK_CLOSED(self);
- size = PyUnicode_GET_SIZE(obj);
+ size = PyUnicode_GET_LENGTH(obj);
if (size > 0 && write_str(self, obj) < 0)
return NULL;
@@ -484,6 +582,7 @@ stringio_close(stringio *self)
/* Free up some memory */
if (resize_buffer(self, 0) < 0)
return NULL;
+ _PyAccu_Destroy(&self->accu);
Py_CLEAR(self->readnl);
Py_CLEAR(self->writenl);
Py_CLEAR(self->decoder);
@@ -513,6 +612,7 @@ stringio_dealloc(stringio *self)
PyMem_Free(self->buf);
self->buf = NULL;
}
+ _PyAccu_Destroy(&self->accu);
Py_CLEAR(self->readnl);
Py_CLEAR(self->writenl);
Py_CLEAR(self->decoder);
@@ -535,7 +635,7 @@ stringio_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
/* tp_alloc initializes all the fields to zero. So we don't have to
initialize them here. */
- self->buf = (Py_UNICODE *)PyMem_Malloc(0);
+ self->buf = (Py_UCS4 *)PyMem_Malloc(0);
if (self->buf == NULL) {
Py_DECREF(self);
return PyErr_NoMemory();
@@ -551,6 +651,7 @@ stringio_init(stringio *self, PyObject *args, PyObject *kwds)
PyObject *value = NULL;
PyObject *newline_obj = NULL;
char *newline = "\n";
+ Py_ssize_t value_len;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OO:__init__", kwlist,
&value, &newline_obj))
@@ -592,6 +693,7 @@ stringio_init(stringio *self, PyObject *args, PyObject *kwds)
self->ok = 0;
+ _PyAccu_Destroy(&self->accu);
Py_CLEAR(self->readnl);
Py_CLEAR(self->writenl);
Py_CLEAR(self->decoder);
@@ -628,19 +730,27 @@ stringio_init(stringio *self, PyObject *args, PyObject *kwds)
/* Now everything is set up, resize buffer to size of initial value,
and copy it */
self->string_size = 0;
- if (value && value != Py_None) {
- Py_ssize_t len = PyUnicode_GetSize(value);
+ if (value && value != Py_None)
+ value_len = PyUnicode_GetLength(value);
+ else
+ value_len = 0;
+ if (value_len > 0) {
/* This is a heuristic, for newline translation might change
the string length. */
- if (resize_buffer(self, len) < 0)
+ if (resize_buffer(self, 0) < 0)
return -1;
+ self->state = STATE_REALIZED;
self->pos = 0;
if (write_str(self, value) < 0)
return -1;
}
else {
+ /* Empty stringio object, we can start by accumulating */
if (resize_buffer(self, 0) < 0)
return -1;
+ if (_PyAccu_Init(&self->accu))
+ return -1;
+ self->state = STATE_ACCUMULATING;
}
self->pos = 0;
@@ -760,11 +870,22 @@ stringio_setstate(stringio *self, PyObject *state)
once by __init__. So we do not take any chance and replace object's
buffer completely. */
{
- Py_UNICODE *buf = PyUnicode_AS_UNICODE(PyTuple_GET_ITEM(state, 0));
- Py_ssize_t bufsize = PyUnicode_GET_SIZE(PyTuple_GET_ITEM(state, 0));
- if (resize_buffer(self, bufsize) < 0)
+ PyObject *item;
+ Py_UCS4 *buf;
+ Py_ssize_t bufsize;
+
+ item = PyTuple_GET_ITEM(state, 0);
+ buf = PyUnicode_AsUCS4Copy(item);
+ if (buf == NULL)
return NULL;
- memcpy(self->buf, buf, bufsize * sizeof(Py_UNICODE));
+ bufsize = PyUnicode_GET_LENGTH(item);
+
+ if (resize_buffer(self, bufsize) < 0) {
+ PyMem_Free(buf);
+ return NULL;
+ }
+ memcpy(self->buf, buf, bufsize * sizeof(Py_UCS4));
+ PyMem_Free(buf);
self->string_size = bufsize;
}