summaryrefslogtreecommitdiffstats
path: root/Objects/stringlib
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2015-10-08 22:33:49 (GMT)
committerVictor Stinner <victor.stinner@gmail.com>2015-10-08 22:33:49 (GMT)
commitfdfbf781140f22619b0ef6bfeac792496774bb69 (patch)
tree880a43536436cac5fc80543f4d29459b7429a449 /Objects/stringlib
parent08ec6d9611b612182808bdf1d30cdd786cde9c3e (diff)
downloadcpython-fdfbf781140f22619b0ef6bfeac792496774bb69.zip
cpython-fdfbf781140f22619b0ef6bfeac792496774bb69.tar.gz
cpython-fdfbf781140f22619b0ef6bfeac792496774bb69.tar.bz2
Issue #25318: Add _PyBytesWriter API
Add a new private API to optimize Unicode encoders. It uses a small buffer allocated on the stack and supports overallocation. Use _PyBytesWriter API for UCS1 (ASCII and Latin1) and UTF-8 encoders. Enable overallocation for the UTF-8 encoder with error handlers. unicode_encode_ucs1(): initialize collend to collstart+1 to not check the current character twice, we already know that it is not ASCII.
Diffstat (limited to 'Objects/stringlib')
-rw-r--r--Objects/stringlib/codecs.h84
1 files changed, 21 insertions, 63 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 562191c..d7a9918 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -263,10 +263,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Py_ssize_t i; /* index into s of next input byte */
- PyObject *result; /* result string object */
char *p; /* next free byte in output buffer */
- Py_ssize_t nallocated; /* number of result bytes allocated */
- Py_ssize_t nneeded; /* number of result bytes needed */
#if STRINGLIB_SIZEOF_CHAR > 1
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
@@ -275,39 +272,25 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#endif
#if STRINGLIB_SIZEOF_CHAR == 1
const Py_ssize_t max_char_size = 2;
- char stackbuf[MAX_SHORT_UNICHARS * 2];
#elif STRINGLIB_SIZEOF_CHAR == 2
const Py_ssize_t max_char_size = 3;
- char stackbuf[MAX_SHORT_UNICHARS * 3];
#else /* STRINGLIB_SIZEOF_CHAR == 4 */
const Py_ssize_t max_char_size = 4;
- char stackbuf[MAX_SHORT_UNICHARS * 4];
#endif
+ _PyBytesWriter writer;
assert(size >= 0);
+ _PyBytesWriter_Init(&writer);
- if (size <= MAX_SHORT_UNICHARS) {
- /* Write into the stack buffer; nallocated can't overflow.
- * At the end, we'll allocate exactly as much heap space as it
- * turns out we need.
- */
- nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
- result = NULL; /* will allocate after we're done */
- p = stackbuf;
- }
- else {
- if (size > PY_SSIZE_T_MAX / max_char_size) {
- /* integer overflow */
- return PyErr_NoMemory();
- }
- /* Overallocate on the heap, and give the excess back at the end. */
- nallocated = size * max_char_size;
- result = PyBytes_FromStringAndSize(NULL, nallocated);
- if (result == NULL)
- return NULL;
- p = PyBytes_AS_STRING(result);
+ if (size > PY_SSIZE_T_MAX / max_char_size) {
+ /* integer overflow */
+ return PyErr_NoMemory();
}
+ p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
+ if (p == NULL)
+ return NULL;
+
for (i = 0; i < size;) {
Py_UCS4 ch = data[i++];
@@ -338,6 +321,9 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
endpos++;
+ /* Only overallocate the buffer if it's not the last write */
+ writer.overallocate = (endpos < size);
+
switch (error_handler)
{
case _Py_ERROR_REPLACE:
@@ -387,29 +373,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
repsize = PyUnicode_GET_LENGTH(rep);
if (repsize > max_char_size) {
- Py_ssize_t offset;
-
- if (result == NULL)
- offset = p - stackbuf;
- else
- offset = p - PyBytes_AS_STRING(result);
-
- if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
- /* integer overflow */
- PyErr_NoMemory();
+ p = _PyBytesWriter_Prepare(&writer, p,
+ repsize - max_char_size);
+ if (p == NULL)
goto error;
- }
- nallocated += repsize - max_char_size;
- if (result != NULL) {
- if (_PyBytes_Resize(&result, nallocated) < 0)
- goto error;
- } else {
- result = PyBytes_FromStringAndSize(NULL, nallocated);
- if (result == NULL)
- goto error;
- Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
- }
- p = PyBytes_AS_STRING(result) + offset;
}
if (PyBytes_Check(rep)) {
@@ -437,6 +404,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
i = newpos;
}
+
+ /* If overallocation was disabled, ensure that it was the last
+ write. Otherwise, we missed an optimization */
+ assert(writer.overallocate || i == size);
}
else
#if STRINGLIB_SIZEOF_CHAR > 2
@@ -461,31 +432,18 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
}
- if (result == NULL) {
- /* This was stack allocated. */
- nneeded = p - stackbuf;
- assert(nneeded <= nallocated);
- result = PyBytes_FromStringAndSize(stackbuf, nneeded);
- }
- else {
- /* Cut back to size actually needed. */
- nneeded = p - PyBytes_AS_STRING(result);
- assert(nneeded <= nallocated);
- _PyBytes_Resize(&result, nneeded);
- }
-
#if STRINGLIB_SIZEOF_CHAR > 1
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
#endif
- return result;
+ return _PyBytesWriter_Finish(&writer, p);
#if STRINGLIB_SIZEOF_CHAR > 1
error:
Py_XDECREF(rep);
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
- Py_XDECREF(result);
+ _PyBytesWriter_Dealloc(&writer);
return NULL;
#endif