summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Victor Stinner <victor.stinner@gmail.com> 2015-10-12 20:36:57 (GMT)
committer: Victor Stinner <victor.stinner@gmail.com> 2015-10-12 20:36:57 (GMT)
commit: 358af1352689fc10c81690a193ff5414f5f930af (patch)
tree: 02a20c52da654b83bf13b976a1dd798d348e304e
parent: d65e4f4eea278357e5aaee9f510922ef83e04143 (diff)
download: cpython-358af1352689fc10c81690a193ff5414f5f930af.zip
cpython-358af1352689fc10c81690a193ff5414f5f930af.tar.gz
cpython-358af1352689fc10c81690a193ff5414f5f930af.tar.bz2
Issue #25353: Optimize unicode escape and raw unicode escape encoders to use
the new _PyBytesWriter API.
-rw-r--r-- Modules/_pickle.c 44
-rw-r--r-- Objects/unicodeobject.c 113
2 files changed, 93 insertions, 64 deletions
diff --git a/Modules/_pickle.c b/Modules/_pickle.c
index abaf4e5..341ac0d 100644
--- a/Modules/_pickle.c
+++ b/Modules/_pickle.c
@@ -2110,38 +2110,35 @@ save_bytes(PicklerObject *self, PyObject *obj)
static PyObject *
raw_unicode_escape(PyObject *obj)
{
- PyObject *repr;
char *p;
Py_ssize_t i, size;
- size_t expandsize;
void *data;
unsigned int kind;
+ _PyBytesWriter writer;
if (PyUnicode_READY(obj))
return NULL;
+ _PyBytesWriter_Init(&writer);
+
size = PyUnicode_GET_LENGTH(obj);
data = PyUnicode_DATA(obj);
kind = PyUnicode_KIND(obj);
- if (kind == PyUnicode_4BYTE_KIND)
- expandsize = 10;
- else
- expandsize = 6;
- if ((size_t)size > (size_t)PY_SSIZE_T_MAX / expandsize)
- return PyErr_NoMemory();
- repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
- if (repr == NULL)
- return NULL;
- if (size == 0)
- return repr;
- assert(Py_REFCNT(repr) == 1);
+ p = _PyBytesWriter_Alloc(&writer, size);
+ if (p == NULL)
+ goto error;
+ writer.overallocate = 1;
- p = PyBytes_AS_STRING(repr);
for (i=0; i < size; i++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
/* Map 32-bit characters to '\Uxxxxxxxx' */
if (ch >= 0x10000) {
+ /* -1: substract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'U';
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
@@ -2153,8 +2150,13 @@ raw_unicode_escape(PyObject *obj)
*p++ = Py_hexdigits[(ch >> 4) & 0xf];
*p++ = Py_hexdigits[ch & 15];
}
- /* Map 16-bit characters to '\uxxxx' */
+ /* Map 16-bit characters, '\\' and '\n' to '\uxxxx' */
else if (ch >= 256 || ch == '\\' || ch == '\n') {
+ /* -1: substract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'u';
*p++ = Py_hexdigits[(ch >> 12) & 0xf];
@@ -2166,10 +2168,12 @@ raw_unicode_escape(PyObject *obj)
else
*p++ = (char) ch;
}
- size = p - PyBytes_AS_STRING(repr);
- if (_PyBytes_Resize(&repr, size) < 0)
- return NULL;
- return repr;
+
+ return _PyBytesWriter_Finish(&writer, p);
+
+error:
+ _PyBytesWriter_Dealloc(&writer);
+ return NULL;
}
static int
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 4b3746c..f5044c8 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6052,11 +6052,10 @@ PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
Py_ssize_t i, len;
- PyObject *repr;
char *p;
int kind;
void *data;
- Py_ssize_t expandsize = 0;
+ _PyBytesWriter writer;
/* Initial allocation is based on the longest-possible character
escape.
@@ -6072,35 +6071,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
+
+ _PyBytesWriter_Init(&writer);
+
len = PyUnicode_GET_LENGTH(unicode);
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
- switch (kind) {
- case PyUnicode_1BYTE_KIND: expandsize = 4; break;
- case PyUnicode_2BYTE_KIND: expandsize = 6; break;
- case PyUnicode_4BYTE_KIND: expandsize = 10; break;
- }
-
- if (len == 0)
- return PyBytes_FromStringAndSize(NULL, 0);
-
- if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
- return PyErr_NoMemory();
- repr = PyBytes_FromStringAndSize(NULL,
- 2
- + expandsize*len
- + 1);
- if (repr == NULL)
- return NULL;
-
- p = PyBytes_AS_STRING(repr);
+ p = _PyBytesWriter_Alloc(&writer, len);
+ if (p == NULL)
+ goto error;
+ writer.overallocate = 1;
for (i = 0; i < len; i++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
/* Escape backslashes */
if (ch == '\\') {
+ /* -1: substract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = (char) ch;
continue;
@@ -6109,6 +6101,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map 21-bit characters to '\U00xxxxxx' */
else if (ch >= 0x10000) {
assert(ch <= MAX_UNICODE);
+
+ p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'U';
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
@@ -6124,6 +6121,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map 16-bit characters to '\uxxxx' */
if (ch >= 256) {
+ p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'u';
*p++ = Py_hexdigits[(ch >> 12) & 0x000F];
@@ -6134,20 +6135,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map special whitespace to '\t', \n', '\r' */
else if (ch == '\t') {
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 't';
}
else if (ch == '\n') {
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'n';
}
else if (ch == '\r') {
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'r';
}
/* Map non-printable US ASCII to '\xhh' */
else if (ch < ' ' || ch >= 0x7F) {
+ /* -1: substract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 4-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'x';
*p++ = Py_hexdigits[(ch >> 4) & 0x000F];
@@ -6159,10 +6177,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
*p++ = (char) ch;
}
- assert(p - PyBytes_AS_STRING(repr) > 0);
- if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
- return NULL;
- return repr;
+ return _PyBytesWriter_Finish(&writer, p);
+
+error:
+ _PyBytesWriter_Dealloc(&writer);
+ return NULL;
}
PyObject *
@@ -6291,13 +6310,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
- PyObject *repr;
char *p;
- char *q;
- Py_ssize_t expandsize, pos;
+ Py_ssize_t pos;
int kind;
void *data;
Py_ssize_t len;
+ _PyBytesWriter writer;
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
@@ -6305,28 +6323,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
+
+ _PyBytesWriter_Init(&writer);
+
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
len = PyUnicode_GET_LENGTH(unicode);
- /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
- bytes, and 1 byte characters 4. */
- expandsize = kind * 2 + 2;
- if (len > PY_SSIZE_T_MAX / expandsize)
- return PyErr_NoMemory();
-
- repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
- if (repr == NULL)
- return NULL;
- if (len == 0)
- return repr;
+ p = _PyBytesWriter_Alloc(&writer, len);
+ if (p == NULL)
+ goto error;
+ writer.overallocate = 1;
- p = q = PyBytes_AS_STRING(repr);
for (pos = 0; pos < len; pos++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
/* Map 32-bit characters to '\Uxxxxxxxx' */
if (ch >= 0x10000) {
assert(ch <= MAX_UNICODE);
+
+ /* -1: substract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'U';
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
@@ -6340,6 +6359,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
}
/* Map 16-bit characters to '\uxxxx' */
else if (ch >= 256) {
+ /* -1: substract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'u';
*p++ = Py_hexdigits[(ch >> 12) & 0xf];
@@ -6352,10 +6376,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
*p++ = (char) ch;
}
- assert(p > q);
- if (_PyBytes_Resize(&repr, p - q) < 0)
- return NULL;
- return repr;
+ return _PyBytesWriter_Finish(&writer, p);
+
+error:
+ _PyBytesWriter_Dealloc(&writer);
+ return NULL;
}
PyObject *