diff options
author | Inada Naoki <songofacandy@gmail.com> | 2020-02-27 04:48:59 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-27 04:48:59 (GMT) |
commit | 02a4d57263a9846de35b0db12763ff9e7326f62c (patch) | |
tree | 7055c08b72477a75014f9cc65f95ee5ec23d95da /Objects/unicodeobject.c | |
parent | 0c6e3aa67b84adb0fb7c272ae06b7ae77f832295 (diff) | |
download | cpython-02a4d57263a9846de35b0db12763ff9e7326f62c.zip cpython-02a4d57263a9846de35b0db12763ff9e7326f62c.tar.gz cpython-02a4d57263a9846de35b0db12763ff9e7326f62c.tar.bz2 |
bpo-39087: Optimize PyUnicode_AsUTF8AndSize() (GH-18327)
Avoid using temporary bytes object.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 98 |
1 files changed, 73 insertions, 25 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ee6d3df..e0a666f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3991,11 +3991,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr) } +static int unicode_fill_utf8(PyObject *unicode); + const char * PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) { - PyObject *bytes; - if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); return NULL; @@ -4004,21 +4004,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) return NULL; if (PyUnicode_UTF8(unicode) == NULL) { - assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); - bytes = _PyUnicode_AsUTF8String(unicode, NULL); - if (bytes == NULL) - return NULL; - _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); - if (_PyUnicode_UTF8(unicode) == NULL) { - PyErr_NoMemory(); - Py_DECREF(bytes); + if (unicode_fill_utf8(unicode) == -1) { return NULL; } - _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); - memcpy(_PyUnicode_UTF8(unicode), - PyBytes_AS_STRING(bytes), - _PyUnicode_UTF8_LENGTH(unicode) + 1); - Py_DECREF(bytes); } if (psize) @@ -5381,10 +5369,6 @@ static PyObject * unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, const char *errors) { - enum PyUnicode_Kind kind; - void *data; - Py_ssize_t size; - if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); return NULL; @@ -5397,9 +5381,12 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), PyUnicode_UTF8_LENGTH(unicode)); - kind = PyUnicode_KIND(unicode); - data = PyUnicode_DATA(unicode); - size = PyUnicode_GET_LENGTH(unicode); + enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); + void *data = PyUnicode_DATA(unicode); + Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); + + _PyBytesWriter writer; + char *end; switch (kind) { default: @@ -5407,12 +5394,73 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, case PyUnicode_1BYTE_KIND: /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ assert(!PyUnicode_IS_ASCII(unicode)); - return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors); + end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); + break; + case PyUnicode_2BYTE_KIND: + end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); + break; + case PyUnicode_4BYTE_KIND: + end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors); + break; + } + + if (end == NULL) { + _PyBytesWriter_Dealloc(&writer); + return NULL; + } + return _PyBytesWriter_Finish(&writer, end); +} + +static int +unicode_fill_utf8(PyObject *unicode) +{ + /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ + assert(!PyUnicode_IS_ASCII(unicode)); + + enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); + void *data = PyUnicode_DATA(unicode); + Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); + + _PyBytesWriter writer; + char *end; + + switch (kind) { + default: + Py_UNREACHABLE(); + case PyUnicode_1BYTE_KIND: + end = ucs1lib_utf8_encoder(&writer, unicode, data, size, + _Py_ERROR_STRICT, NULL); + break; case PyUnicode_2BYTE_KIND: - return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors); + end = ucs2lib_utf8_encoder(&writer, unicode, data, size, + _Py_ERROR_STRICT, NULL); + break; case PyUnicode_4BYTE_KIND: - return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors); + end = ucs4lib_utf8_encoder(&writer, unicode, data, size, + _Py_ERROR_STRICT, NULL); + break; + } + if (end == NULL) { + _PyBytesWriter_Dealloc(&writer); + return -1; + } + + char *start = writer.use_small_buffer ? writer.small_buffer : + PyBytes_AS_STRING(writer.buffer); + Py_ssize_t len = end - start; + + char *cache = PyObject_MALLOC(len + 1); + if (cache == NULL) { + _PyBytesWriter_Dealloc(&writer); + PyErr_NoMemory(); + return -1; } + _PyUnicode_UTF8(unicode) = cache; + _PyUnicode_UTF8_LENGTH(unicode) = len; + memcpy(cache, start, len); + cache[len] = '\0'; + _PyBytesWriter_Dealloc(&writer); + return 0; } PyObject * |