diff options
author | Inada Naoki <songofacandy@gmail.com> | 2019-06-24 03:30:24 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-06-24 03:30:24 (GMT) |
commit | 770847a7db33b3d4c451b42372b6942687aa6121 (patch) | |
tree | 04aaf3163636bed947763435ad03a76b6f211d7b /Objects/unicodeobject.c | |
parent | b3ca7972c8d8c6479b6542ce28e0f7a6ebd5b8fe (diff) | |
download | cpython-770847a7db33b3d4c451b42372b6942687aa6121.zip cpython-770847a7db33b3d4c451b42372b6942687aa6121.tar.gz cpython-770847a7db33b3d4c451b42372b6942687aa6121.tar.bz2 |
bpo-37348: optimize decoding ASCII string (GH-14283)
`_PyUnicode_Writer` is a relatively complex structure. Initializing it adds significant overhead when decoding short ASCII strings.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 85 |
1 files changed, 51 insertions, 34 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4f83625..625be4b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, /* Forward declaration */ static inline int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); +static inline void +_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer); static PyObject * unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, const char *errors); @@ -4877,16 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed) { - _PyUnicodeWriter writer; - const char *starts = s; - const char *end = s + size; - - Py_ssize_t startinpos; - Py_ssize_t endinpos; - const char *errmsg = ""; - PyObject *error_handler_obj = NULL; - PyObject *exc = NULL; - if (size == 0) { if (consumed) *consumed = 0; @@ -4900,13 +4892,29 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, return get_latin1_char((unsigned char)s[0]); } - _PyUnicodeWriter_Init(&writer); - writer.min_length = size; - if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) - goto onError; + const char *starts = s; + const char *end = s + size; + + // fast path: try ASCII string. + PyObject *u = PyUnicode_New(size, 127); + if (u == NULL) { + return NULL; + } + s += ascii_decode(s, end, PyUnicode_DATA(u)); + if (s == end) { + return u; + } + + // Use _PyUnicodeWriter after fast path is failed. 
+ _PyUnicodeWriter writer; + _PyUnicodeWriter_InitWithBuffer(&writer, u); + writer.pos = s - starts; + + Py_ssize_t startinpos, endinpos; + const char *errmsg = ""; + PyObject *error_handler_obj = NULL; + PyObject *exc = NULL; - writer.pos = ascii_decode(s, end, writer.data); - s += writer.pos; while (s < end) { Py_UCS4 ch; int kind = writer.kind; @@ -6451,7 +6459,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, length after conversion to the true value. (But decoding error handler might have to resize the string) */ _PyUnicodeWriter_Init(&writer); - writer.min_length = size; + writer.min_length = size; if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { goto onError; } @@ -6975,13 +6983,7 @@ PyUnicode_DecodeASCII(const char *s, const char *errors) { const char *starts = s; - _PyUnicodeWriter writer; - int kind; - void *data; - Py_ssize_t startinpos; - Py_ssize_t endinpos; - Py_ssize_t outpos; - const char *e; + const char *e = s + size; PyObject *error_handler_obj = NULL; PyObject *exc = NULL; _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; @@ -6993,20 +6995,25 @@ PyUnicode_DecodeASCII(const char *s, if (size == 1 && (unsigned char)s[0] < 128) return get_latin1_char((unsigned char)s[0]); - _PyUnicodeWriter_Init(&writer); - writer.min_length = size; - if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) + // Shortcut for simple case + PyObject *u = PyUnicode_New(size, 127); + if (u == NULL) { return NULL; + } + Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u)); + if (outpos == size) { + return u; + } - e = s + size; - data = writer.data; - outpos = ascii_decode(s, e, (Py_UCS1 *)data); + _PyUnicodeWriter writer; + _PyUnicodeWriter_InitWithBuffer(&writer, u); writer.pos = outpos; - if (writer.pos == size) - return _PyUnicodeWriter_Finish(&writer); - s += writer.pos; - kind = writer.kind; + s += outpos; + int kind = writer.kind; + void *data = writer.data; + Py_ssize_t startinpos, endinpos; + while (s < e) { unsigned char c = 
(unsigned char)*s; if (c < 128) { @@ -13506,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) assert(writer->kind <= PyUnicode_1BYTE_KIND); } +// Initialize _PyUnicodeWriter with initial buffer +static inline void +_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) +{ + memset(writer, 0, sizeof(*writer)); + writer->buffer = buffer; + _PyUnicodeWriter_Update(writer); + writer->min_length = writer->size; +} + int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar) |