diff options
-rw-r--r-- | Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst | 2
-rw-r--r-- | Modules/_json.c | 80
2 files changed, 25 insertions, 57 deletions
diff --git a/Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst b/Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst new file mode 100644 index 0000000..92bebee --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst @@ -0,0 +1,2 @@ +``_json.scanstring`` is now up to 3x faster when there are many backslash +escaped characters in the JSON string. diff --git a/Modules/_json.c b/Modules/_json.c index 112903e..54ac605 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -73,19 +73,6 @@ static PyMemberDef encoder_members[] = { {NULL} }; -static PyObject * -join_list_unicode(PyObject *lst) -{ - /* return u''.join(lst) */ - static PyObject *sep = NULL; - if (sep == NULL) { - sep = PyUnicode_FromStringAndSize("", 0); - if (sep == NULL) - return NULL; - } - return PyUnicode_Join(sep, lst); -} - /* Forward decls */ static PyObject * @@ -385,21 +372,6 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { return tpl; } -#define APPEND_OLD_CHUNK \ - if (chunk != NULL) { \ - if (chunks == NULL) { \ - chunks = PyList_New(0); \ - if (chunks == NULL) { \ - goto bail; \ - } \ - } \ - if (PyList_Append(chunks, chunk)) { \ - Py_CLEAR(chunk); \ - goto bail; \ - } \ - Py_CLEAR(chunk); \ - } - static PyObject * scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr) { @@ -417,12 +389,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next Py_ssize_t next /* = begin */; const void *buf; int kind; - PyObject *chunks = NULL; - PyObject *chunk = NULL; if (PyUnicode_READY(pystr) == -1) return 0; + _PyUnicodeWriter writer; + _PyUnicodeWriter_Init(&writer); + writer.overallocate = 1; + len = PyUnicode_GET_LENGTH(pystr); buf = PyUnicode_DATA(pystr); kind = PyUnicode_KIND(pystr); @@ -449,18 +423,26 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next } c = d; } - if (!(c == '"' || c == '\\')) { + + if (c == '"') { + // Fast path for 
simple case. + if (writer.buffer == NULL) { + PyObject *ret = PyUnicode_Substring(pystr, end, next); + if (ret == NULL) { + goto bail; + } + *next_end_ptr = next + 1;; + return ret; + } + } + else if (c != '\\') { raise_errmsg("Unterminated string starting at", pystr, begin); goto bail; } + /* Pick up this chunk if it's not zero length */ if (next != end) { - APPEND_OLD_CHUNK - chunk = PyUnicode_FromKindAndData( - kind, - (char*)buf + kind * end, - next - end); - if (chunk == NULL) { + if (_PyUnicodeWriter_WriteSubstring(&writer, pystr, end, next) < 0) { goto bail; } } @@ -551,34 +533,18 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next end -= 6; } } - APPEND_OLD_CHUNK - chunk = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &c, 1); - if (chunk == NULL) { + if (_PyUnicodeWriter_WriteChar(&writer, c) < 0) { goto bail; } } - if (chunks == NULL) { - if (chunk != NULL) - rval = chunk; - else - rval = PyUnicode_FromStringAndSize("", 0); - } - else { - APPEND_OLD_CHUNK - rval = join_list_unicode(chunks); - if (rval == NULL) { - goto bail; - } - Py_CLEAR(chunks); - } - + rval = _PyUnicodeWriter_Finish(&writer); *next_end_ptr = end; return rval; + bail: *next_end_ptr = -1; - Py_XDECREF(chunks); - Py_XDECREF(chunk); + _PyUnicodeWriter_Dealloc(&writer); return NULL; } |