bpo-37587: json: Use _PyUnicodeWriter when scanning string. (GH-15591)

author: Inada Naoki <songofacandy@gmail.com> 2019-10-17 07:12:41 (GMT)
committer: GitHub <noreply@github.com> 2019-10-17 07:12:41 (GMT)
commit: 9c11029bb41caab5576f354fbf808a5e91325bb0 (patch)
tree: 882cf98dd7a6f04626700384e236424ab120b0b5
parent: a661392f8fb5ac4fc095aa1845d1eb7a25c4e9be (diff)
download: cpython-9c11029bb41caab5576f354fbf808a5e91325bb0.zip cpython-9c11029bb41caab5576f354fbf808a5e91325bb0.tar.gz cpython-9c11029bb41caab5576f354fbf808a5e91325bb0.tar.bz2
2 files changed, 25 insertions, 57 deletions
diff --git a/Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst b/Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst
new file mode 100644
index 0000000..92bebee
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst
@@ -0,0 +1,2 @@
+``_json.scanstring`` is now up to 3x faster when there are many backslash
+escaped characters in the JSON string.
diff --git a/Modules/_json.c b/Modules/_json.c
index 112903e..54ac605 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -73,19 +73,6 @@ static PyMemberDef encoder_members[] = {
     {NULL}
 };
 
-static PyObject *
-join_list_unicode(PyObject *lst)
-{
-    /* return u''.join(lst) */
-    static PyObject *sep = NULL;
-    if (sep == NULL) {
-        sep = PyUnicode_FromStringAndSize("", 0);
-        if (sep == NULL)
-            return NULL;
-    }
-    return PyUnicode_Join(sep, lst);
-}
-
 /* Forward decls */
 
 static PyObject *
@@ -385,21 +372,6 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
     return tpl;
 }
 
-#define APPEND_OLD_CHUNK \
-    if (chunk != NULL) { \
-        if (chunks == NULL) { \
-            chunks = PyList_New(0); \
-            if (chunks == NULL) { \
-                goto bail; \
-            } \
-        } \
-        if (PyList_Append(chunks, chunk)) { \
-            Py_CLEAR(chunk); \
-            goto bail; \
-        } \
-        Py_CLEAR(chunk); \
-    }
-
 static PyObject *
 scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
 {
@@ -417,12 +389,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
     Py_ssize_t next /* = begin */;
     const void *buf;
     int kind;
-    PyObject *chunks = NULL;
-    PyObject *chunk = NULL;
 
     if (PyUnicode_READY(pystr) == -1)
         return 0;
 
+    _PyUnicodeWriter writer;
+    _PyUnicodeWriter_Init(&writer);
+    writer.overallocate = 1;
+
     len = PyUnicode_GET_LENGTH(pystr);
     buf = PyUnicode_DATA(pystr);
     kind = PyUnicode_KIND(pystr);
@@ -449,18 +423,26 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
             }
             c = d;
         }
-        if (!(c == '"' || c == '\\')) {
+
+        if (c == '"') {
+            // Fast path for simple case.
+            if (writer.buffer == NULL) {
+                PyObject *ret = PyUnicode_Substring(pystr, end, next);
+                if (ret == NULL) {
+                    goto bail;
+                }
+                *next_end_ptr = next + 1;;
+                return ret;
+            }
+        }
+        else if (c != '\\') {
             raise_errmsg("Unterminated string starting at", pystr, begin);
             goto bail;
         }
+
         /* Pick up this chunk if it's not zero length */
         if (next != end) {
-            APPEND_OLD_CHUNK
-                chunk = PyUnicode_FromKindAndData(
-                    kind,
-                    (char*)buf + kind * end,
-                    next - end);
-            if (chunk == NULL) {
+            if (_PyUnicodeWriter_WriteSubstring(&writer, pystr, end, next) < 0) {
                 goto bail;
             }
         }
@@ -551,34 +533,18 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
                     end -= 6;
             }
         }
-        APPEND_OLD_CHUNK
-        chunk = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &c, 1);
-        if (chunk == NULL) {
+        if (_PyUnicodeWriter_WriteChar(&writer, c) < 0) {
             goto bail;
         }
     }
 
-    if (chunks == NULL) {
-        if (chunk != NULL)
-            rval = chunk;
-        else
-            rval = PyUnicode_FromStringAndSize("", 0);
-    }
-    else {
-        APPEND_OLD_CHUNK
-        rval = join_list_unicode(chunks);
-        if (rval == NULL) {
-            goto bail;
-        }
-        Py_CLEAR(chunks);
-    }
-
+    rval = _PyUnicodeWriter_Finish(&writer);
     *next_end_ptr = end;
     return rval;
+
 bail:
     *next_end_ptr = -1;
-    Py_XDECREF(chunks);
-    Py_XDECREF(chunk);
+    _PyUnicodeWriter_Dealloc(&writer);
     return NULL;
 }
author	Inada Naoki <songofacandy@gmail.com>	2019-10-17 07:12:41 (GMT)
committer	GitHub <noreply@github.com>	2019-10-17 07:12:41 (GMT)
commit	9c11029bb41caab5576f354fbf808a5e91325bb0 (patch)
tree	882cf98dd7a6f04626700384e236424ab120b0b5
parent	a661392f8fb5ac4fc095aa1845d1eb7a25c4e9be (diff)
download	cpython-9c11029bb41caab5576f354fbf808a5e91325bb0.zip cpython-9c11029bb41caab5576f354fbf808a5e91325bb0.tar.gz cpython-9c11029bb41caab5576f354fbf808a5e91325bb0.tar.bz2