summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Inada Naoki <songofacandy@gmail.com> 2019-10-17 07:12:41 (GMT)
committer: GitHub <noreply@github.com> 2019-10-17 07:12:41 (GMT)
commit: 9c11029bb41caab5576f354fbf808a5e91325bb0 (patch)
tree: 882cf98dd7a6f04626700384e236424ab120b0b5
parent: a661392f8fb5ac4fc095aa1845d1eb7a25c4e9be (diff)
download: cpython-9c11029bb41caab5576f354fbf808a5e91325bb0.zip
          cpython-9c11029bb41caab5576f354fbf808a5e91325bb0.tar.gz
          cpython-9c11029bb41caab5576f354fbf808a5e91325bb0.tar.bz2
bpo-37587: json: Use _PyUnicodeWriter when scanning string. (GH-15591)
-rw-r--r-- Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst | 2
-rw-r--r-- Modules/_json.c | 80
2 files changed, 25 insertions, 57 deletions
diff --git a/Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst b/Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst
new file mode 100644
index 0000000..92bebee
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-08-29-18-48-48.bpo-37587.N7TGTC.rst
@@ -0,0 +1,2 @@
+``_json.scanstring`` is now up to 3x faster when there are many backslash
+escaped characters in the JSON string.
diff --git a/Modules/_json.c b/Modules/_json.c
index 112903e..54ac605 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -73,19 +73,6 @@ static PyMemberDef encoder_members[] = {
{NULL}
};
-static PyObject *
-join_list_unicode(PyObject *lst)
-{
- /* return u''.join(lst) */
- static PyObject *sep = NULL;
- if (sep == NULL) {
- sep = PyUnicode_FromStringAndSize("", 0);
- if (sep == NULL)
- return NULL;
- }
- return PyUnicode_Join(sep, lst);
-}
-
/* Forward decls */
static PyObject *
@@ -385,21 +372,6 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
return tpl;
}
-#define APPEND_OLD_CHUNK \
- if (chunk != NULL) { \
- if (chunks == NULL) { \
- chunks = PyList_New(0); \
- if (chunks == NULL) { \
- goto bail; \
- } \
- } \
- if (PyList_Append(chunks, chunk)) { \
- Py_CLEAR(chunk); \
- goto bail; \
- } \
- Py_CLEAR(chunk); \
- }
-
static PyObject *
scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
{
@@ -417,12 +389,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
Py_ssize_t next /* = begin */;
const void *buf;
int kind;
- PyObject *chunks = NULL;
- PyObject *chunk = NULL;
if (PyUnicode_READY(pystr) == -1)
return 0;
+ _PyUnicodeWriter writer;
+ _PyUnicodeWriter_Init(&writer);
+ writer.overallocate = 1;
+
len = PyUnicode_GET_LENGTH(pystr);
buf = PyUnicode_DATA(pystr);
kind = PyUnicode_KIND(pystr);
@@ -449,18 +423,26 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
}
c = d;
}
- if (!(c == '"' || c == '\\')) {
+
+ if (c == '"') {
+ // Fast path for simple case.
+ if (writer.buffer == NULL) {
+ PyObject *ret = PyUnicode_Substring(pystr, end, next);
+ if (ret == NULL) {
+ goto bail;
+ }
+ *next_end_ptr = next + 1;;
+ return ret;
+ }
+ }
+ else if (c != '\\') {
raise_errmsg("Unterminated string starting at", pystr, begin);
goto bail;
}
+
/* Pick up this chunk if it's not zero length */
if (next != end) {
- APPEND_OLD_CHUNK
- chunk = PyUnicode_FromKindAndData(
- kind,
- (char*)buf + kind * end,
- next - end);
- if (chunk == NULL) {
+ if (_PyUnicodeWriter_WriteSubstring(&writer, pystr, end, next) < 0) {
goto bail;
}
}
@@ -551,34 +533,18 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
end -= 6;
}
}
- APPEND_OLD_CHUNK
- chunk = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &c, 1);
- if (chunk == NULL) {
+ if (_PyUnicodeWriter_WriteChar(&writer, c) < 0) {
goto bail;
}
}
- if (chunks == NULL) {
- if (chunk != NULL)
- rval = chunk;
- else
- rval = PyUnicode_FromStringAndSize("", 0);
- }
- else {
- APPEND_OLD_CHUNK
- rval = join_list_unicode(chunks);
- if (rval == NULL) {
- goto bail;
- }
- Py_CLEAR(chunks);
- }
-
+ rval = _PyUnicodeWriter_Finish(&writer);
*next_end_ptr = end;
return rval;
+
bail:
*next_end_ptr = -1;
- Py_XDECREF(chunks);
- Py_XDECREF(chunk);
+ _PyUnicodeWriter_Dealloc(&writer);
return NULL;
}