summaryrefslogtreecommitdiffstats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
authorInada Naoki <songofacandy@gmail.com>2019-06-24 03:30:24 (GMT)
committerGitHub <noreply@github.com>2019-06-24 03:30:24 (GMT)
commit770847a7db33b3d4c451b42372b6942687aa6121 (patch)
tree04aaf3163636bed947763435ad03a76b6f211d7b /Objects/unicodeobject.c
parentb3ca7972c8d8c6479b6542ce28e0f7a6ebd5b8fe (diff)
downloadcpython-770847a7db33b3d4c451b42372b6942687aa6121.zip
cpython-770847a7db33b3d4c451b42372b6942687aa6121.tar.gz
cpython-770847a7db33b3d4c451b42372b6942687aa6121.tar.bz2
bpo-37348: optimize decoding ASCII string (GH-14283)
`_PyUnicode_Writer` is a relatively complex structure. Initializing it is significant overhead when decoding short ASCII string.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c85
1 files changed, 51 insertions, 34 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 4f83625..625be4b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
/* Forward declaration */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
+static inline void
+_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
const char *errors);
@@ -4877,16 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
- _PyUnicodeWriter writer;
- const char *starts = s;
- const char *end = s + size;
-
- Py_ssize_t startinpos;
- Py_ssize_t endinpos;
- const char *errmsg = "";
- PyObject *error_handler_obj = NULL;
- PyObject *exc = NULL;
-
if (size == 0) {
if (consumed)
*consumed = 0;
@@ -4900,13 +4892,29 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
return get_latin1_char((unsigned char)s[0]);
}
- _PyUnicodeWriter_Init(&writer);
- writer.min_length = size;
- if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
- goto onError;
+ const char *starts = s;
+ const char *end = s + size;
+
+ // fast path: try ASCII string.
+ PyObject *u = PyUnicode_New(size, 127);
+ if (u == NULL) {
+ return NULL;
+ }
+ s += ascii_decode(s, end, PyUnicode_DATA(u));
+ if (s == end) {
+ return u;
+ }
+
+ // Use _PyUnicodeWriter after fast path is failed.
+ _PyUnicodeWriter writer;
+ _PyUnicodeWriter_InitWithBuffer(&writer, u);
+ writer.pos = s - starts;
+
+ Py_ssize_t startinpos, endinpos;
+ const char *errmsg = "";
+ PyObject *error_handler_obj = NULL;
+ PyObject *exc = NULL;
- writer.pos = ascii_decode(s, end, writer.data);
- s += writer.pos;
while (s < end) {
Py_UCS4 ch;
int kind = writer.kind;
@@ -6451,7 +6459,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
length after conversion to the true value. (But decoding error
handler might have to resize the string) */
_PyUnicodeWriter_Init(&writer);
- writer.min_length = size;
+ writer.min_length = size;
if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
goto onError;
}
@@ -6975,13 +6983,7 @@ PyUnicode_DecodeASCII(const char *s,
const char *errors)
{
const char *starts = s;
- _PyUnicodeWriter writer;
- int kind;
- void *data;
- Py_ssize_t startinpos;
- Py_ssize_t endinpos;
- Py_ssize_t outpos;
- const char *e;
+ const char *e = s + size;
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
@@ -6993,20 +6995,25 @@ PyUnicode_DecodeASCII(const char *s,
if (size == 1 && (unsigned char)s[0] < 128)
return get_latin1_char((unsigned char)s[0]);
- _PyUnicodeWriter_Init(&writer);
- writer.min_length = size;
- if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
+ // Shortcut for simple case
+ PyObject *u = PyUnicode_New(size, 127);
+ if (u == NULL) {
return NULL;
+ }
+ Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
+ if (outpos == size) {
+ return u;
+ }
- e = s + size;
- data = writer.data;
- outpos = ascii_decode(s, e, (Py_UCS1 *)data);
+ _PyUnicodeWriter writer;
+ _PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = outpos;
- if (writer.pos == size)
- return _PyUnicodeWriter_Finish(&writer);
- s += writer.pos;
- kind = writer.kind;
+ s += outpos;
+ int kind = writer.kind;
+ void *data = writer.data;
+ Py_ssize_t startinpos, endinpos;
+
while (s < e) {
unsigned char c = (unsigned char)*s;
if (c < 128) {
@@ -13506,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
assert(writer->kind <= PyUnicode_1BYTE_KIND);
}
+// Initialize _PyUnicodeWriter with initial buffer
+static inline void
+_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
+{
+ memset(writer, 0, sizeof(*writer));
+ writer->buffer = buffer;
+ _PyUnicodeWriter_Update(writer);
+ writer->min_length = writer->size;
+}
+
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
Py_ssize_t length, Py_UCS4 maxchar)