summaryrefslogtreecommitdiffstats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2015-09-21 21:06:27 (GMT)
committerVictor Stinner <victor.stinner@gmail.com>2015-09-21 21:06:27 (GMT)
commitf96418de05fc8710f9dc1e3a19878f8e744956f2 (patch)
tree6882ce050257fcb75357680e20d92d58f4e9acc9 /Objects/unicodeobject.c
parentba4529593877f2016215149912496e030782a57e (diff)
downloadcpython-f96418de05fc8710f9dc1e3a19878f8e744956f2.zip
cpython-f96418de05fc8710f9dc1e3a19878f8e744956f2.tar.gz
cpython-f96418de05fc8710f9dc1e3a19878f8e744956f2.tar.bz2
Issue #24870: Optimize the ASCII decoder for error handlers: surrogateescape,
ignore and replace. Initial patch written by Naoki Inada. The decoder is now up to 60 times as fast for these error handlers. Add also unit tests for the ASCII decoder.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c66
1 files changed, 61 insertions, 5 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0709789..b8840e1 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6644,6 +6644,28 @@ PyUnicode_AsLatin1String(PyObject *unicode)
/* --- 7-bit ASCII Codec -------------------------------------------------- */
+typedef enum {
+ _Py_ERROR_UNKNOWN=0,
+ _Py_ERROR_SURROGATEESCAPE,
+ _Py_ERROR_REPLACE,
+ _Py_ERROR_IGNORE,
+ _Py_ERROR_OTHER
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+ if (errors == NULL)
+ return _Py_ERROR_OTHER;
+ if (strcmp(errors, "surrogateescape") == 0)
+ return _Py_ERROR_SURROGATEESCAPE;
+ if (strcmp(errors, "ignore") == 0)
+ return _Py_ERROR_IGNORE;
+ if (strcmp(errors, "replace") == 0)
+ return _Py_ERROR_REPLACE;
+ return _Py_ERROR_OTHER;
+}
+
PyObject *
PyUnicode_DecodeASCII(const char *s,
Py_ssize_t size,
@@ -6657,8 +6679,9 @@ PyUnicode_DecodeASCII(const char *s,
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
@@ -6687,12 +6710,45 @@ PyUnicode_DecodeASCII(const char *s,
PyUnicode_WRITE(kind, data, writer.pos, c);
writer.pos++;
++s;
+ continue;
}
- else {
+
+ /* byte outsize range 0x00..0x7f: call the error handler */
+
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler)
+ {
+ case _Py_ERROR_REPLACE:
+ case _Py_ERROR_SURROGATEESCAPE:
+ /* Fast-path: the error handler only writes one character,
+ but we must switch to UCS2 at the first write */
+ if (kind < PyUnicode_2BYTE_KIND) {
+ if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos,
+ 0xffff) < 0)
+ return NULL;
+ kind = writer.kind;
+ data = writer.data;
+ }
+
+ if (error_handler == _Py_ERROR_REPLACE)
+ PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+ else
+ PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+ writer.pos++;
+ ++s;
+ break;
+
+ case _Py_ERROR_IGNORE:
+ ++s;
+ break;
+
+ default:
startinpos = s-starts;
endinpos = startinpos + 1;
if (unicode_decode_call_errorhandler_writer(
- errors, &errorHandler,
+ errors, &error_handler_obj,
"ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s,
&writer))
@@ -6701,13 +6757,13 @@ PyUnicode_DecodeASCII(const char *s,
data = writer.data;
}
}
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
onError:
_PyUnicodeWriter_Dealloc(&writer);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return NULL;
}