summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2015-10-09 11:10:05 (GMT)
committerVictor Stinner <victor.stinner@gmail.com>2015-10-09 11:10:05 (GMT)
commit6bd525b656f75c9752d39d9c4be1e1b29fa67cdb (patch)
tree645853491c0ae3addc1f578dfe0b5345b3cd7b0f /Objects
parentce179bf6baed91ba84cc3ff647e96287c3b8e2f2 (diff)
downloadcpython-6bd525b656f75c9752d39d9c4be1e1b29fa67cdb.zip
cpython-6bd525b656f75c9752d39d9c4be1e1b29fa67cdb.tar.gz
cpython-6bd525b656f75c9752d39d9c4be1e1b29fa67cdb.tar.bz2
Optimize error handlers of ASCII and Latin1 encoders when the replacement
string is pure ASCII: use _PyBytesWriter_WriteBytes() and don't check individual characters. Clean up unicode_encode_ucs1(): * Rename repunicode to rep * Clear the rep object on error * Factor out the code shared between the bytes and unicode paths
Diffstat (limited to 'Objects')
-rw-r--r--Objects/stringlib/codecs.h18
-rw-r--r--Objects/unicodeobject.c72
2 files changed, 47 insertions, 43 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 7e8d928..2beb604 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -311,7 +311,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#if STRINGLIB_SIZEOF_CHAR > 1
else if (Py_UNICODE_IS_SURROGATE(ch)) {
Py_ssize_t startpos, endpos, newpos;
- Py_ssize_t repsize, k;
+ Py_ssize_t k;
if (error_handler == _Py_ERROR_UNKNOWN)
error_handler = get_error_handler(errors);
@@ -392,20 +392,12 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
p = _PyBytesWriter_WriteBytes(&writer, p,
PyBytes_AS_STRING(rep),
PyBytes_GET_SIZE(rep));
- if (p == NULL)
- goto error;
}
else {
/* rep is unicode */
if (PyUnicode_READY(rep) < 0)
goto error;
- repsize = PyUnicode_GET_LENGTH(rep);
-
- p = _PyBytesWriter_Prepare(&writer, p, repsize);
- if (p == NULL)
- goto error;
-
if (!PyUnicode_IS_ASCII(rep)) {
raise_encode_exception(&exc, "utf-8",
unicode,
@@ -415,9 +407,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
}
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
- memcpy(p, PyUnicode_DATA(rep), repsize);
- p += repsize;
+ p = _PyBytesWriter_WriteBytes(&writer, p,
+ PyUnicode_DATA(rep),
+ PyUnicode_GET_LENGTH(rep));
}
+
+ if (p == NULL)
+ goto error;
Py_CLEAR(rep);
i = newpos;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 23b8cc7..35df747 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6599,6 +6599,7 @@ unicode_encode_ucs1(PyObject *unicode,
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
+ PyObject *rep = NULL;
/* output object */
_PyBytesWriter writer;
@@ -6627,8 +6628,7 @@ unicode_encode_ucs1(PyObject *unicode,
++pos;
}
else {
- PyObject *repunicode;
- Py_ssize_t repsize, newpos, i;
+ Py_ssize_t newpos, i;
/* startpos for collecting unencodable chars */
Py_ssize_t collstart = pos;
Py_ssize_t collend = collstart + 1;
@@ -6694,52 +6694,59 @@ unicode_encode_ucs1(PyObject *unicode,
/* fallback to general error handling */
default:
- repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
- encoding, reason, unicode, &exc,
- collstart, collend, &newpos);
- if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
- PyUnicode_READY(repunicode) == -1))
+ rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
+ encoding, reason, unicode, &exc,
+ collstart, collend, &newpos);
+ if (rep == NULL)
goto onError;
/* substract preallocated bytes */
writer.min_size -= 1;
- if (PyBytes_Check(repunicode)) {
+ if (PyBytes_Check(rep)) {
/* Directly copy bytes result to output. */
str = _PyBytesWriter_WriteBytes(&writer, str,
- PyBytes_AS_STRING(repunicode),
- PyBytes_GET_SIZE(repunicode));
+ PyBytes_AS_STRING(rep),
+ PyBytes_GET_SIZE(rep));
if (str == NULL)
goto onError;
-
- pos = newpos;
- Py_DECREF(repunicode);
- break;
}
+ else {
+ assert(PyUnicode_Check(rep));
- /* need more space? (at least enough for what we
- have+the replacement+the rest of the string, so
- we won't have to check space for encodable characters) */
- repsize = PyUnicode_GET_LENGTH(repunicode);
+ if (PyUnicode_READY(rep) < 0)
+ goto onError;
- str = _PyBytesWriter_Prepare(&writer, str, repsize);
- if (str == NULL)
- goto onError;
+ if (PyUnicode_IS_ASCII(rep)) {
+ /* Fast path: all characters are smaller than limit */
+ assert(limit >= 128);
+ assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+ str = _PyBytesWriter_WriteBytes(&writer, str,
+ PyUnicode_DATA(rep),
+ PyUnicode_GET_LENGTH(rep));
+ }
+ else {
+ Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
- /* check if there is anything unencodable in the replacement
- and copy it to the output */
- for (i = 0; repsize-->0; ++i, ++str) {
- ch = PyUnicode_READ_CHAR(repunicode, i);
- if (ch >= limit) {
- raise_encode_exception(&exc, encoding, unicode,
- pos, pos+1, reason);
- Py_DECREF(repunicode);
- goto onError;
+ str = _PyBytesWriter_Prepare(&writer, str, repsize);
+ if (str == NULL)
+ goto onError;
+
+ /* check if there is anything unencodable in the
+ replacement and copy it to the output */
+ for (i = 0; repsize-->0; ++i, ++str) {
+ ch = PyUnicode_READ_CHAR(rep, i);
+ if (ch >= limit) {
+ raise_encode_exception(&exc, encoding, unicode,
+ pos, pos+1, reason);
+ goto onError;
+ }
+ *str = (char)ch;
+ }
}
- *str = (char)ch;
}
pos = newpos;
- Py_DECREF(repunicode);
+ Py_CLEAR(rep);
}
/* If overallocation was disabled, ensure that it was the last
@@ -6753,6 +6760,7 @@ unicode_encode_ucs1(PyObject *unicode,
return _PyBytesWriter_Finish(&writer, str);
onError:
+ Py_XDECREF(rep);
_PyBytesWriter_Dealloc(&writer);
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);