diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2014-04-05 09:44:04 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2014-04-05 09:44:04 (GMT) |
commit | 89a76abf20889551ec1ed64dee1a4161a435db5b (patch) | |
tree | df9f731767053c66cd84bb64ee8dce9b77f938f6 /Objects | |
parent | 5a29f2584396fd96796a783564bfda3045f19729 (diff) | |
download | cpython-89a76abf20889551ec1ed64dee1a4161a435db5b.zip cpython-89a76abf20889551ec1ed64dee1a4161a435db5b.tar.gz cpython-89a76abf20889551ec1ed64dee1a4161a435db5b.tar.bz2 |
Issue #21118: Optimize str.translate() for ASCII => ASCII translation
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 121 |
1 files changed, 120 insertions, 1 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 91f6620..0386a87 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8545,6 +8545,116 @@ charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, return 1; } +static int +unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, + Py_UCS1 *translate) +{ + PyObject *item; + int ret = 0; + + item = NULL; + if (charmaptranslate_lookup(ch, mapping, &item)) { + return -1; + } + + if (item == Py_None) { + /* deletion: skip fast translate */ + goto exit; + } + + if (item == NULL) { + /* not found => default to 1:1 mapping */ + translate[ch] = ch; + return 1; + } + + if (PyLong_Check(item)) { + long replace = (Py_UCS4)PyLong_AS_LONG(item); + if (replace == -1) { + Py_DECREF(item); + return -1; + } + if (replace < 0 || 127 < replace) { + /* invalid character or character outside ASCII: + skip the fast translate */ + goto exit; + } + translate[ch] = (Py_UCS1)replace; + } + else if (PyUnicode_Check(item)) { + Py_UCS4 replace; + + if (PyUnicode_READY(item) == -1) { + Py_DECREF(item); + return -1; + } + if (PyUnicode_GET_LENGTH(item) != 1) + goto exit; + + replace = PyUnicode_READ_CHAR(item, 0); + if (replace > 127) + goto exit; + translate[ch] = (Py_UCS1)replace; + } + else { + /* not a long or unicode */ + goto exit; + } + Py_DECREF(item); + item = NULL; + ret = 1; + +exit: + Py_XDECREF(item); + return ret; +} + +/* Fast path for ascii => ascii translation. Return 1 if the whole string + was translated into writer, return 0 if the input string was partially + translated into writer, raise an exception and return -1 on error. */ +static int +unicode_fast_translate(PyObject *input, PyObject *mapping, + _PyUnicodeWriter *writer) +{ + Py_UCS1 translate[128], ch, ch2; + Py_ssize_t len; + Py_UCS1 *in, *end, *out; + int res; + + if (PyUnicode_READY(input) == -1) + return -1; + if (!PyUnicode_IS_ASCII(input)) + return 0; + len = PyUnicode_GET_LENGTH(input); + + memset(translate, 0xff, 128); + + in = PyUnicode_1BYTE_DATA(input); + end = in + len; + + assert(PyUnicode_IS_ASCII(writer->buffer)); + assert(PyUnicode_GET_LENGTH(writer->buffer) == len); + out = PyUnicode_1BYTE_DATA(writer->buffer); + + for (; in < end; in++, out++) { + ch = *in; + ch2 = translate[ch]; + if (ch2 == 0xff) { + res = unicode_fast_translate_lookup(mapping, ch, translate); + if (res < 0) + return -1; + if (res == 0) { + writer->pos = in - PyUnicode_1BYTE_DATA(input); + return 0; + } + ch2 = translate[ch]; + } + *out = ch2; + } + writer->pos = len; + return 1; +} + PyObject * _PyUnicode_TranslateCharmap(PyObject *input, PyObject *mapping, @@ -8561,6 +8671,7 @@ _PyUnicode_TranslateCharmap(PyObject *input, PyObject *errorHandler = NULL; PyObject *exc = NULL; int ignore; + int res; if (mapping == NULL) { PyErr_BadArgument(); @@ -8584,9 +8695,17 @@ _PyUnicode_TranslateCharmap(PyObject *input, if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError; + res = unicode_fast_translate(input, mapping, &writer); + if (res < 0) { + _PyUnicodeWriter_Dealloc(&writer); + return NULL; + } + if (res == 1) + return _PyUnicodeWriter_Finish(&writer); + ignore = (errors != NULL && strcmp(errors, "ignore") == 0); - i = 0; + i = writer.pos; while (i<size) { /* try to encode it */ int translate; |