From 1194ea020cc370225f8dfb198a32dd6fe239cab2 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 4 Apr 2014 19:37:40 +0200 Subject: Issue #21118: Use _PyUnicodeWriter API in str.translate() to simplify and factorize the code --- Objects/unicodeobject.c | 266 +++++++++++++++++------------------------------- 1 file changed, 95 insertions(+), 171 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e38ded0..067a945 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8495,76 +8495,54 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) return -1; } } -/* ensure that *outobj is at least requiredsize characters long, - if not reallocate and adjust various state variables. - Return 0 on success, -1 on error */ -static int -charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, - Py_ssize_t requiredsize) -{ - Py_ssize_t oldsize = *psize; - Py_UCS4 *new_outobj; - if (requiredsize > oldsize) { - /* exponentially overallocate to minimize reallocations */ - if (requiredsize < 2 * oldsize) - requiredsize = 2 * oldsize; - new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); - if (new_outobj == 0) - return -1; - *outobj = new_outobj; - *psize = requiredsize; - } - return 0; -} -/* lookup the character, put the result in the output string and adjust - various state variables. Return a new reference to the object that - was put in the output buffer in *result, or Py_None, if the mapping was - undefined (in which case no character was written). - The called must decref result. - Return 0 on success, -1 on error. */ + +/* lookup the character, write the result into the writer. + Return 1 if the result was written into the writer, return 0 if the mapping + was undefined, raise an exception return -1 on error. */ static int -charmaptranslate_output(PyObject *input, Py_ssize_t ipos, - PyObject *mapping, Py_UCS4 **output, - Py_ssize_t *osize, Py_ssize_t *opos, - PyObject **res) +charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, + _PyUnicodeWriter *writer) { - Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); - if (charmaptranslate_lookup(curinp, mapping, res)) + PyObject *item; + + if (charmaptranslate_lookup(ch, mapping, &item)) return -1; - if (*res==NULL) { + + if (item == NULL) { /* not found => default to 1:1 mapping */ - (*output)[(*opos)++] = curinp; + if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { + return -1; + } + return 1; } - else if (*res==Py_None) - ; - else if (PyLong_Check(*res)) { - /* no overflow check, because we know that the space is enough */ - (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); + + if (item == Py_None) { + Py_DECREF(item); + return 0; } - else if (PyUnicode_Check(*res)) { - Py_ssize_t repsize; - if (PyUnicode_READY(*res) == -1) + + if (PyLong_Check(item)) { + Py_UCS4 ch = (Py_UCS4)PyLong_AS_LONG(item); + if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { + Py_DECREF(item); return -1; - repsize = PyUnicode_GET_LENGTH(*res); - if (repsize==1) { - /* no overflow check, because we know that the space is enough */ - (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); - } - else if (repsize!=0) { - /* more than one character */ - Py_ssize_t requiredsize = *opos + - (PyUnicode_GET_LENGTH(input) - ipos) + - repsize - 1; - Py_ssize_t i; - if (charmaptranslate_makespace(output, osize, requiredsize)) - return -1; - for(i = 0; i < repsize; i++) - (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); } + Py_DECREF(item); + return 1; } - else + + if (!PyUnicode_Check(item)) { + Py_DECREF(item); return -1; - return 0; + } + + if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { + Py_DECREF(item); + return -1; + } + + Py_DECREF(item); + return 1; } PyObject * @@ -8573,22 +8551,16 @@ _PyUnicode_TranslateCharmap(PyObject *input, const char *errors) { /* input object */ - char *idata; + char *data; Py_ssize_t size, i; int kind; /* output buffer */ - Py_UCS4 *output = NULL; - Py_ssize_t osize; - PyObject *res; - /* current output position */ - Py_ssize_t opos; + _PyUnicodeWriter writer; + /* error handler */ char *reason = "character maps to "; PyObject *errorHandler = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, - * 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + int ignore; if (mapping == NULL) { PyErr_BadArgument(); @@ -8597,10 +8569,9 @@ _PyUnicode_TranslateCharmap(PyObject *input, if (PyUnicode_READY(input) == -1) return NULL; - idata = (char*)PyUnicode_DATA(input); + data = (char*)PyUnicode_DATA(input); kind = PyUnicode_KIND(input); size = PyUnicode_GET_LENGTH(input); - i = 0; if (size == 0) { Py_INCREF(input); @@ -8609,121 +8580,74 @@ _PyUnicode_TranslateCharmap(PyObject *input, /* allocate enough for a simple 1:1 translation without replacements, if we need more, we'll resize */ - osize = size; - output = PyMem_Malloc(osize * sizeof(Py_UCS4)); - opos = 0; - if (output == NULL) { - PyErr_NoMemory(); + _PyUnicodeWriter_Init(&writer); + if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError; - } + ignore = (errors != NULL && strcmp(errors, "ignore") == 0); + + i = 0; while (i adjust input pointer */ + + if (translate != 0) { + /* it worked => adjust input pointer */ ++i; - else { /* untranslatable character */ - PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ - Py_ssize_t repsize; - Py_ssize_t newpos; - Py_ssize_t uni2; - /* startpos for collecting untranslatable chars */ - Py_ssize_t collstart = i; - Py_ssize_t collend = i+1; - Py_ssize_t coll; - - /* find all untranslatable characters */ - while (collend < size) { - if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) - goto onError; - Py_XDECREF(x); - if (x!=Py_None) - break; - ++collend; - } - /* cache callback name lookup - * (if not done yet, i.e. it's the first error) */ - if (known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - known_errorHandler = 4; - else - known_errorHandler = 0; - } - switch (known_errorHandler) { - case 1: /* strict */ - make_translate_exception(&exc, - input, collstart, collend, reason); - if (exc != NULL) - PyCodec_StrictErrors(exc); + continue; + } + + /* untranslatable character */ + collstart = i; + collend = i+1; + + /* find all untranslatable characters */ + while (collend < size) { + PyObject *x; + ch = PyUnicode_READ(kind, data, collend); + if (charmaptranslate_lookup(ch, mapping, &x)) goto onError; - case 2: /* replace */ - /* No need to check for space, this is a 1:1 replacement */ - for (coll = collstart; coll0; ++uni2) - output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); - i = newpos; + ++collend; + } + + if (ignore) { + i = collend; + } + else { + repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, + reason, input, &exc, + collstart, collend, &newpos); + if (repunicode == NULL) + goto onError; + if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { Py_DECREF(repunicode); + goto onError; } + Py_DECREF(repunicode); + i = newpos; } } - res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); - if (!res) - goto onError; - PyMem_Free(output); Py_XDECREF(exc); Py_XDECREF(errorHandler); - return res; + return _PyUnicodeWriter_Finish(&writer); onError: - PyMem_Free(output); + _PyUnicodeWriter_Dealloc(&writer); Py_XDECREF(exc); Py_XDECREF(errorHandler); return NULL; -- cgit v0.12