diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2014-04-04 17:37:40 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2014-04-04 17:37:40 (GMT) |
commit | 1194ea020cc370225f8dfb198a32dd6fe239cab2 (patch) | |
tree | 0b8e2cfa7da4a54fc6bbfac89ad10282fea64de0 | |
parent | d129eeb3031128969e8aea377bf068e72adf1433 (diff) | |
download | cpython-1194ea020cc370225f8dfb198a32dd6fe239cab2.zip cpython-1194ea020cc370225f8dfb198a32dd6fe239cab2.tar.gz cpython-1194ea020cc370225f8dfb198a32dd6fe239cab2.tar.bz2 |
Issue #21118: Use _PyUnicodeWriter API in str.translate() to simplify and
factorize the code
-rw-r--r-- | Objects/unicodeobject.c | 266 |
1 files changed, 95 insertions, 171 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e38ded0..067a945 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8495,76 +8495,54 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) return -1; } } -/* ensure that *outobj is at least requiredsize characters long, - if not reallocate and adjust various state variables. - Return 0 on success, -1 on error */ -static int -charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, - Py_ssize_t requiredsize) -{ - Py_ssize_t oldsize = *psize; - Py_UCS4 *new_outobj; - if (requiredsize > oldsize) { - /* exponentially overallocate to minimize reallocations */ - if (requiredsize < 2 * oldsize) - requiredsize = 2 * oldsize; - new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); - if (new_outobj == 0) - return -1; - *outobj = new_outobj; - *psize = requiredsize; - } - return 0; -} -/* lookup the character, put the result in the output string and adjust - various state variables. Return a new reference to the object that - was put in the output buffer in *result, or Py_None, if the mapping was - undefined (in which case no character was written). - The called must decref result. - Return 0 on success, -1 on error. */ + +/* lookup the character, write the result into the writer. + Return 1 if the result was written into the writer, return 0 if the mapping + was undefined, raise an exception return -1 on error. */ static int -charmaptranslate_output(PyObject *input, Py_ssize_t ipos, - PyObject *mapping, Py_UCS4 **output, - Py_ssize_t *osize, Py_ssize_t *opos, - PyObject **res) +charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, + _PyUnicodeWriter *writer) { - Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); - if (charmaptranslate_lookup(curinp, mapping, res)) + PyObject *item; + + if (charmaptranslate_lookup(ch, mapping, &item)) return -1; - if (*res==NULL) { + + if (item == NULL) { /* not found => default to 1:1 mapping */ - (*output)[(*opos)++] = curinp; + if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { + return -1; + } + return 1; } - else if (*res==Py_None) - ; - else if (PyLong_Check(*res)) { - /* no overflow check, because we know that the space is enough */ - (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); + + if (item == Py_None) { + Py_DECREF(item); + return 0; } - else if (PyUnicode_Check(*res)) { - Py_ssize_t repsize; - if (PyUnicode_READY(*res) == -1) + + if (PyLong_Check(item)) { + Py_UCS4 ch = (Py_UCS4)PyLong_AS_LONG(item); + if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { + Py_DECREF(item); return -1; - repsize = PyUnicode_GET_LENGTH(*res); - if (repsize==1) { - /* no overflow check, because we know that the space is enough */ - (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); - } - else if (repsize!=0) { - /* more than one character */ - Py_ssize_t requiredsize = *opos + - (PyUnicode_GET_LENGTH(input) - ipos) + - repsize - 1; - Py_ssize_t i; - if (charmaptranslate_makespace(output, osize, requiredsize)) - return -1; - for(i = 0; i < repsize; i++) - (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); } + Py_DECREF(item); + return 1; } - else + + if (!PyUnicode_Check(item)) { + Py_DECREF(item); return -1; - return 0; + } + + if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { + Py_DECREF(item); + return -1; + } + + Py_DECREF(item); + return 1; } PyObject * @@ -8573,22 +8551,16 @@ _PyUnicode_TranslateCharmap(PyObject *input, const char *errors) { /* input object */ - char *idata; + char *data; Py_ssize_t size, i; int kind; /* output buffer */ - Py_UCS4 *output = NULL; - Py_ssize_t osize; - PyObject *res; - /* current output position */ - Py_ssize_t opos; + _PyUnicodeWriter writer; + /* error handler */ char *reason = "character maps to <undefined>"; PyObject *errorHandler = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, - * 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + int ignore; if (mapping == NULL) { PyErr_BadArgument(); @@ -8597,10 +8569,9 @@ _PyUnicode_TranslateCharmap(PyObject *input, if (PyUnicode_READY(input) == -1) return NULL; - idata = (char*)PyUnicode_DATA(input); + data = (char*)PyUnicode_DATA(input); kind = PyUnicode_KIND(input); size = PyUnicode_GET_LENGTH(input); - i = 0; if (size == 0) { Py_INCREF(input); @@ -8609,121 +8580,74 @@ _PyUnicode_TranslateCharmap(PyObject *input, /* allocate enough for a simple 1:1 translation without replacements, if we need more, we'll resize */ - osize = size; - output = PyMem_Malloc(osize * sizeof(Py_UCS4)); - opos = 0; - if (output == NULL) { - PyErr_NoMemory(); + _PyUnicodeWriter_Init(&writer); + if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError; - } + ignore = (errors != NULL && strcmp(errors, "ignore") == 0); + + i = 0; while (i<size) { /* try to encode it */ - PyObject *x = NULL; - if (charmaptranslate_output(input, i, mapping, - &output, &osize, &opos, &x)) { - Py_XDECREF(x); + int translate; + PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ + Py_ssize_t newpos; + /* startpos for collecting untranslatable chars */ + Py_ssize_t collstart; + Py_ssize_t collend; + Py_ssize_t coll; + Py_UCS4 ch; + + ch = PyUnicode_READ(kind, data, i); + translate = charmaptranslate_output(ch, mapping, &writer); + if (translate < 0) goto onError; - } - Py_XDECREF(x); - if (x!=Py_None) /* it worked => adjust input pointer */ + + if (translate != 0) { + /* it worked => adjust input pointer */ ++i; - else { /* untranslatable character */ - PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ - Py_ssize_t repsize; - Py_ssize_t newpos; - Py_ssize_t uni2; - /* startpos for collecting untranslatable chars */ - Py_ssize_t collstart = i; - Py_ssize_t collend = i+1; - Py_ssize_t coll; - - /* find all untranslatable characters */ - while (collend < size) { - if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) - goto onError; - Py_XDECREF(x); - if (x!=Py_None) - break; - ++collend; - } - /* cache callback name lookup - * (if not done yet, i.e. it's the first error) */ - if (known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - known_errorHandler = 4; - else - known_errorHandler = 0; - } - switch (known_errorHandler) { - case 1: /* strict */ - make_translate_exception(&exc, - input, collstart, collend, reason); - if (exc != NULL) - PyCodec_StrictErrors(exc); + continue; + } + + /* untranslatable character */ + collstart = i; + collend = i+1; + + /* find all untranslatable characters */ + while (collend < size) { + PyObject *x; + ch = PyUnicode_READ(kind, data, collend); + if (charmaptranslate_lookup(ch, mapping, &x)) goto onError; - case 2: /* replace */ - /* No need to check for space, this is a 1:1 replacement */ - for (coll = collstart; coll<collend; coll++) - output[opos++] = '?'; - /* fall through */ - case 3: /* ignore */ - i = collend; - break; - case 4: /* xmlcharrefreplace */ - /* generate replacement (temporarily (mis)uses i) */ - for (i = collstart; i < collend; ++i) { - char buffer[2+29+1+1]; - char *cp; - sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); - if (charmaptranslate_makespace(&output, &osize, - opos+strlen(buffer)+(size-collend))) - goto onError; - for (cp = buffer; *cp; ++cp) - output[opos++] = *cp; - } - i = collend; + Py_XDECREF(x); + if (x != Py_None) break; - default: - repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, - reason, input, &exc, - collstart, collend, &newpos); - if (repunicode == NULL) - goto onError; - if (PyUnicode_READY(repunicode) == -1) { - Py_DECREF(repunicode); - goto onError; - } - /* generate replacement */ - repsize = PyUnicode_GET_LENGTH(repunicode); - if (charmaptranslate_makespace(&output, &osize, - opos+repsize+(size-collend))) { - Py_DECREF(repunicode); - goto onError; - } - for (uni2 = 0; repsize-->0; ++uni2) - output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); - i = newpos; + ++collend; + } + + if (ignore) { + i = collend; + } + else { + repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, + reason, input, &exc, + collstart, collend, &newpos); + if (repunicode == NULL) + goto onError; + if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { Py_DECREF(repunicode); + goto onError; } + Py_DECREF(repunicode); + i = newpos; } } - res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); - if (!res) - goto onError; - PyMem_Free(output); Py_XDECREF(exc); Py_XDECREF(errorHandler); - return res; + return _PyUnicodeWriter_Finish(&writer); onError: - PyMem_Free(output); + _PyUnicodeWriter_Dealloc(&writer); Py_XDECREF(exc); Py_XDECREF(errorHandler); return NULL; |