From 23e275b3ad409ccc9602d1c061726af0926fea51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Wed, 2 Nov 2011 18:02:51 +0100 Subject: Port UCS1 and charmap codecs to new API. --- Include/unicodeobject.h | 6 ++ Modules/_codecsmodule.c | 6 +- Objects/unicodeobject.c | 211 +++++++++++++++++++++++++++++------------------- 3 files changed, 134 insertions(+), 89 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 6e613eb..dc3bad7 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1425,6 +1425,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( (unicode ordinal -> char ordinal) */ const char *errors /* error handling */ ); +PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( + PyObject *unicode, /* Unicode object */ + PyObject *mapping, /* character mapping + (unicode ordinal -> char ordinal) */ + const char *errors /* error handling */ + ); #endif /* Translate a Py_UNICODE buffer of the given length by applying a diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index be31fd2..c9409cc 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -992,11 +992,7 @@ charmap_encode(PyObject *self, str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeCharmap( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - mapping, - errors), + v = codec_tuple(_PyUnicode_EncodeCharmap(str, mapping, errors), PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 0a33ece..0ecbd1d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -248,7 +248,7 @@ _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); static PyObject * unicode_encode_call_errorhandler(const char *errors, PyObject **errorHandler,const char *encoding, const char *reason, - const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, + PyObject *unicode, PyObject **exceptionObject, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); static void @@ -4745,8 +4745,7 @@ _PyUnicode_AsUTF8String(PyObject *obj, const char *errors) #endif rep = unicode_encode_call_errorhandler( errors, &errorHandler, "utf-8", "surrogates not allowed", - PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), - &exc, startpos, startpos+1, &newpos); + obj, &exc, startpos, startpos+1, &newpos); if (!rep) goto error; @@ -6450,7 +6449,7 @@ make_encode_exception_obj(PyObject **exceptionObject, { if (*exceptionObject == NULL) { *exceptionObject = PyObject_CallFunction( - PyExc_UnicodeEncodeError, "sUnns", + PyExc_UnicodeEncodeError, "sOnns", encoding, unicode, startpos, endpos, reason); } else { @@ -6502,12 +6501,12 @@ static PyObject * unicode_encode_call_errorhandler(const char *errors, PyObject **errorHandler, const char *encoding, const char *reason, - const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, + PyObject *unicode, PyObject **exceptionObject, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos) { static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; - + Py_ssize_t len; PyObject *restuple; PyObject *resunicode; @@ -6517,8 +6516,12 @@ unicode_encode_call_errorhandler(const char *errors, return NULL; } - make_encode_exception(exceptionObject, - encoding, unicode, size, startpos, endpos, reason); + if (PyUnicode_READY(unicode) < 0) + return NULL; + len = PyUnicode_GET_LENGTH(unicode); + + make_encode_exception_obj(exceptionObject, + encoding, unicode, startpos, endpos, reason); if (*exceptionObject == NULL) return NULL; @@ -6542,8 +6545,8 @@ unicode_encode_call_errorhandler(const char *errors, return NULL; } if (*newpos<0) - *newpos = size+*newpos; - if (*newpos<0 || *newpos>size) { + *newpos = len + *newpos; + if (*newpos<0 || *newpos>len) { PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); Py_DECREF(restuple); return NULL; @@ -6554,18 +6557,16 @@ unicode_encode_call_errorhandler(const char *errors, } static PyObject * -unicode_encode_ucs1(const Py_UNICODE *p, - Py_ssize_t size, +unicode_encode_ucs1(PyObject *unicode, const char *errors, int limit) { + /* input state */ + Py_ssize_t pos=0, size; + int kind; + void *data; /* output object */ PyObject *res; - /* pointers to the beginning and end+1 of input */ - const Py_UNICODE *startp = p; - const Py_UNICODE *endp = p + size; - /* pointer to the beginning of the unencodable characters */ - /* const Py_UNICODE *badp = NULL; */ /* pointer into the output */ char *str; /* current output position */ @@ -6578,6 +6579,11 @@ unicode_encode_ucs1(const Py_UNICODE *p, * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ int known_errorHandler = -1; + if (PyUnicode_READY(unicode) < 0) + return NULL; + size = PyUnicode_GET_LENGTH(unicode); + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); /* allocate enough for a simple encoding without replacements, if we need more, we'll resize */ if (size == 0) @@ -6588,28 +6594,24 @@ unicode_encode_ucs1(const Py_UNICODE *p, str = PyBytes_AS_STRING(res); ressize = size; - while (p=limit)) + while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) ++collend; /* cache callback name lookup (if not done yet, i.e. it's the first error) */ if (known_errorHandler==-1) { @@ -6626,39 +6628,40 @@ unicode_encode_ucs1(const Py_UNICODE *p, } switch (known_errorHandler) { case 1: /* strict */ - raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); + raise_encode_exception_obj(&exc, encoding, unicode, collstart, collend, reason); goto onError; case 2: /* replace */ while (collstart++ ressize) { if (requiredsize<2*ressize) requiredsize = 2*ressize; @@ -6667,17 +6670,18 @@ unicode_encode_ucs1(const Py_UNICODE *p, str = PyBytes_AS_STRING(res) + respos; ressize = requiredsize; } - /* generate replacement (temporarily (mis)uses p) */ - for (p = collstart; p < collend; ++p) { - str += sprintf(str, "&#%d;", (int)*p); + /* generate replacement */ + for (i = collstart; i < collend; ++i) { + str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); } - p = collend; + pos = collend; break; default: repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, - encoding, reason, startp, size, &exc, - collstart-startp, collend-startp, &newpos); - if (repunicode == NULL) + encoding, reason, unicode, &exc, + collstart, collend, &newpos); + if (repunicode == NULL || (PyUnicode_Check(repunicode) && + PyUnicode_READY(repunicode) < 0)) goto onError; if (PyBytes_Check(repunicode)) { /* Directly copy bytes result to output. */ @@ -6694,7 +6698,7 @@ unicode_encode_ucs1(const Py_UNICODE *p, } memcpy(str, PyBytes_AsString(repunicode), repsize); str += repsize; - p = startp + newpos; + pos = newpos; Py_DECREF(repunicode); break; } @@ -6702,8 +6706,8 @@ unicode_encode_ucs1(const Py_UNICODE *p, have+the replacement+the rest of the string, so we won't have to check space for encodable characters) */ respos = str - PyBytes_AS_STRING(res); - repsize = PyUnicode_GET_SIZE(repunicode); - requiredsize = respos+repsize+(endp-collend); + repsize = PyUnicode_GET_LENGTH(repunicode); + requiredsize = respos+repsize+(size-collend); if (requiredsize > ressize) { if (requiredsize<2*ressize) requiredsize = 2*ressize; @@ -6716,17 +6720,17 @@ unicode_encode_ucs1(const Py_UNICODE *p, } /* check if there is anything unencodable in the replacement and copy it to the output */ - for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { - c = *uni2; + for (i = 0; repsize-->0; ++i, ++str) { + c = PyUnicode_READ_CHAR(repunicode, i); if (c >= limit) { - raise_encode_exception(&exc, encoding, startp, size, - unicodepos, unicodepos+1, reason); + raise_encode_exception_obj(&exc, encoding, unicode, + pos, pos+1, reason); Py_DECREF(repunicode); goto onError; } *str = (char)c; } - p = startp + newpos; + pos = newpos; Py_DECREF(repunicode); } } @@ -6750,12 +6754,19 @@ unicode_encode_ucs1(const Py_UNICODE *p, return NULL; } +/* Deprecated */ PyObject * PyUnicode_EncodeLatin1(const Py_UNICODE *p, Py_ssize_t size, const char *errors) { - return unicode_encode_ucs1(p, size, errors, 256); + PyObject *result; + PyObject *unicode = PyUnicode_FromUnicode(p, size); + if (unicode == NULL) + return NULL; + result = unicode_encode_ucs1(unicode, errors, 256); + Py_DECREF(unicode); + return result; } PyObject * @@ -6774,9 +6785,7 @@ _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) PyUnicode_GET_LENGTH(unicode)); /* Non-Latin-1 characters present. Defer to above function to raise the exception. */ - return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - errors); + return unicode_encode_ucs1(unicode, errors, 256); } PyObject* @@ -6888,12 +6897,19 @@ PyUnicode_DecodeASCII(const char *s, return NULL; } +/* Deprecated */ PyObject * PyUnicode_EncodeASCII(const Py_UNICODE *p, Py_ssize_t size, const char *errors) { - return unicode_encode_ucs1(p, size, errors, 128); + PyObject *result; + PyObject *unicode = PyUnicode_FromUnicode(p, size); + if (unicode == NULL) + return NULL; + result = unicode_encode_ucs1(unicode, errors, 128); + Py_DECREF(unicode); + return result; } PyObject * @@ -6910,9 +6926,7 @@ _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), PyUnicode_GET_LENGTH(unicode)); - return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - errors); + return unicode_encode_ucs1(unicode, errors, 128); } PyObject * @@ -8182,13 +8196,13 @@ charmapencode_output(Py_UNICODE c, PyObject *mapping, Return 0 on success, -1 on error */ static int charmap_encoding_error( - const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, + PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, PyObject **exceptionObject, int *known_errorHandler, PyObject **errorHandler, const char *errors, PyObject **res, Py_ssize_t *respos) { PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ - Py_ssize_t repsize; + Py_ssize_t size, repsize; Py_ssize_t newpos; Py_UNICODE *uni2; /* startpos for collecting unencodable chars */ @@ -8198,19 +8212,25 @@ charmap_encoding_error( char *encoding = "charmap"; char *reason = "character maps to "; charmapencode_result x; + Py_UCS4 ch; + if (PyUnicode_READY(unicode) < 0) + return -1; + size = PyUnicode_GET_LENGTH(unicode); /* find all unencodable characters */ while (collendpos < size) { PyObject *rep; if (Py_TYPE(mapping) == &EncodingMapType) { - int res = encoding_map_lookup(p[collendpos], mapping); + ch = PyUnicode_READ_CHAR(unicode, collendpos); + int res = encoding_map_lookup(ch, mapping); if (res != -1) break; ++collendpos; continue; } - rep = charmapencode_lookup(p[collendpos], mapping); + ch = PyUnicode_READ_CHAR(unicode, collendpos); + rep = charmapencode_lookup(ch, mapping); if (rep==NULL) return -1; else if (rep!=Py_None) { @@ -8236,7 +8256,7 @@ charmap_encoding_error( } switch (*known_errorHandler) { case 1: /* strict */ - raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); + raise_encode_exception_obj(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); return -1; case 2: /* replace */ for (collpos = collstartpos; collpos