diff options
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- | Python/codecs.c | 440 |
1 files changed, 284 insertions, 156 deletions
diff --git a/Python/codecs.c b/Python/codecs.c index e21834a..ea33c49 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -11,6 +11,8 @@ Copyright (c) Corporation for National Research Initiatives. #include "Python.h" #include <ctype.h> +const char *Py_hexdigits = "0123456789abcdef"; + /* --- Codec Registry ----------------------------------------------------- */ /* Import the standard encodings package which will register the first @@ -241,20 +243,15 @@ PyObject *codec_getitem(const char *encoding, int index) return v; } -/* Helper function to create an incremental codec. */ - +/* Helper functions to create an incremental codec. */ static -PyObject *codec_getincrementalcodec(const char *encoding, - const char *errors, - const char *attrname) +PyObject *codec_makeincrementalcodec(PyObject *codec_info, + const char *errors, + const char *attrname) { - PyObject *codecs, *ret, *inccodec; + PyObject *ret, *inccodec; - codecs = _PyCodec_Lookup(encoding); - if (codecs == NULL) - return NULL; - inccodec = PyObject_GetAttrString(codecs, attrname); - Py_DECREF(codecs); + inccodec = PyObject_GetAttrString(codec_info, attrname); if (inccodec == NULL) return NULL; if (errors) @@ -265,6 +262,21 @@ PyObject *codec_getincrementalcodec(const char *encoding, return ret; } +static +PyObject *codec_getincrementalcodec(const char *encoding, + const char *errors, + const char *attrname) +{ + PyObject *codec_info, *ret; + + codec_info = _PyCodec_Lookup(encoding); + if (codec_info == NULL) + return NULL; + ret = codec_makeincrementalcodec(codec_info, errors, attrname); + Py_DECREF(codec_info); + return ret; +} + /* Helper function to create a stream codec. */ static @@ -288,6 +300,24 @@ PyObject *codec_getstreamcodec(const char *encoding, return streamcodec; } +/* Helpers to work with the result of _PyCodec_Lookup + + */ +PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, + const char *errors) +{ + return codec_makeincrementalcodec(codec_info, errors, + "incrementaldecoder"); +} + +PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, + const char *errors) +{ + return codec_makeincrementalcodec(codec_info, errors, + "incrementalencoder"); +} + + /* Convenience APIs to query the Codec registry. All APIs return a codec object with incremented refcount. @@ -335,18 +365,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding, errors is passed to the encoder factory as argument if non-NULL. */ -PyObject *PyCodec_Encode(PyObject *object, - const char *encoding, - const char *errors) +static PyObject * +_PyCodec_EncodeInternal(PyObject *object, + PyObject *encoder, + const char *encoding, + const char *errors) { - PyObject *encoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v = NULL; - encoder = PyCodec_Encoder(encoding); - if (encoder == NULL) - goto onError; - args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -382,18 +409,15 @@ PyObject *PyCodec_Encode(PyObject *object, errors is passed to the decoder factory as argument if non-NULL. */ -PyObject *PyCodec_Decode(PyObject *object, - const char *encoding, - const char *errors) +static PyObject * +_PyCodec_DecodeInternal(PyObject *object, + PyObject *decoder, + const char *encoding, + const char *errors) { - PyObject *decoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v; - decoder = PyCodec_Decoder(encoding); - if (decoder == NULL) - goto onError; - args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -423,6 +447,132 @@ PyObject *PyCodec_Decode(PyObject *object, return NULL; } +/* Generic encoding/decoding API */ +PyObject *PyCodec_Encode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *encoder; + + encoder = PyCodec_Encoder(encoding); + if (encoder == NULL) + return NULL; + + return _PyCodec_EncodeInternal(object, encoder, encoding, errors); +} + +PyObject *PyCodec_Decode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *decoder; + + decoder = PyCodec_Decoder(encoding); + if (decoder == NULL) + return NULL; + + return _PyCodec_DecodeInternal(object, decoder, encoding, errors); +} + +/* Text encoding/decoding API */ +PyObject * _PyCodec_LookupTextEncoding(const char *encoding, + const char *alternate_command) +{ + _Py_IDENTIFIER(_is_text_encoding); + PyObject *codec; + PyObject *attr; + int is_text_codec; + + codec = _PyCodec_Lookup(encoding); + if (codec == NULL) + return NULL; + + /* Backwards compatibility: assume any raw tuple describes a text + * encoding, and the same for anything lacking the private + * attribute. + */ + if (!PyTuple_CheckExact(codec)) { + attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding); + if (attr == NULL) { + if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + } else { + Py_DECREF(codec); + return NULL; + } + } else { + is_text_codec = PyObject_IsTrue(attr); + Py_DECREF(attr); + if (!is_text_codec) { + Py_DECREF(codec); + PyErr_Format(PyExc_LookupError, + "'%.400s' is not a text encoding; " + "use %s to handle arbitrary codecs", + encoding, alternate_command); + return NULL; + } + } + } + + /* This appears to be a valid text encoding */ + return codec; +} + + +static +PyObject *codec_getitem_checked(const char *encoding, + const char *alternate_command, + int index) +{ + PyObject *codec; + PyObject *v; + + codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); + if (codec == NULL) + return NULL; + + v = PyTuple_GET_ITEM(codec, index); + Py_INCREF(v); + Py_DECREF(codec); + return v; +} + +static PyObject * _PyCodec_TextEncoder(const char *encoding) +{ + return codec_getitem_checked(encoding, "codecs.encode()", 0); +} + +static PyObject * _PyCodec_TextDecoder(const char *encoding) +{ + return codec_getitem_checked(encoding, "codecs.decode()", 1); +} + +PyObject *_PyCodec_EncodeText(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *encoder; + + encoder = _PyCodec_TextEncoder(encoding); + if (encoder == NULL) + return NULL; + + return _PyCodec_EncodeInternal(object, encoder, encoding, errors); +} + +PyObject *_PyCodec_DecodeText(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *decoder; + + decoder = _PyCodec_TextDecoder(encoding); + if (decoder == NULL) + return NULL; + + return _PyCodec_DecodeInternal(object, decoder, encoding, errors); +} + /* Register the error handling callback function error under the name name. This function will be called by the codec when it encounters an unencodable characters/undecodable bytes and doesn't know the @@ -465,9 +615,11 @@ PyObject *PyCodec_LookupError(const char *name) static void wrong_exception_type(PyObject *exc) { - PyObject *type = PyObject_GetAttrString(exc, "__class__"); + _Py_IDENTIFIER(__class__); + _Py_IDENTIFIER(__name__); + PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__); if (type != NULL) { - PyObject *name = PyObject_GetAttrString(type, "__name__"); + PyObject *name = _PyObject_GetAttrId(type, &PyId___name__); Py_DECREF(type); if (name != NULL) { PyErr_Format(PyExc_TypeError, @@ -506,57 +658,58 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc) wrong_exception_type(exc); return NULL; } - /* ouch: passing NULL, 0, pos gives None instead of u'' */ - return Py_BuildValue("(u#n)", &end, 0, end); + return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end); } PyObject *PyCodec_ReplaceErrors(PyObject *exc) { - PyObject *restuple; - Py_ssize_t start; - Py_ssize_t end; - Py_ssize_t i; + Py_ssize_t start, end, i, len; if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { PyObject *res; - Py_UNICODE *p; + int kind; + void *data; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; - res = PyUnicode_FromUnicode(NULL, end-start); + len = end - start; + res = PyUnicode_New(len, '?'); if (res == NULL) return NULL; - for (p = PyUnicode_AS_UNICODE(res), i = start; - i<end; ++p, ++i) - *p = '?'; - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); - return restuple; + kind = PyUnicode_KIND(res); + data = PyUnicode_DATA(res); + for (i = 0; i < len; ++i) + PyUnicode_WRITE(kind, data, i, '?'); + assert(_PyUnicode_CheckConsistency(res, 1)); + return Py_BuildValue("(Nn)", res, end); } else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { - Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER; if (PyUnicodeDecodeError_GetEnd(exc, &end)) return NULL; - return Py_BuildValue("(u#n)", &res, 1, end); + return Py_BuildValue("(Cn)", + (int)Py_UNICODE_REPLACEMENT_CHARACTER, + end); } else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { PyObject *res; - Py_UNICODE *p; + int kind; + void *data; if (PyUnicodeTranslateError_GetStart(exc, &start)) return NULL; if (PyUnicodeTranslateError_GetEnd(exc, &end)) return NULL; - res = PyUnicode_FromUnicode(NULL, end-start); + len = end - start; + res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); if (res == NULL) return NULL; - for (p = PyUnicode_AS_UNICODE(res), i = start; - i<end; ++p, ++i) - *p = Py_UNICODE_REPLACEMENT_CHARACTER; - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); - return restuple; + kind = PyUnicode_KIND(res); + data = PyUnicode_DATA(res); + for (i=0; i < len; i++) + PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER); + assert(_PyUnicode_CheckConsistency(res, 1)); + return Py_BuildValue("(Nn)", res, end); } else { wrong_exception_type(exc); @@ -569,82 +722,74 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { PyObject *restuple; PyObject *object; + Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; - Py_UNICODE *p; - Py_UNICODE *startp; - Py_UNICODE *outp; - int ressize; + unsigned char *outp; + Py_ssize_t ressize; + Py_UCS4 ch; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - startp = PyUnicode_AS_UNICODE(object); - for (p = startp+start, ressize = 0; p < startp+end; ++p) { - if (*p<10) + if (end - start > PY_SSIZE_T_MAX / (2+7+1)) + end = start + PY_SSIZE_T_MAX / (2+7+1); + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + ch = PyUnicode_READ_CHAR(object, i); + if (ch<10) ressize += 2+1+1; - else if (*p<100) + else if (ch<100) ressize += 2+2+1; - else if (*p<1000) + else if (ch<1000) ressize += 2+3+1; - else if (*p<10000) + else if (ch<10000) ressize += 2+4+1; -#ifndef Py_UNICODE_WIDE - else + else if (ch<100000) ressize += 2+5+1; -#else - else if (*p<100000) - ressize += 2+5+1; - else if (*p<1000000) + else if (ch<1000000) ressize += 2+6+1; else ressize += 2+7+1; -#endif } /* allocate replacement */ - res = PyUnicode_FromUnicode(NULL, ressize); + res = PyUnicode_New(ressize, 127); if (res == NULL) { Py_DECREF(object); return NULL; } + outp = PyUnicode_1BYTE_DATA(res); /* generate replacement */ - for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); - p < startp+end; ++p) { - Py_UNICODE c = *p; + for (i = start; i < end; ++i) { int digits; int base; + ch = PyUnicode_READ_CHAR(object, i); *outp++ = '&'; *outp++ = '#'; - if (*p<10) { + if (ch<10) { digits = 1; base = 1; } - else if (*p<100) { + else if (ch<100) { digits = 2; base = 10; } - else if (*p<1000) { + else if (ch<1000) { digits = 3; base = 100; } - else if (*p<10000) { + else if (ch<10000) { digits = 4; base = 1000; } -#ifndef Py_UNICODE_WIDE - else { + else if (ch<100000) { digits = 5; base = 10000; } -#else - else if (*p<100000) { - digits = 5; - base = 10000; - } - else if (*p<1000000) { + else if (ch<1000000) { digits = 6; base = 100000; } @@ -652,16 +797,15 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) digits = 7; base = 1000000; } -#endif while (digits-->0) { - *outp++ = '0' + c/base; - c %= base; + *outp++ = '0' + ch/base; + ch %= base; base /= 10; } *outp++ = ';'; } - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); Py_DECREF(object); return restuple; } @@ -671,87 +815,67 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) } } -static Py_UNICODE hexdigits[] = { - '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' -}; - PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { -#ifndef Py_UNICODE_WIDE -#define IS_SURROGATE_PAIR(p, end) \ - (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \ - *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF) -#else -#define IS_SURROGATE_PAIR(p, end) 0 -#endif if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { PyObject *restuple; PyObject *object; + Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; - Py_UNICODE *p; - Py_UNICODE *startp; - Py_UNICODE *outp; - int ressize; + unsigned char *outp; + Py_ssize_t ressize; + Py_UCS4 c; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - startp = PyUnicode_AS_UNICODE(object); - for (p = startp+start, ressize = 0; p < startp+end; ++p) { -#ifdef Py_UNICODE_WIDE - if (*p >= 0x00010000) + if (end - start > PY_SSIZE_T_MAX / (1+1+8)) + end = start + PY_SSIZE_T_MAX / (1+1+8); + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + c = PyUnicode_READ_CHAR(object, i); + if (c >= 0x10000) { ressize += 1+1+8; - else -#endif - if (*p >= 0x100) { - if (IS_SURROGATE_PAIR(p, startp+end)) { - ressize += 1+1+8; - ++p; - } - else - ressize += 1+1+4; + } + else if (c >= 0x100) { + ressize += 1+1+4; } else ressize += 1+1+2; } - res = PyUnicode_FromUnicode(NULL, ressize); + res = PyUnicode_New(ressize, 127); if (res==NULL) return NULL; - for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); - p < startp+end; ++p) { - Py_UCS4 c = (Py_UCS4) *p; + for (i = start, outp = PyUnicode_1BYTE_DATA(res); + i < end; ++i) { + c = PyUnicode_READ_CHAR(object, i); *outp++ = '\\'; - if (IS_SURROGATE_PAIR(p, startp+end)) { - c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000; - ++p; - } if (c >= 0x00010000) { *outp++ = 'U'; - *outp++ = hexdigits[(c>>28)&0xf]; - *outp++ = hexdigits[(c>>24)&0xf]; - *outp++ = hexdigits[(c>>20)&0xf]; - *outp++ = hexdigits[(c>>16)&0xf]; - *outp++ = hexdigits[(c>>12)&0xf]; - *outp++ = hexdigits[(c>>8)&0xf]; + *outp++ = Py_hexdigits[(c>>28)&0xf]; + *outp++ = Py_hexdigits[(c>>24)&0xf]; + *outp++ = Py_hexdigits[(c>>20)&0xf]; + *outp++ = Py_hexdigits[(c>>16)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; } else if (c >= 0x100) { *outp++ = 'u'; - *outp++ = hexdigits[(c>>12)&0xf]; - *outp++ = hexdigits[(c>>8)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; } else *outp++ = 'x'; - *outp++ = hexdigits[(c>>4)&0xf]; - *outp++ = hexdigits[c&0xf]; + *outp++ = Py_hexdigits[(c>>4)&0xf]; + *outp++ = Py_hexdigits[c&0xf]; } - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); Py_DECREF(object); return restuple; } @@ -759,7 +883,6 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) wrong_exception_type(exc); return NULL; } -#undef IS_SURROGATE_PAIR } /* This handler is declared static until someone demonstrates @@ -769,12 +892,11 @@ PyCodec_SurrogatePassErrors(PyObject *exc) { PyObject *restuple; PyObject *object; + Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { - Py_UNICODE *p; - Py_UNICODE *startp; char *outp; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; @@ -782,15 +904,15 @@ PyCodec_SurrogatePassErrors(PyObject *exc) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - startp = PyUnicode_AS_UNICODE(object); res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); if (!res) { Py_DECREF(object); return NULL; } outp = PyBytes_AsString(res); - for (p = startp+start; p < startp+end; p++) { - Py_UNICODE ch = *p; + for (i = start; i < end; i++) { + /* object is guaranteed to be "ready" */ + Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); if (ch < 0xd800 || ch > 0xdfff) { /* Not a surrogate, fail with original exception */ PyErr_SetObject(PyExceptionInstance_Class(exc), exc); @@ -809,7 +931,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { unsigned char *p; - Py_UNICODE ch = 0; + Py_UCS4 ch = 0; if (PyUnicodeDecodeError_GetStart(exc, &start)) return NULL; if (!(object = PyUnicodeDecodeError_GetObject(exc))) @@ -836,7 +958,10 @@ PyCodec_SurrogatePassErrors(PyObject *exc) PyErr_SetObject(PyExceptionInstance_Class(exc), exc); return NULL; } - return Py_BuildValue("(u#n)", &ch, 1, start+3); + res = PyUnicode_FromOrdinal(ch); + if (res == NULL) + return NULL; + return Py_BuildValue("(Nn)", res, start+3); } else { wrong_exception_type(exc); @@ -849,12 +974,11 @@ PyCodec_SurrogateEscapeErrors(PyObject *exc) { PyObject *restuple; PyObject *object; + Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { - Py_UNICODE *p; - Py_UNICODE *startp; char *outp; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; @@ -862,15 +986,15 @@ PyCodec_SurrogateEscapeErrors(PyObject *exc) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - startp = PyUnicode_AS_UNICODE(object); res = PyBytes_FromStringAndSize(NULL, end-start); if (!res) { Py_DECREF(object); return NULL; } outp = PyBytes_AsString(res); - for (p = startp+start; p < startp+end; p++) { - Py_UNICODE ch = *p; + for (i = start; i < end; i++) { + /* object is guaranteed to be "ready" */ + Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); if (ch < 0xdc80 || ch > 0xdcff) { /* Not a UTF-8b surrogate, fail with original exception */ PyErr_SetObject(PyExceptionInstance_Class(exc), exc); @@ -886,8 +1010,9 @@ PyCodec_SurrogateEscapeErrors(PyObject *exc) return restuple; } else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + PyObject *str; unsigned char *p; - Py_UNICODE ch[4]; /* decode up to 4 bad bytes. */ + Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ int consumed = 0; if (PyUnicodeDecodeError_GetStart(exc, &start)) return NULL; @@ -912,7 +1037,10 @@ PyCodec_SurrogateEscapeErrors(PyObject *exc) PyErr_SetObject(PyExceptionInstance_Class(exc), exc); return NULL; } - return Py_BuildValue("(u#n)", ch, consumed, start+consumed); + str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); + if (str == NULL) + return NULL; + return Py_BuildValue("(Nn)", str, start+consumed); } else { wrong_exception_type(exc); @@ -1049,7 +1177,7 @@ static int _PyCodecRegistry_Init(void) interp->codec_error_registry = PyDict_New(); if (interp->codec_error_registry) { - for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) { + for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { PyObject *func = PyCFunction_New(&methods[i].def, NULL); int res; if (!func) |