diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-19 09:32:41 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-19 09:32:41 (GMT) |
commit | 58cf607d13c178f41aed05458296b68e985c5fff (patch) | |
tree | d9a39a30200eef16fec17f0ed934186e8e864149 /Python/codecs.c | |
parent | a938bcfe952975cd117994acfef3712d61221f20 (diff) | |
download | cpython-58cf607d13c178f41aed05458296b68e985c5fff.zip cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.gz cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.bz2 |
Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.
The utf-16* and utf-32* encoders no longer allow surrogate code points
(U+D800-U+DFFF) to be encoded.
The utf-32* decoders no longer decode byte sequences that correspond to
surrogate code points.
The surrogatepass error handler now works with the utf-16* and utf-32* codecs.
Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- | Python/codecs.c | 163 |
1 files changed, 146 insertions, 17 deletions
diff --git a/Python/codecs.c b/Python/codecs.c index fe0cab4..8fe0af7 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } } +#define ENC_UTF8 0 +#define ENC_UTF16BE 1 +#define ENC_UTF16LE 2 +#define ENC_UTF32BE 3 +#define ENC_UTF32LE 4 + +static int +get_standard_encoding(const char *encoding, int *bytelength) +{ + if (Py_TOLOWER(encoding[0]) == 'u' && + Py_TOLOWER(encoding[1]) == 't' && + Py_TOLOWER(encoding[2]) == 'f') { + encoding += 3; + if (*encoding == '-' || *encoding == '_' ) + encoding++; + if (encoding[0] == '1' && encoding[1] == '6') { + encoding += 2; + *bytelength = 2; + if (*encoding == '\0') { +#ifdef WORDS_BIGENDIAN + return ENC_UTF16BE; +#else + return ENC_UTF16LE; +#endif + } + if (*encoding == '-' || *encoding == '_' ) + encoding++; + if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { + if (Py_TOLOWER(encoding[0]) == 'b') + return ENC_UTF16BE; + if (Py_TOLOWER(encoding[0]) == 'l') + return ENC_UTF16LE; + } + } + else if (encoding[0] == '3' && encoding[1] == '2') { + encoding += 2; + *bytelength = 4; + if (*encoding == '\0') { +#ifdef WORDS_BIGENDIAN + return ENC_UTF32BE; +#else + return ENC_UTF32LE; +#endif + } + if (*encoding == '-' || *encoding == '_' ) + encoding++; + if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { + if (Py_TOLOWER(encoding[0]) == 'b') + return ENC_UTF32BE; + if (Py_TOLOWER(encoding[0]) == 'l') + return ENC_UTF32LE; + } + } + } + /* utf-8 */ + *bytelength = 3; + return ENC_UTF8; +} + /* This handler is declared static until someone demonstrates a need to call it directly. */ static PyObject * @@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc) { PyObject *restuple; PyObject *object; + PyObject *encode; + char *encoding; + int code; + int bytelength; Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { - char *outp; + unsigned char *outp; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); + if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { + Py_DECREF(object); + return NULL; + } + if (!(encoding = PyUnicode_AsUTF8(encode))) { + Py_DECREF(object); + Py_DECREF(encode); + return NULL; + } + code = get_standard_encoding(encoding, &bytelength); + Py_DECREF(encode); + + res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); if (!res) { Py_DECREF(object); return NULL; } - outp = PyBytes_AsString(res); + outp = (unsigned char*)PyBytes_AsString(res); for (i = start; i < end; i++) { /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); @@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc) Py_DECREF(object); return NULL; } - *outp++ = (char)(0xe0 | (ch >> 12)); - *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *outp++ = (char)(0x80 | (ch & 0x3f)); + switch (code) { + case ENC_UTF8: + *outp++ = (unsigned char)(0xe0 | (ch >> 12)); + *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); + *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); + break; + case ENC_UTF16LE: + *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)(ch >> 8); + break; + case ENC_UTF16BE: + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char) ch; + break; + case ENC_UTF32LE: + *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char)(ch >> 16); + *outp++ = (unsigned char)(ch >> 24); + break; + case ENC_UTF32BE: + *outp++ = (unsigned char)(ch >> 24); + *outp++ = (unsigned char)(ch >> 16); + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char) ch; + break; + } } restuple = Py_BuildValue("(On)", res, end); Py_DECREF(res); @@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc) Py_UCS4 ch = 0; if (PyUnicodeDecodeError_GetStart(exc, &start)) return NULL; + if (PyUnicodeDecodeError_GetEnd(exc, &end)) + return NULL; if (!(object = PyUnicodeDecodeError_GetObject(exc))) return NULL; if (!(p = (unsigned char*)PyBytes_AsString(object))) { Py_DECREF(object); return NULL; } + if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { + Py_DECREF(object); + return NULL; + } + if (!(encoding = PyUnicode_AsUTF8(encode))) { + Py_DECREF(object); + Py_DECREF(encode); + return NULL; + } + code = get_standard_encoding(encoding, &bytelength); + Py_DECREF(encode); + /* Try decoding a single surrogate character. If there are more, let the codec call us again. */ p += start; - if (PyBytes_GET_SIZE(object) - start >= 3 && - (p[0] & 0xf0) == 0xe0 && - (p[1] & 0xc0) == 0x80 && - (p[2] & 0xc0) == 0x80) { - /* it's a three-byte code */ - ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); - if (!Py_UNICODE_IS_SURROGATE(ch)) - /* it's not a surrogate - fail */ - ch = 0; + if (PyBytes_GET_SIZE(object) - start >= bytelength) { + switch (code) { + case ENC_UTF8: + if ((p[0] & 0xf0) == 0xe0 && + (p[1] & 0xc0) == 0x80 && + (p[2] & 0xc0) == 0x80) { + /* it's a three-byte code */ + ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); + } + break; + case ENC_UTF16LE: + ch = p[1] << 8 | p[0]; + break; + case ENC_UTF16BE: + ch = p[0] << 8 | p[1]; + break; + case ENC_UTF32LE: + ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; + break; + case ENC_UTF32BE: + ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; + break; + } } + Py_DECREF(object); - if (ch == 0) { + if (!Py_UNICODE_IS_SURROGATE(ch)) { + /* it's not a surrogate - fail */ PyErr_SetObject(PyExceptionInstance_Class(exc), exc); return NULL; } res = PyUnicode_FromOrdinal(ch); if (res == NULL) return NULL; - return Py_BuildValue("(Nn)", res, start+3); + return Py_BuildValue("(Nn)", res, start + bytelength); } else { wrong_exception_type(exc); |