diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-19 09:32:41 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-19 09:32:41 (GMT) |
commit | 58cf607d13c178f41aed05458296b68e985c5fff (patch) | |
tree | d9a39a30200eef16fec17f0ed934186e8e864149 /Objects | |
parent | a938bcfe952975cd117994acfef3712d61221f20 (diff) | |
download | cpython-58cf607d13c178f41aed05458296b68e985c5fff.zip cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.gz cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.bz2 |
Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.
The utf-16* and utf-32* encoders no longer allow surrogate code points
(U+D800-U+DFFF) to be encoded.
The utf-32* decoders no longer decode byte sequences that correspond to
surrogate code points.
The surrogatepass error handler now works with the utf-16* and utf-32* codecs.
Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/stringlib/codecs.h | 198 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 245 |
2 files changed, 403 insertions, 40 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 57319c6..14fdc6c 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -596,66 +596,232 @@ IllegalSurrogate: #undef SWAB -Py_LOCAL_INLINE(void) -STRINGLIB(utf16_encode)(unsigned short *out, - const STRINGLIB_CHAR *in, +#if STRINGLIB_MAX_CHAR >= 0x80 +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(utf16_encode_)(const STRINGLIB_CHAR *in, Py_ssize_t len, + unsigned short **outptr, int native_ordering) { + unsigned short *out = *outptr; const STRINGLIB_CHAR *end = in + len; #if STRINGLIB_SIZEOF_CHAR == 1 # define SWAB2(CH) ((CH) << 8) #else # define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) #endif -#if STRINGLIB_MAX_CHAR < 0x10000 if (native_ordering) { -# if STRINGLIB_SIZEOF_CHAR == 2 - Py_MEMCPY(out, in, 2 * len); -# else - _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out); +#if STRINGLIB_MAX_CHAR < 0x10000 + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { +# if STRINGLIB_MAX_CHAR >= 0xd800 + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; # endif + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } +#endif + while (in < end) { + Py_UCS4 ch; + ch = *in++; +#if STRINGLIB_MAX_CHAR >= 0xd800 + if (ch < 0xd800) + *out++ = ch; + else if (ch < 0xe000) + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; +# if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { + out[0] = Py_UNICODE_HIGH_SURROGATE(ch); + out[1] = Py_UNICODE_LOW_SURROGATE(ch); + out += 2; + } +# endif + else +#endif + *out++ = ch; + } } else { +#if STRINGLIB_MAX_CHAR < 0x10000 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); while (in < unrolled_end) { +# if STRINGLIB_MAX_CHAR >= 0xd800 + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; +# endif out[0] = SWAB2(in[0]); out[1] = SWAB2(in[1]); out[2] = SWAB2(in[2]); out[3] = SWAB2(in[3]); in += 4; out += 4; } +#endif while (in < end) { - *out++ = SWAB2(*in); - ++in; + Py_UCS4 ch = *in++; +#if STRINGLIB_MAX_CHAR >= 0xd800 + if (ch < 0xd800) + *out++ = SWAB2((Py_UCS2)ch); + else if (ch < 0xe000) + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; +# if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { + Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); + Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); + out[0] = SWAB2(ch1); + out[1] = SWAB2(ch2); + out += 2; + } +# endif + else +#endif + *out++ = SWAB2((Py_UCS2)ch); } } -#else + *outptr = out; + return len; +#if STRINGLIB_MAX_CHAR >= 0xd800 + fail: +#endif + *outptr = out; + return len - (end - in + 1); +} +#endif + +#undef SWAB2 + +#if STRINGLIB_MAX_CHAR >= 0x80 +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, + Py_ssize_t len, + unsigned short **outptr, + int native_ordering) +{ + unsigned short *out = *outptr; + const STRINGLIB_CHAR *end = in + len; +#if STRINGLIB_SIZEOF_CHAR == 1 if (native_ordering) { + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } + while (in < end) { + *out++ = *in++; + } + } else { +# define SWAB2(CH) ((CH) << 8) /* high byte is zero */ + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + out[0] = SWAB2(in[0]); + out[1] = SWAB2(in[1]); + out[2] = SWAB2(in[2]); + out[3] = SWAB2(in[3]); + in += 4; out += 4; + } while (in < end) { Py_UCS4 ch = *in++; - if (ch < 0x10000) + *out++ = SWAB2((Py_UCS2)ch); + } +#undef SWAB2 + } + *outptr = out; + return len; +#else + if (native_ordering) { +#if STRINGLIB_MAX_CHAR < 0x10000 + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } +#endif + while (in < end) { + Py_UCS4 ch; + ch = *in++; + if (ch < 0xd800) *out++ = ch; - else { + else if (ch < 0xe000) + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; +#if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { out[0] = Py_UNICODE_HIGH_SURROGATE(ch); out[1] = Py_UNICODE_LOW_SURROGATE(ch); out += 2; } +#endif + else + *out++ = ch; } } else { +#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) +#if STRINGLIB_MAX_CHAR < 0x10000 + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; + out[0] = SWAB2(in[0]); + out[1] = SWAB2(in[1]); + out[2] = SWAB2(in[2]); + out[3] = SWAB2(in[3]); + in += 4; out += 4; + } +#endif while (in < end) { Py_UCS4 ch = *in++; - if (ch < 0x10000) + if (ch < 0xd800) *out++ = SWAB2((Py_UCS2)ch); - else { + else if (ch < 0xe000) + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; +#if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); out[0] = SWAB2(ch1); out[1] = SWAB2(ch2); out += 2; } +#endif + else + *out++ = SWAB2((Py_UCS2)ch); } +#undef SWAB2 } + *outptr = out; + return len; + fail: + *outptr = out; + return len - (end - in + 1); #endif -#undef SWAB2 } +#endif + #endif /* STRINGLIB_IS_UNICODE */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index bddfafd..7114006 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4963,6 +4963,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, _PyUnicodeWriter writer; const unsigned char *q, *e; int le, bo = 0; /* assume native ordering by default */ + const char *encoding; const char *errmsg = ""; PyObject *errorHandler = NULL; PyObject *exc = NULL; @@ -5002,6 +5003,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, #else le = bo <= 0; #endif + encoding = le ? "utf-32-le" : "utf-32-be"; _PyUnicodeWriter_Init(&writer); writer.min_length = (e - q + 3) / 4; @@ -5022,6 +5024,9 @@ PyUnicode_DecodeUTF32Stateful(const char *s, ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; if (ch > maxch) break; + if (kind != PyUnicode_1BYTE_KIND && + Py_UNICODE_IS_SURROGATE(ch)) + break; PyUnicode_WRITE(kind, data, pos++, ch); q += 4; } while (q <= last); @@ -5031,6 +5036,9 @@ PyUnicode_DecodeUTF32Stateful(const char *s, ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; if (ch > maxch) break; + if (kind != PyUnicode_1BYTE_KIND && + Py_UNICODE_IS_SURROGATE(ch)) + break; PyUnicode_WRITE(kind, data, pos++, ch); q += 4; } while (q <= last); @@ -5038,7 +5046,12 @@ PyUnicode_DecodeUTF32Stateful(const char *s, writer.pos = pos; } - if (ch <= maxch) { + if (Py_UNICODE_IS_SURROGATE(ch)) { + errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)"; + startinpos = ((const char *)q) - starts; + endinpos = startinpos + 4; + } + else if (ch <= maxch) { if (q == e || consumed) break; /* remaining bytes at the end? (size should be divisible by 4) */ @@ -5062,7 +5075,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, chooses to skip the input */ if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, - "utf32", errmsg, + encoding, errmsg, &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, &writer)) goto onError; @@ -5099,6 +5112,10 @@ _PyUnicode_EncodeUTF32(PyObject *str, #else int iorder[] = {3, 2, 1, 0}; #endif + const char *encoding; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *rep = NULL; #define STORECHAR(CH) \ do { \ @@ -5130,7 +5147,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, if (byteorder == 0) STORECHAR(0xFEFF); if (len == 0) - goto done; + return v; if (byteorder == -1) { /* force LE */ @@ -5138,6 +5155,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, iorder[1] = 1; iorder[2] = 2; iorder[3] = 3; + encoding = "utf-32-le"; } else if (byteorder == 1) { /* force BE */ @@ -5145,13 +5163,103 @@ _PyUnicode_EncodeUTF32(PyObject *str, iorder[1] = 2; iorder[2] = 1; iorder[3] = 0; + encoding = "utf-32-be"; } + else + encoding = "utf-32"; - for (i = 0; i < len; i++) - STORECHAR(PyUnicode_READ(kind, data, i)); + if (kind == PyUnicode_1BYTE_KIND) { + for (i = 0; i < len; i++) + STORECHAR(PyUnicode_READ(kind, data, i)); + return v; + } - done: + for (i = 0; i < len;) { + Py_ssize_t repsize, moreunits; + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + i++; + assert(ch <= MAX_UNICODE); + if (!Py_UNICODE_IS_SURROGATE(ch)) { + STORECHAR(ch); + continue; + } + + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, + encoding, "surrogates not allowed", + str, &exc, i-1, i, &i); + + if (!rep) + goto error; + + if (PyBytes_Check(rep)) { + repsize = PyBytes_GET_SIZE(rep); + if (repsize & 3) { + raise_encode_exception(&exc, encoding, + str, i - 1, i, + "surrogates not allowed"); + goto error; + } + moreunits = repsize / 4; + } + else { + assert(PyUnicode_Check(rep)); + if (PyUnicode_READY(rep) < 0) + goto error; + moreunits = repsize = PyUnicode_GET_LENGTH(rep); + if (!PyUnicode_IS_ASCII(rep)) { + raise_encode_exception(&exc, encoding, + str, i - 1, i, + "surrogates not allowed"); + goto error; + } + } + + /* four bytes are reserved for each surrogate */ + if (moreunits > 1) { + Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v); + Py_ssize_t morebytes = 4 * (moreunits - 1); + if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { + /* integer overflow */ + PyErr_NoMemory(); + goto error; + } + if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) + goto error; + p = (unsigned char*) PyBytes_AS_STRING(v) + outpos; + } + + if (PyBytes_Check(rep)) { + Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize); + p += repsize; + } else /* rep is unicode */ { + const Py_UCS1 *repdata; + assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + repdata = PyUnicode_1BYTE_DATA(rep); + while (repsize--) { + Py_UCS4 ch = *repdata++; + STORECHAR(ch); + } + } + + Py_CLEAR(rep); + } + + /* Cut back to size actually needed. This is necessary for, for example, + encoding of a string containing isolated surrogates and the 'ignore' + handler is used. */ + nsize = p - (unsigned char*) PyBytes_AS_STRING(v); + if (nsize != PyBytes_GET_SIZE(v)) + _PyBytes_Resize(&v, nsize); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return v; + error: + Py_XDECREF(rep); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + Py_XDECREF(v); + return NULL; #undef STORECHAR } @@ -5204,6 +5312,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, const char *errmsg = ""; PyObject *errorHandler = NULL; PyObject *exc = NULL; + const char *encoding; q = (unsigned char *)s; e = q + size; @@ -5237,8 +5346,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s, #if PY_LITTLE_ENDIAN native_ordering = bo <= 0; + encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; #else native_ordering = bo >= 0; + encoding = bo >= 0 ? "utf-16-be" : "utf-16-le"; #endif /* Note: size will always be longer than the resulting Unicode @@ -5312,7 +5423,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, - "utf16", errmsg, + encoding, errmsg, &starts, (const char **)&e, &startinpos, @@ -5348,13 +5459,17 @@ _PyUnicode_EncodeUTF16(PyObject *str, Py_ssize_t len; PyObject *v; unsigned short *out; - Py_ssize_t bytesize; Py_ssize_t pairs; #if PY_BIG_ENDIAN int native_ordering = byteorder >= 0; #else int native_ordering = byteorder <= 0; #endif + const char *encoding; + Py_ssize_t nsize, pos; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *rep = NULL; if (!PyUnicode_Check(str)) { PyErr_BadArgument(); @@ -5376,8 +5491,8 @@ _PyUnicode_EncodeUTF16(PyObject *str, } if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) return PyErr_NoMemory(); - bytesize = (len + pairs + (byteorder == 0)) * 2; - v = PyBytes_FromStringAndSize(NULL, bytesize); + nsize = len + pairs + (byteorder == 0); + v = PyBytes_FromStringAndSize(NULL, nsize * 2); if (v == NULL) return NULL; @@ -5389,25 +5504,107 @@ _PyUnicode_EncodeUTF16(PyObject *str, if (len == 0) goto done; - switch (kind) { - case PyUnicode_1BYTE_KIND: { - ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); - break; - } - case PyUnicode_2BYTE_KIND: { - ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); - break; - } - case PyUnicode_4BYTE_KIND: { - ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); - break; + if (kind == PyUnicode_1BYTE_KIND) { + ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); + goto done; } - default: - assert(0); + + if (byteorder < 0) + encoding = "utf-16-le"; + else if (byteorder > 0) + encoding = "utf-16-be"; + else + encoding = "utf-16"; + + pos = 0; + while (pos < len) { + Py_ssize_t repsize, moreunits; + + if (kind == PyUnicode_2BYTE_KIND) { + pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, + &out, native_ordering); + } + else { + assert(kind == PyUnicode_4BYTE_KIND); + pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, + &out, native_ordering); + } + if (pos == len) + break; + + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, + encoding, "surrogates not allowed", + str, &exc, pos, pos + 1, &pos); + if (!rep) + goto error; + + if (PyBytes_Check(rep)) { + repsize = PyBytes_GET_SIZE(rep); + if (repsize & 1) { + raise_encode_exception(&exc, encoding, + str, pos - 1, pos, + "surrogates not allowed"); + goto error; + } + moreunits = repsize / 2; + } + else { + assert(PyUnicode_Check(rep)); + if (PyUnicode_READY(rep) < 0) + goto error; + moreunits = repsize = PyUnicode_GET_LENGTH(rep); + if (!PyUnicode_IS_ASCII(rep)) { + raise_encode_exception(&exc, encoding, + str, pos - 1, pos, + "surrogates not allowed"); + goto error; + } + } + + /* two bytes are reserved for each surrogate */ + if (moreunits > 1) { + Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); + Py_ssize_t morebytes = 2 * (moreunits - 1); + if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { + /* integer overflow */ + PyErr_NoMemory(); + goto error; + } + if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) + goto error; + out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; + } + + if (PyBytes_Check(rep)) { + Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); + out += moreunits; + } else /* rep is unicode */ { + assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, + &out, native_ordering); + } + + Py_CLEAR(rep); } + /* Cut back to size actually needed. This is necessary for, for example, + encoding of a string containing isolated surrogates and the 'ignore' handler + is used. */ + nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); + if (nsize != PyBytes_GET_SIZE(v)) + _PyBytes_Resize(&v, nsize); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); done: return v; + error: + Py_XDECREF(rep); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + Py_XDECREF(v); + return NULL; +#undef STORECHAR } PyObject * |