diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-19 09:32:41 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-19 09:32:41 (GMT) |
commit | 58cf607d13c178f41aed05458296b68e985c5fff (patch) | |
tree | d9a39a30200eef16fec17f0ed934186e8e864149 /Objects/stringlib/codecs.h | |
parent | a938bcfe952975cd117994acfef3712d61221f20 (diff) | |
download | cpython-58cf607d13c178f41aed05458296b68e985c5fff.zip cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.gz cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.bz2 |
Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.
The utf-16* and utf-32* encoders no longer allow surrogate code points
(U+D800-U+DFFF) to be encoded.
The utf-32* decoders no longer decode byte sequences that correspond to
surrogate code points.
The surrogatepass error handler now works with the utf-16* and utf-32* codecs.
Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
Diffstat (limited to 'Objects/stringlib/codecs.h')
-rw-r--r-- | Objects/stringlib/codecs.h | 198 |
1 files changed, 182 insertions, 16 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 57319c6..14fdc6c 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -596,66 +596,232 @@ IllegalSurrogate: #undef SWAB -Py_LOCAL_INLINE(void) -STRINGLIB(utf16_encode)(unsigned short *out, - const STRINGLIB_CHAR *in, +#if STRINGLIB_MAX_CHAR >= 0x80 +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(utf16_encode_)(const STRINGLIB_CHAR *in, Py_ssize_t len, + unsigned short **outptr, int native_ordering) { + unsigned short *out = *outptr; const STRINGLIB_CHAR *end = in + len; #if STRINGLIB_SIZEOF_CHAR == 1 # define SWAB2(CH) ((CH) << 8) #else # define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) #endif -#if STRINGLIB_MAX_CHAR < 0x10000 if (native_ordering) { -# if STRINGLIB_SIZEOF_CHAR == 2 - Py_MEMCPY(out, in, 2 * len); -# else - _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out); +#if STRINGLIB_MAX_CHAR < 0x10000 + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { +# if STRINGLIB_MAX_CHAR >= 0xd800 + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; # endif + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } +#endif + while (in < end) { + Py_UCS4 ch; + ch = *in++; +#if STRINGLIB_MAX_CHAR >= 0xd800 + if (ch < 0xd800) + *out++ = ch; + else if (ch < 0xe000) + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; +# if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { + out[0] = Py_UNICODE_HIGH_SURROGATE(ch); + out[1] = Py_UNICODE_LOW_SURROGATE(ch); + out += 2; + } +# endif + else +#endif + *out++ = ch; + } } else { +#if STRINGLIB_MAX_CHAR < 0x10000 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); while (in < unrolled_end) { +# if STRINGLIB_MAX_CHAR >= 0xd800 + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; +# endif out[0] = SWAB2(in[0]); out[1] = SWAB2(in[1]); out[2] = SWAB2(in[2]); out[3] = SWAB2(in[3]); in += 4; out += 4; } +#endif while (in < end) { - *out++ = SWAB2(*in); - ++in; + Py_UCS4 ch = *in++; +#if STRINGLIB_MAX_CHAR >= 0xd800 + if (ch < 0xd800) + *out++ = SWAB2((Py_UCS2)ch); + else if (ch < 0xe000) + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; +# if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { + Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); + Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); + out[0] = SWAB2(ch1); + out[1] = SWAB2(ch2); + out += 2; + } +# endif + else +#endif + *out++ = SWAB2((Py_UCS2)ch); } } -#else + *outptr = out; + return len; +#if STRINGLIB_MAX_CHAR >= 0xd800 + fail: +#endif + *outptr = out; + return len - (end - in + 1); +} +#endif + +#undef SWAB2 + +#if STRINGLIB_MAX_CHAR >= 0x80 +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, + Py_ssize_t len, + unsigned short **outptr, + int native_ordering) +{ + unsigned short *out = *outptr; + const STRINGLIB_CHAR *end = in + len; +#if STRINGLIB_SIZEOF_CHAR == 1 if (native_ordering) { + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } + while (in < end) { + *out++ = *in++; + } + } else { +# define SWAB2(CH) ((CH) << 8) /* high byte is zero */ + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + out[0] = SWAB2(in[0]); + out[1] = SWAB2(in[1]); + out[2] = SWAB2(in[2]); + out[3] = SWAB2(in[3]); + in += 4; out += 4; + } while (in < end) { Py_UCS4 ch = *in++; - if (ch < 0x10000) + *out++ = SWAB2((Py_UCS2)ch); + } +#undef SWAB2 + } + *outptr = out; + return len; +#else + if (native_ordering) { +#if STRINGLIB_MAX_CHAR < 0x10000 + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } +#endif + while (in < end) { + Py_UCS4 ch; + ch = *in++; + if (ch < 0xd800) *out++ = ch; - else { + else if (ch < 0xe000) + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; +#if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { out[0] = Py_UNICODE_HIGH_SURROGATE(ch); out[1] = Py_UNICODE_LOW_SURROGATE(ch); out += 2; } +#endif + else + *out++ = ch; } } else { +#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) +#if STRINGLIB_MAX_CHAR < 0x10000 + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; + out[0] = SWAB2(in[0]); + out[1] = SWAB2(in[1]); + out[2] = SWAB2(in[2]); + out[3] = SWAB2(in[3]); + in += 4; out += 4; + } +#endif while (in < end) { Py_UCS4 ch = *in++; - if (ch < 0x10000) + if (ch < 0xd800) *out++ = SWAB2((Py_UCS2)ch); - else { + else if (ch < 0xe000) + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; +#if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); out[0] = SWAB2(ch1); out[1] = SWAB2(ch2); out += 2; } +#endif + else + *out++ = SWAB2((Py_UCS2)ch); } +#undef SWAB2 } + *outptr = out; + return len; + fail: + *outptr = out; + return len - (end - in + 1); #endif -#undef SWAB2 } +#endif + #endif /* STRINGLIB_IS_UNICODE */ |