summary | refs | log | tree | commit | diff | stats
path: root/Objects
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2013-11-19 09:32:41 (GMT)
committer Serhiy Storchaka <storchaka@gmail.com> 2013-11-19 09:32:41 (GMT)
commit 58cf607d13c178f41aed05458296b68e985c5fff (patch)
tree d9a39a30200eef16fec17f0ed934186e8e864149 /Objects
parent a938bcfe952975cd117994acfef3712d61221f20 (diff)
download cpython-58cf607d13c178f41aed05458296b68e985c5fff.zip
cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.gz
cpython-58cf607d13c178f41aed05458296b68e985c5fff.tar.bz2
Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.
The utf-16* and utf-32* encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode byte sequences that correspond to surrogate code points. The surrogatepass error handler now works with the utf-16* and utf-32* codecs. Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
Diffstat (limited to 'Objects')
-rw-r--r-- Objects/stringlib/codecs.h 198
-rw-r--r-- Objects/unicodeobject.c 245
2 files changed, 403 insertions, 40 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 57319c6..14fdc6c 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -596,66 +596,232 @@ IllegalSurrogate:
#undef SWAB
-Py_LOCAL_INLINE(void)
-STRINGLIB(utf16_encode)(unsigned short *out,
- const STRINGLIB_CHAR *in,
+#if STRINGLIB_MAX_CHAR >= 0x80
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(utf16_encode_)(const STRINGLIB_CHAR *in,
Py_ssize_t len,
+ unsigned short **outptr,
int native_ordering)
{
+ unsigned short *out = *outptr;
const STRINGLIB_CHAR *end = in + len;
#if STRINGLIB_SIZEOF_CHAR == 1
# define SWAB2(CH) ((CH) << 8)
#else
# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
#endif
-#if STRINGLIB_MAX_CHAR < 0x10000
if (native_ordering) {
-# if STRINGLIB_SIZEOF_CHAR == 2
- Py_MEMCPY(out, in, 2 * len);
-# else
- _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
+#if STRINGLIB_MAX_CHAR < 0x10000
+ const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
+ while (in < unrolled_end) {
+# if STRINGLIB_MAX_CHAR >= 0xd800
+ if (((in[0] ^ 0xd800) &
+ (in[1] ^ 0xd800) &
+ (in[2] ^ 0xd800) &
+ (in[3] ^ 0xd800) & 0xf800) == 0)
+ break;
# endif
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ in += 4; out += 4;
+ }
+#endif
+ while (in < end) {
+ Py_UCS4 ch;
+ ch = *in++;
+#if STRINGLIB_MAX_CHAR >= 0xd800
+ if (ch < 0xd800)
+ *out++ = ch;
+ else if (ch < 0xe000)
+ /* reject surrogate characters (U+D800-U+DFFF) */
+ goto fail;
+# if STRINGLIB_MAX_CHAR >= 0x10000
+ else if (ch >= 0x10000) {
+ out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
+ out[1] = Py_UNICODE_LOW_SURROGATE(ch);
+ out += 2;
+ }
+# endif
+ else
+#endif
+ *out++ = ch;
+ }
} else {
+#if STRINGLIB_MAX_CHAR < 0x10000
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
while (in < unrolled_end) {
+# if STRINGLIB_MAX_CHAR >= 0xd800
+ if (((in[0] ^ 0xd800) &
+ (in[1] ^ 0xd800) &
+ (in[2] ^ 0xd800) &
+ (in[3] ^ 0xd800) & 0xf800) == 0)
+ break;
+# endif
out[0] = SWAB2(in[0]);
out[1] = SWAB2(in[1]);
out[2] = SWAB2(in[2]);
out[3] = SWAB2(in[3]);
in += 4; out += 4;
}
+#endif
while (in < end) {
- *out++ = SWAB2(*in);
- ++in;
+ Py_UCS4 ch = *in++;
+#if STRINGLIB_MAX_CHAR >= 0xd800
+ if (ch < 0xd800)
+ *out++ = SWAB2((Py_UCS2)ch);
+ else if (ch < 0xe000)
+ /* reject surrogate characters (U+D800-U+DFFF) */
+ goto fail;
+# if STRINGLIB_MAX_CHAR >= 0x10000
+ else if (ch >= 0x10000) {
+ Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
+ Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
+ out[0] = SWAB2(ch1);
+ out[1] = SWAB2(ch2);
+ out += 2;
+ }
+# endif
+ else
+#endif
+ *out++ = SWAB2((Py_UCS2)ch);
}
}
-#else
+ *outptr = out;
+ return len;
+#if STRINGLIB_MAX_CHAR >= 0xd800
+ fail:
+#endif
+ *outptr = out;
+ return len - (end - in + 1);
+}
+#endif
+
+#undef SWAB2
+
+#if STRINGLIB_MAX_CHAR >= 0x80
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
+ Py_ssize_t len,
+ unsigned short **outptr,
+ int native_ordering)
+{
+ unsigned short *out = *outptr;
+ const STRINGLIB_CHAR *end = in + len;
+#if STRINGLIB_SIZEOF_CHAR == 1
if (native_ordering) {
+ const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
+ while (in < unrolled_end) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ in += 4; out += 4;
+ }
+ while (in < end) {
+ *out++ = *in++;
+ }
+ } else {
+# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
+ const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
+ while (in < unrolled_end) {
+ out[0] = SWAB2(in[0]);
+ out[1] = SWAB2(in[1]);
+ out[2] = SWAB2(in[2]);
+ out[3] = SWAB2(in[3]);
+ in += 4; out += 4;
+ }
while (in < end) {
Py_UCS4 ch = *in++;
- if (ch < 0x10000)
+ *out++ = SWAB2((Py_UCS2)ch);
+ }
+#undef SWAB2
+ }
+ *outptr = out;
+ return len;
+#else
+ if (native_ordering) {
+#if STRINGLIB_MAX_CHAR < 0x10000
+ const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
+ while (in < unrolled_end) {
+ /* check if any character is a surrogate character */
+ if (((in[0] ^ 0xd800) &
+ (in[1] ^ 0xd800) &
+ (in[2] ^ 0xd800) &
+ (in[3] ^ 0xd800) & 0xf800) == 0)
+ break;
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ in += 4; out += 4;
+ }
+#endif
+ while (in < end) {
+ Py_UCS4 ch;
+ ch = *in++;
+ if (ch < 0xd800)
*out++ = ch;
- else {
+ else if (ch < 0xe000)
+ /* reject surrogate characters (U+D800-U+DFFF) */
+ goto fail;
+#if STRINGLIB_MAX_CHAR >= 0x10000
+ else if (ch >= 0x10000) {
out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
out[1] = Py_UNICODE_LOW_SURROGATE(ch);
out += 2;
}
+#endif
+ else
+ *out++ = ch;
}
} else {
+#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
+#if STRINGLIB_MAX_CHAR < 0x10000
+ const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
+ while (in < unrolled_end) {
+ /* check if any character is a surrogate character */
+ if (((in[0] ^ 0xd800) &
+ (in[1] ^ 0xd800) &
+ (in[2] ^ 0xd800) &
+ (in[3] ^ 0xd800) & 0xf800) == 0)
+ break;
+ out[0] = SWAB2(in[0]);
+ out[1] = SWAB2(in[1]);
+ out[2] = SWAB2(in[2]);
+ out[3] = SWAB2(in[3]);
+ in += 4; out += 4;
+ }
+#endif
while (in < end) {
Py_UCS4 ch = *in++;
- if (ch < 0x10000)
+ if (ch < 0xd800)
*out++ = SWAB2((Py_UCS2)ch);
- else {
+ else if (ch < 0xe000)
+ /* reject surrogate characters (U+D800-U+DFFF) */
+ goto fail;
+#if STRINGLIB_MAX_CHAR >= 0x10000
+ else if (ch >= 0x10000) {
Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
out[0] = SWAB2(ch1);
out[1] = SWAB2(ch2);
out += 2;
}
+#endif
+ else
+ *out++ = SWAB2((Py_UCS2)ch);
}
+#undef SWAB2
}
+ *outptr = out;
+ return len;
+ fail:
+ *outptr = out;
+ return len - (end - in + 1);
#endif
-#undef SWAB2
}
+#endif
+
#endif /* STRINGLIB_IS_UNICODE */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index bddfafd..7114006 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4963,6 +4963,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
_PyUnicodeWriter writer;
const unsigned char *q, *e;
int le, bo = 0; /* assume native ordering by default */
+ const char *encoding;
const char *errmsg = "";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
@@ -5002,6 +5003,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
#else
le = bo <= 0;
#endif
+ encoding = le ? "utf-32-le" : "utf-32-be";
_PyUnicodeWriter_Init(&writer);
writer.min_length = (e - q + 3) / 4;
@@ -5022,6 +5024,9 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
if (ch > maxch)
break;
+ if (kind != PyUnicode_1BYTE_KIND &&
+ Py_UNICODE_IS_SURROGATE(ch))
+ break;
PyUnicode_WRITE(kind, data, pos++, ch);
q += 4;
} while (q <= last);
@@ -5031,6 +5036,9 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
if (ch > maxch)
break;
+ if (kind != PyUnicode_1BYTE_KIND &&
+ Py_UNICODE_IS_SURROGATE(ch))
+ break;
PyUnicode_WRITE(kind, data, pos++, ch);
q += 4;
} while (q <= last);
@@ -5038,7 +5046,12 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
writer.pos = pos;
}
- if (ch <= maxch) {
+ if (Py_UNICODE_IS_SURROGATE(ch)) {
+ errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
+ startinpos = ((const char *)q) - starts;
+ endinpos = startinpos + 4;
+ }
+ else if (ch <= maxch) {
if (q == e || consumed)
break;
/* remaining bytes at the end? (size should be divisible by 4) */
@@ -5062,7 +5075,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
chooses to skip the input */
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
- "utf32", errmsg,
+ encoding, errmsg,
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
&writer))
goto onError;
@@ -5099,6 +5112,10 @@ _PyUnicode_EncodeUTF32(PyObject *str,
#else
int iorder[] = {3, 2, 1, 0};
#endif
+ const char *encoding;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ PyObject *rep = NULL;
#define STORECHAR(CH) \
do { \
@@ -5130,7 +5147,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
if (byteorder == 0)
STORECHAR(0xFEFF);
if (len == 0)
- goto done;
+ return v;
if (byteorder == -1) {
/* force LE */
@@ -5138,6 +5155,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
iorder[1] = 1;
iorder[2] = 2;
iorder[3] = 3;
+ encoding = "utf-32-le";
}
else if (byteorder == 1) {
/* force BE */
@@ -5145,13 +5163,103 @@ _PyUnicode_EncodeUTF32(PyObject *str,
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
+ encoding = "utf-32-be";
}
+ else
+ encoding = "utf-32";
- for (i = 0; i < len; i++)
- STORECHAR(PyUnicode_READ(kind, data, i));
+ if (kind == PyUnicode_1BYTE_KIND) {
+ for (i = 0; i < len; i++)
+ STORECHAR(PyUnicode_READ(kind, data, i));
+ return v;
+ }
- done:
+ for (i = 0; i < len;) {
+ Py_ssize_t repsize, moreunits;
+ Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+ i++;
+ assert(ch <= MAX_UNICODE);
+ if (!Py_UNICODE_IS_SURROGATE(ch)) {
+ STORECHAR(ch);
+ continue;
+ }
+
+ rep = unicode_encode_call_errorhandler(
+ errors, &errorHandler,
+ encoding, "surrogates not allowed",
+ str, &exc, i-1, i, &i);
+
+ if (!rep)
+ goto error;
+
+ if (PyBytes_Check(rep)) {
+ repsize = PyBytes_GET_SIZE(rep);
+ if (repsize & 3) {
+ raise_encode_exception(&exc, encoding,
+ str, i - 1, i,
+ "surrogates not allowed");
+ goto error;
+ }
+ moreunits = repsize / 4;
+ }
+ else {
+ assert(PyUnicode_Check(rep));
+ if (PyUnicode_READY(rep) < 0)
+ goto error;
+ moreunits = repsize = PyUnicode_GET_LENGTH(rep);
+ if (!PyUnicode_IS_ASCII(rep)) {
+ raise_encode_exception(&exc, encoding,
+ str, i - 1, i,
+ "surrogates not allowed");
+ goto error;
+ }
+ }
+
+ /* four bytes are reserved for each surrogate */
+ if (moreunits > 1) {
+ Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
+ Py_ssize_t morebytes = 4 * (moreunits - 1);
+ if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
+ /* integer overflow */
+ PyErr_NoMemory();
+ goto error;
+ }
+ if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
+ goto error;
+ p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
+ }
+
+ if (PyBytes_Check(rep)) {
+ Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
+ p += repsize;
+ } else /* rep is unicode */ {
+ const Py_UCS1 *repdata;
+ assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+ repdata = PyUnicode_1BYTE_DATA(rep);
+ while (repsize--) {
+ Py_UCS4 ch = *repdata++;
+ STORECHAR(ch);
+ }
+ }
+
+ Py_CLEAR(rep);
+ }
+
+ /* Cut back to size actually needed. This is necessary for, for example,
+ encoding of a string containing isolated surrogates and the 'ignore'
+ handler is used. */
+ nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
+ if (nsize != PyBytes_GET_SIZE(v))
+ _PyBytes_Resize(&v, nsize);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return v;
+ error:
+ Py_XDECREF(rep);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+ Py_XDECREF(v);
+ return NULL;
#undef STORECHAR
}
@@ -5204,6 +5312,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
const char *errmsg = "";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
+ const char *encoding;
q = (unsigned char *)s;
e = q + size;
@@ -5237,8 +5346,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
#if PY_LITTLE_ENDIAN
native_ordering = bo <= 0;
+ encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
native_ordering = bo >= 0;
+ encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif
/* Note: size will always be longer than the resulting Unicode
@@ -5312,7 +5423,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
if (unicode_decode_call_errorhandler_writer(
errors,
&errorHandler,
- "utf16", errmsg,
+ encoding, errmsg,
&starts,
(const char **)&e,
&startinpos,
@@ -5348,13 +5459,17 @@ _PyUnicode_EncodeUTF16(PyObject *str,
Py_ssize_t len;
PyObject *v;
unsigned short *out;
- Py_ssize_t bytesize;
Py_ssize_t pairs;
#if PY_BIG_ENDIAN
int native_ordering = byteorder >= 0;
#else
int native_ordering = byteorder <= 0;
#endif
+ const char *encoding;
+ Py_ssize_t nsize, pos;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ PyObject *rep = NULL;
if (!PyUnicode_Check(str)) {
PyErr_BadArgument();
@@ -5376,8 +5491,8 @@ _PyUnicode_EncodeUTF16(PyObject *str,
}
if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
return PyErr_NoMemory();
- bytesize = (len + pairs + (byteorder == 0)) * 2;
- v = PyBytes_FromStringAndSize(NULL, bytesize);
+ nsize = len + pairs + (byteorder == 0);
+ v = PyBytes_FromStringAndSize(NULL, nsize * 2);
if (v == NULL)
return NULL;
@@ -5389,25 +5504,107 @@ _PyUnicode_EncodeUTF16(PyObject *str,
if (len == 0)
goto done;
- switch (kind) {
- case PyUnicode_1BYTE_KIND: {
- ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
- break;
- }
- case PyUnicode_2BYTE_KIND: {
- ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
- break;
- }
- case PyUnicode_4BYTE_KIND: {
- ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
- break;
+ if (kind == PyUnicode_1BYTE_KIND) {
+ ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
+ goto done;
}
- default:
- assert(0);
+
+ if (byteorder < 0)
+ encoding = "utf-16-le";
+ else if (byteorder > 0)
+ encoding = "utf-16-be";
+ else
+ encoding = "utf-16";
+
+ pos = 0;
+ while (pos < len) {
+ Py_ssize_t repsize, moreunits;
+
+ if (kind == PyUnicode_2BYTE_KIND) {
+ pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
+ &out, native_ordering);
+ }
+ else {
+ assert(kind == PyUnicode_4BYTE_KIND);
+ pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
+ &out, native_ordering);
+ }
+ if (pos == len)
+ break;
+
+ rep = unicode_encode_call_errorhandler(
+ errors, &errorHandler,
+ encoding, "surrogates not allowed",
+ str, &exc, pos, pos + 1, &pos);
+ if (!rep)
+ goto error;
+
+ if (PyBytes_Check(rep)) {
+ repsize = PyBytes_GET_SIZE(rep);
+ if (repsize & 1) {
+ raise_encode_exception(&exc, encoding,
+ str, pos - 1, pos,
+ "surrogates not allowed");
+ goto error;
+ }
+ moreunits = repsize / 2;
+ }
+ else {
+ assert(PyUnicode_Check(rep));
+ if (PyUnicode_READY(rep) < 0)
+ goto error;
+ moreunits = repsize = PyUnicode_GET_LENGTH(rep);
+ if (!PyUnicode_IS_ASCII(rep)) {
+ raise_encode_exception(&exc, encoding,
+ str, pos - 1, pos,
+ "surrogates not allowed");
+ goto error;
+ }
+ }
+
+ /* two bytes are reserved for each surrogate */
+ if (moreunits > 1) {
+ Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
+ Py_ssize_t morebytes = 2 * (moreunits - 1);
+ if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
+ /* integer overflow */
+ PyErr_NoMemory();
+ goto error;
+ }
+ if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
+ goto error;
+ out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
+ }
+
+ if (PyBytes_Check(rep)) {
+ Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
+ out += moreunits;
+ } else /* rep is unicode */ {
+ assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+ ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
+ &out, native_ordering);
+ }
+
+ Py_CLEAR(rep);
}
+ /* Cut back to size actually needed. This is necessary for, for example,
+ encoding of a string containing isolated surrogates and the 'ignore' handler
+ is used. */
+ nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
+ if (nsize != PyBytes_GET_SIZE(v))
+ _PyBytes_Resize(&v, nsize);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
done:
return v;
+ error:
+ Py_XDECREF(rep);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+ Py_XDECREF(v);
+ return NULL;
+#undef STORECHAR
}
PyObject *