summary | refs | log | tree | commit | diff | stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author: Xiang Zhang <angwerzx@126.com>, 2018-01-31 12:48:05 (GMT)
committer: GitHub <noreply@github.com>, 2018-01-31 12:48:05 (GMT)
commit: 2c7fd46e11333ef5e5cce34212f7d087694f3658 (patch)
tree: 0497c3b1fa32112a475fe3b7da5390b59205f7fd /Objects/unicodeobject.c
parent: 84521047e413d7d1150aaa1c333580b683b3f4b1 (diff)
download: cpython-2c7fd46e11333ef5e5cce34212f7d087694f3658.zip
cpython-2c7fd46e11333ef5e5cce34212f7d087694f3658.tar.gz
cpython-2c7fd46e11333ef5e5cce34212f7d087694f3658.tar.bz2
bpo-32583: Fix possible crashing in builtin Unicode decoders (#5325)
When a customized decode error handler is used, a builtin decoder could write out of bounds and then crash.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 22
1 files changed, 20 insertions, 2 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 775bd15..3d9e09d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4190,7 +4190,10 @@ unicode_decode_call_errorhandler_writer(
Py_ssize_t insize;
Py_ssize_t newpos;
Py_ssize_t replen;
+ Py_ssize_t remain;
PyObject *inputobj = NULL;
+ int need_to_grow = 0;
+ const char *new_inptr;
if (*errorHandler == NULL) {
*errorHandler = PyCodec_LookupError(errors);
@@ -4221,6 +4224,7 @@ unicode_decode_call_errorhandler_writer(
inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
if (!inputobj)
goto onError;
+ remain = *inend - *input - *endinpos;
*input = PyBytes_AS_STRING(inputobj);
insize = PyBytes_GET_SIZE(inputobj);
*inend = *input + insize;
@@ -4238,6 +4242,19 @@ unicode_decode_call_errorhandler_writer(
replen = PyUnicode_GET_LENGTH(repunicode);
if (replen > 1) {
writer->min_length += replen - 1;
+ need_to_grow = 1;
+ }
+ new_inptr = *input + newpos;
+ if (*inend - new_inptr > remain) {
+ /* We don't know the decoding algorithm here so we make the worst
+ assumption that one byte decodes to one unicode character.
+ If unfortunately one byte could decode to more unicode characters,
+ the decoder may write out-of-bound then. Is it possible for the
+ algorithms using this function? */
+ writer->min_length += *inend - new_inptr - remain;
+ need_to_grow = 1;
+ }
+ if (need_to_grow) {
writer->overallocate = 1;
if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@@ -4247,7 +4264,7 @@ unicode_decode_call_errorhandler_writer(
goto onError;
*endinpos = newpos;
- *inptr = *input + newpos;
+ *inptr = new_inptr;
/* we made it! */
Py_DECREF(restuple);
@@ -5572,7 +5589,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
#endif
/* Note: size will always be longer than the resulting Unicode
- character count */
+ character count normally. Error handler will take care of
+ resizing when needed. */
_PyUnicodeWriter_Init(&writer);
writer.min_length = (e - q + 1) / 2;
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)