author     Xiang Zhang <angwerzx@126.com>    2018-01-31 13:34:17 (GMT)
committer  GitHub <noreply@github.com>       2018-01-31 13:34:17 (GMT)
commit     ea94fce6960d90fffeeda131e31024617912d231
tree       6ddaec1c139817e5f8d643038536ec8751ff75ac
parent     eb126eddbd7542ac9d7cd2736116aee2e0bd03dd
[3.6] bpo-32583: Fix possible crashing in builtin Unicode decoders (GH-5325) (#5459)
When using customized decode error handlers, it is possible for builtin decoders
to write out of bounds and then crash.
(cherry picked from commit 2c7fd46e11333ef5e5cce34212f7d087694f3658)
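
For context: a decode error handler registered via codecs.register_error() receives the UnicodeDecodeError and returns a (replacement, resume_position) tuple. The crash path opens when the handler resumes at a position before exc.end, making the decoder re-read bytes the output buffer was never sized for. A minimal sketch of the failing pattern (the "demo.*" handler name is illustrative; it mirrors the regression test added below):

import codecs

def forward_shorter_than_end(exc):
    if isinstance(exc, UnicodeDecodeError):
        # Resume one byte past the error start, which is still before
        # exc.end, so already-counted input bytes get decoded again.
        return ('\ufffd', exc.start + 1)
    raise TypeError("don't know how to handle %r" % exc)

codecs.register_error("demo.forward_shorter_than_end",
                      forward_shorter_than_end)

# Each 0xd8d8 code unit is a lone surrogate and therefore invalid
# UTF-16-LE; before this fix the extra replacement characters could
# overrun the writer's buffer (failing reliably in debug builds).
print(b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'
      .decode('utf-16-le', 'demo.forward_shorter_than_end'))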
-rw-r--r--  Lib/test/test_codeccallbacks.py                                              | 52
-rw-r--r--  Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst  |  2
-rw-r--r--  Objects/unicodeobject.c                                                      | 22

3 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 0c066e6..e2e7463 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -1044,6 +1044,58 @@ class CodecCallbackTest(unittest.TestCase):
         for (encoding, data) in baddata:
             self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
 
+    # issue32583
+    def test_crashing_decode_handler(self):
+        # better generating one more character to fill the extra space slot
+        # so in debug build it can steadily fail
+        def forward_shorter_than_end(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                # size one character, 0 < forward < exc.end
+                return ('\ufffd', exc.start+1)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error(
+            "test.forward_shorter_than_end", forward_shorter_than_end)
+
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
+                'utf-16-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
+                'utf-16-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
+                'utf-32-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
+                'utf-32-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+
+        def replace_with_long(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                exc.object = b"\x00" * 8
+                return ('\ufffd', exc.start)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replace_with_long", replace_with_long)
+
+        self.assertEqual(
+            b'\x00'.decode('utf-16', 'test.replace_with_long'),
+            '\ufffd\x00\x00\x00\x00'
+        )
+        self.assertEqual(
+            b'\x00'.decode('utf-32', 'test.replace_with_long'),
+            '\ufffd\x00\x00'
+        )
+
     def test_fake_error_class(self):
         handlers = [
             codecs.strict_errors,
diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst b/Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst
new file mode 100644
index 0000000..45f1d04
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst
@@ -0,0 +1,2 @@
+Fix possible crashing in builtin Unicode decoders caused by write
+out-of-bound errors when using customized decode error handlers.
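
The second pattern exercised by the test above abuses another part of the error handler contract: a handler may assign a new bytes object to exc.object, and decoding continues over that object from the returned position. If the replacement object is longer than the original input, the decoder again has more input than it sized its output for. A sketch of that pattern (the "demo.*" handler name is again illustrative):

import codecs

def replace_with_long(exc):
    if isinstance(exc, UnicodeDecodeError):
        # Swap the 1-byte input for 8 NUL bytes, then resume at 0.
        exc.object = b"\x00" * 8
        return ('\ufffd', exc.start)
    raise TypeError("don't know how to handle %r" % exc)

codecs.register_error("demo.replace_with_long", replace_with_long)

# The single byte is a truncated UTF-16 sequence, so the handler runs,
# grows the input, and decoding continues over the eight NUL bytes.
print(b'\x00'.decode('utf-16', 'demo.replace_with_long'))
# '\ufffd' followed by four U+0000 characters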
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 9ccd06e..a246756 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4429,7 +4429,10 @@ unicode_decode_call_errorhandler_writer(
     Py_ssize_t insize;
     Py_ssize_t newpos;
     Py_ssize_t replen;
+    Py_ssize_t remain;
     PyObject *inputobj = NULL;
+    int need_to_grow = 0;
+    const char *new_inptr;
 
     if (*errorHandler == NULL) {
         *errorHandler = PyCodec_LookupError(errors);
@@ -4463,6 +4466,7 @@ unicode_decode_call_errorhandler_writer(
     if (!PyBytes_Check(inputobj)) {
         PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
     }
+    remain = *inend - *input - *endinpos;
     *input = PyBytes_AS_STRING(inputobj);
     insize = PyBytes_GET_SIZE(inputobj);
     *inend = *input + insize;
@@ -4482,6 +4486,19 @@ unicode_decode_call_errorhandler_writer(
     replen = PyUnicode_GET_LENGTH(repunicode);
     if (replen > 1) {
         writer->min_length += replen - 1;
+        need_to_grow = 1;
+    }
+    new_inptr = *input + newpos;
+    if (*inend - new_inptr > remain) {
+        /* We don't know the decoding algorithm here so we make the worst
+           assumption that one byte decodes to one unicode character.
+           If unfortunately one byte could decode to more unicode characters,
+           the decoder may write out-of-bound then. Is it possible for the
+           algorithms using this function? */
+        writer->min_length += *inend - new_inptr - remain;
+        need_to_grow = 1;
+    }
+    if (need_to_grow) {
         writer->overallocate = 1;
         if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                                      PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@@ -4491,7 +4508,7 @@ unicode_decode_call_errorhandler_writer(
         goto onError;
 
     *endinpos = newpos;
-    *inptr = *input + newpos;
+    *inptr = new_inptr;
 
     /* we made it! */
     Py_XDECREF(restuple);
@@ -5663,7 +5680,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 #endif
 
     /* Note: size will always be longer than the resulting Unicode
-       character count */
+       character count normally. Error handler will take care of
+       resizing when needed. */
     _PyUnicodeWriter_Init(&writer);
     writer.min_length = (e - q + 1) / 2;
     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
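
In short, the fix records how many input bytes were still unread when the error handler was entered (remain), and after the handler returns it checks whether the bytes remaining from the handler-chosen resume position exceed that figure, either because the handler rewound or because it swapped in a longer exc.object. If so, writer->min_length grows by the difference under the worst-case assumption that every byte decodes to one character, and _PyUnicodeWriter_Prepare() resizes the buffer before decoding resumes. A rough Python model of that accounting (illustrative only; the names mirror the C locals, not any public API):

def grow_min_length(min_length, insize, endinpos, newpos, new_insize, replen):
    # Bytes that were still unread when the error handler was entered.
    remain = insize - endinpos
    need_to_grow = False
    if replen > 1:
        # Replacement is longer than the one slot reserved for it.
        min_length += replen - 1
        need_to_grow = True
    # Bytes left after resuming at newpos in the (possibly replaced,
    # possibly longer) input object.
    if new_insize - newpos > remain:
        # Worst case: one byte decodes to one character.
        min_length += (new_insize - newpos) - remain
        need_to_grow = True
    return min_length, need_to_grow

With the buffer grown up front, optimistic size estimates such as the (e - q + 1) / 2 used by PyUnicode_DecodeUTF16Stateful() stay safe even when a handler rewinds or enlarges the input.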