author     Xiang Zhang <angwerzx@126.com>  2018-01-31 13:34:17 (GMT)
committer  GitHub <noreply@github.com>     2018-01-31 13:34:17 (GMT)
commit     ea94fce6960d90fffeeda131e31024617912d231 (patch)
tree       6ddaec1c139817e5f8d643038536ec8751ff75ac
parent     eb126eddbd7542ac9d7cd2736116aee2e0bd03dd (diff)
[3.6] bpo-32583: Fix possible crashing in builtin Unicode decoders (GH-5325) (#5459)
When using customized decode error handlers, it is possible for builtin decoders to write out-of-bounds and then crash. (cherry picked from commit 2c7fd46e11333ef5e5cce34212f7d087694f3658)
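In short, the crash pattern: a custom error handler reports a resume position before exc.end, so the decoder re-decodes the bytes in between and can emit more characters than the writer pre-allocated. A minimal reproducer sketch mirroring the first test below (the handler name "example.rewind_one" is illustrative, not part of the commit):

    import codecs

    def rewind_one(exc):
        # Resume one byte past the error start instead of at exc.end, so
        # the bytes between the two positions are decoded a second time
        # and can produce more characters than the writer reserved.
        if isinstance(exc, UnicodeDecodeError):
            return ('\ufffd', exc.start + 1)
        raise TypeError("don't know how to handle %r" % exc)

    codecs.register_error("example.rewind_one", rewind_one)

    # Before this fix, a run of lone surrogate bytes like this could make
    # the UTF-16 decoder write past its buffer (reliably so in a debug
    # build); with the fix it decodes to '\ufffd\ufffd\ufffd\ufffd\xd8\x00'.
    print(b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'
          .decode('utf-16-le', 'example.rewind_one'))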
-rw-r--r--  Lib/test/test_codeccallbacks.py                                             | 52
-rw-r--r--  Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst |  2
-rw-r--r--  Objects/unicodeobject.c                                                     | 22
3 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 0c066e6..e2e7463 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -1044,6 +1044,58 @@ class CodecCallbackTest(unittest.TestCase):
         for (encoding, data) in baddata:
             self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
 
+    # issue32583
+    def test_crashing_decode_handler(self):
+        # Generate one extra character to fill the extra space slot, so
+        # the failure is reliably reproducible in a debug build.
+        def forward_shorter_than_end(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                # size one character, 0 < forward < exc.end
+                return ('\ufffd', exc.start+1)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error(
+            "test.forward_shorter_than_end", forward_shorter_than_end)
+
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
+                'utf-16-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
+                'utf-16-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
+                'utf-32-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
+                'utf-32-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+
+        def replace_with_long(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                exc.object = b"\x00" * 8
+                return ('\ufffd', exc.start)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replace_with_long", replace_with_long)
+
+        self.assertEqual(
+            b'\x00'.decode('utf-16', 'test.replace_with_long'),
+            '\ufffd\x00\x00\x00\x00'
+        )
+        self.assertEqual(
+            b'\x00'.decode('utf-32', 'test.replace_with_long'),
+            '\ufffd\x00\x00'
+        )
+
+
     def test_fake_error_class(self):
         handlers = [
             codecs.strict_errors,
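The replace_with_long handler above exercises the other hazard this commit guards against: an error handler may replace exc.object wholesale, so the remaining input can be longer after the callback than before it. A standalone sketch of that contract, following the same pattern as the test (the name "example.grow_input" is illustrative):

    import codecs

    def grow_input(exc):
        # Swap the 1-byte input for 8 NUL bytes and restart at the error
        # position; the decoder must re-measure the remaining input after
        # the handler returns instead of trusting its original size.
        if isinstance(exc, UnicodeDecodeError):
            exc.object = b"\x00" * 8
            return ('\ufffd', exc.start)
        raise TypeError("don't know how to handle %r" % exc)

    codecs.register_error("example.grow_input", grow_input)

    # Eight NUL bytes decode to four U+0000 code points after the
    # replacement character, matching the utf-16 expectation in the test.
    assert b'\x00'.decode('utf-16', 'example.grow_input') == '\ufffd' + '\x00' * 4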
diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst b/Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst
new file mode 100644
index 0000000..45f1d04
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst
@@ -0,0 +1,2 @@
+Fix possible crashes in builtin Unicode decoders caused by out-of-bounds
+writes when using customized decode error handlers.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 9ccd06e..a246756 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4429,7 +4429,10 @@ unicode_decode_call_errorhandler_writer(
     Py_ssize_t insize;
     Py_ssize_t newpos;
     Py_ssize_t replen;
+    Py_ssize_t remain;
     PyObject *inputobj = NULL;
+    int need_to_grow = 0;
+    const char *new_inptr;
 
     if (*errorHandler == NULL) {
         *errorHandler = PyCodec_LookupError(errors);
@@ -4463,6 +4466,7 @@ unicode_decode_call_errorhandler_writer(
     if (!PyBytes_Check(inputobj)) {
         PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
     }
+    remain = *inend - *input - *endinpos;
     *input = PyBytes_AS_STRING(inputobj);
     insize = PyBytes_GET_SIZE(inputobj);
     *inend = *input + insize;
@@ -4482,6 +4486,19 @@ unicode_decode_call_errorhandler_writer(
     replen = PyUnicode_GET_LENGTH(repunicode);
     if (replen > 1) {
         writer->min_length += replen - 1;
+        need_to_grow = 1;
+    }
+    new_inptr = *input + newpos;
+    if (*inend - new_inptr > remain) {
+        /* We don't know the decoding algorithm here, so we make the
+           worst-case assumption that one byte decodes to one unicode
+           character.  If a byte could ever decode to more than one
+           character, the decoder might still write out of bounds; can
+           that happen for any of the algorithms using this function? */
+        writer->min_length += *inend - new_inptr - remain;
+        need_to_grow = 1;
+    }
+    if (need_to_grow) {
         writer->overallocate = 1;
         if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                                      PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@@ -4491,7 +4508,7 @@ unicode_decode_call_errorhandler_writer(
         goto onError;
 
     *endinpos = newpos;
-    *inptr = *input + newpos;
+    *inptr = new_inptr;
 
     /* we made it! */
     Py_XDECREF(restuple);
@@ -5663,7 +5680,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 #endif
 
     /* Note: size will always be longer than the resulting Unicode
-       character count */
+       character count normally.  The error handler will take care of
+       resizing when needed. */
     _PyUnicodeWriter_Init(&writer);
     writer.min_length = (e - q + 1) / 2;
     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
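To make the new sizing logic in unicode_decode_call_errorhandler_writer concrete, here is the arithmetic for the first UTF-16-LE test case, written out as a Python sketch (the variable names mirror the C locals; this is an illustration, not CPython code):

    # 8-byte input b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'; the first error
    # spans bytes [0, 2) and the handler resumes at newpos = 1.
    insize, endinpos, newpos = 8, 2, 1

    remain = insize - endinpos    # 6 bytes were left after the error span
    unconsumed = insize - newpos  # 7 bytes must be re-decoded after the handler
    extra = unconsumed - remain   # 1 extra slot, assuming one char per byte
    assert extra == 1             # writer->min_length grows by this amount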