diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-05-02 09:37:48 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-02 09:37:48 (GMT) |
commit | 18b07d773e09a2719e69aeaa925d5abb7ba0c068 (patch) | |
tree | 24e00867bcd614057ca886b2e977153d6168fa59 /Objects | |
parent | 614420df9796c8a4f01e24052fc0128b4c20c5bf (diff) | |
download | cpython-18b07d773e09a2719e69aeaa925d5abb7ba0c068.zip cpython-18b07d773e09a2719e69aeaa925d5abb7ba0c068.tar.gz cpython-18b07d773e09a2719e69aeaa925d5abb7ba0c068.tar.bz2 |
bpo-36819: Fix crashes in built-in encoders with weird error handlers (GH-28593)
If the error handler returns position less or equal than the starting
position of non-encodable characters, most of built-in encoders didn't
properly re-size the output buffer. This led to out-of-bounds writes,
and segfaults.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/stringlib/codecs.h | 15 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 60 |
2 files changed, 52 insertions, 23 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index b17cda1..958cc86 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -387,8 +387,19 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, if (!rep) goto error; - /* subtract preallocated bytes */ - writer->min_size -= max_char_size * (newpos - startpos); + if (newpos < startpos) { + writer->overallocate = 1; + p = _PyBytesWriter_Prepare(writer, p, + max_char_size * (startpos - newpos)); + if (p == NULL) + goto error; + } + else { + /* subtract preallocated bytes */ + writer->min_size -= max_char_size * (newpos - startpos); + /* Only overallocate the buffer if it's not the last write */ + writer->overallocate = (newpos < size); + } if (PyBytes_Check(rep)) { p = _PyBytesWriter_WriteBytes(writer, p, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4933075..36a5296 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5868,7 +5868,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, pos = 0; while (pos < len) { - Py_ssize_t repsize, moreunits; + Py_ssize_t newpos, repsize, moreunits; if (kind == PyUnicode_2BYTE_KIND) { pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, @@ -5885,7 +5885,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, rep = unicode_encode_call_errorhandler( errors, &errorHandler, encoding, "surrogates not allowed", - str, &exc, pos, pos + 1, &pos); + str, &exc, pos, pos + 1, &newpos); if (!rep) goto error; @@ -5893,7 +5893,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, repsize = PyBytes_GET_SIZE(rep); if (repsize & 3) { raise_encode_exception(&exc, encoding, - str, pos - 1, pos, + str, pos, pos + 1, "surrogates not allowed"); goto error; } @@ -5906,28 +5906,30 @@ _PyUnicode_EncodeUTF32(PyObject *str, moreunits = repsize = PyUnicode_GET_LENGTH(rep); if (!PyUnicode_IS_ASCII(rep)) { raise_encode_exception(&exc, encoding, - str, pos - 1, pos, + str, pos, pos + 1, "surrogates not allowed"); goto error; } } + moreunits += pos - newpos; + pos = newpos; /* four bytes are reserved for each surrogate */ - if (moreunits > 1) { + if (moreunits > 0) { Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v); if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) { /* integer overflow */ PyErr_NoMemory(); goto error; } - if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0) + if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0) goto error; out = (uint32_t*) PyBytes_AS_STRING(v) + outpos; } if (PyBytes_Check(rep)) { memcpy(out, PyBytes_AS_STRING(rep), repsize); - out += moreunits; + out += repsize / 4; } else /* rep is unicode */ { assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, @@ -6205,7 +6207,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, pos = 0; while (pos < len) { - Py_ssize_t repsize, moreunits; + Py_ssize_t newpos, repsize, moreunits; if (kind == PyUnicode_2BYTE_KIND) { pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, @@ -6222,7 +6224,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, rep = unicode_encode_call_errorhandler( errors, &errorHandler, encoding, "surrogates not allowed", - str, &exc, pos, pos + 1, &pos); + str, &exc, pos, pos + 1, &newpos); if (!rep) goto error; @@ -6230,7 +6232,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, repsize = PyBytes_GET_SIZE(rep); if (repsize & 1) { raise_encode_exception(&exc, encoding, - str, pos - 1, pos, + str, pos, pos + 1, "surrogates not allowed"); goto error; } @@ -6243,28 +6245,30 @@ _PyUnicode_EncodeUTF16(PyObject *str, moreunits = repsize = PyUnicode_GET_LENGTH(rep); if (!PyUnicode_IS_ASCII(rep)) { raise_encode_exception(&exc, encoding, - str, pos - 1, pos, + str, pos, pos + 1, "surrogates not allowed"); goto error; } } + moreunits += pos - newpos; + pos = newpos; /* two bytes are reserved for each surrogate */ - if (moreunits > 1) { + if (moreunits > 0) { Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) { /* integer overflow */ PyErr_NoMemory(); goto error; } - if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0) + if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0) goto error; out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; } if (PyBytes_Check(rep)) { memcpy(out, PyBytes_AS_STRING(rep), repsize); - out += moreunits; + out += repsize / 2; } else /* rep is unicode */ { assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, @@ -7167,8 +7171,19 @@ unicode_encode_ucs1(PyObject *unicode, if (rep == NULL) goto onError; - /* subtract preallocated bytes */ - writer.min_size -= newpos - collstart; + if (newpos < collstart) { + writer.overallocate = 1; + str = _PyBytesWriter_Prepare(&writer, str, + collstart - newpos); + if (str == NULL) + goto onError; + } + else { + /* subtract preallocated bytes */ + writer.min_size -= newpos - collstart; + /* Only overallocate the buffer if it's not the last write */ + writer.overallocate = (newpos < size); + } if (PyBytes_Check(rep)) { /* Directly copy bytes result to output. */ @@ -7944,13 +7959,14 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, pos, pos + 1, &newpos); if (rep == NULL) goto error; - pos = newpos; + Py_ssize_t morebytes = pos - newpos; if (PyBytes_Check(rep)) { outsize = PyBytes_GET_SIZE(rep); - if (outsize != 1) { + morebytes += outsize; + if (morebytes > 0) { Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); - newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); + newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes; if (_PyBytes_Resize(outbytes, newoutsize) < 0) { Py_DECREF(rep); goto error; @@ -7971,9 +7987,10 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, } outsize = PyUnicode_GET_LENGTH(rep); - if (outsize != 1) { + morebytes += outsize; + if (morebytes > 0) { Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); - newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); + newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes; if (_PyBytes_Resize(outbytes, newoutsize) < 0) { Py_DECREF(rep); goto error; @@ -7996,6 +8013,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, out++; } } + pos = newpos; Py_DECREF(rep); } /* write a NUL byte */ |