diff options
author | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2025-05-13 13:25:08 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-05-13 13:25:08 (GMT) |
commit | 69b4387f78f413e8c47572a85b3478c47eba8142 (patch) | |
tree | 30bc6e580011bda809575c500cf3692edcb6a220 /Objects | |
parent | f0a7a6c2cc066523fc03d312cfebff8135d81aa2 (diff) | |
download | cpython-69b4387f78f413e8c47572a85b3478c47eba8142.zip cpython-69b4387f78f413e8c47572a85b3478c47eba8142.tar.gz cpython-69b4387f78f413e8c47572a85b3478c47eba8142.tar.bz2 |
[3.14] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133942)
If the error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer to the decoded data will became invalid
after destroying that temporary bytes object. So we need other way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().
_PyBytes_DecodeEscape() does not have such issue, because it does not
use the error handlers registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
(cherry picked from commit 9f69a58623bd01349a18ba0c7a9cb1dad6a51e8e)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/bytesobject.c | 41 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 46 |
2 files changed, 51 insertions, 36 deletions
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index fc407ec..87ea116 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1075,10 +1075,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len, } /* Unescape a backslash-escaped string. */ -PyObject *_PyBytes_DecodeEscape(const char *s, +PyObject *_PyBytes_DecodeEscape2(const char *s, Py_ssize_t len, const char *errors, - const char **first_invalid_escape) + int *first_invalid_escape_char, + const char **first_invalid_escape_ptr) { int c; char *p; @@ -1092,7 +1093,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s, return NULL; writer.overallocate = 1; - *first_invalid_escape = NULL; + *first_invalid_escape_char = -1; + *first_invalid_escape_ptr = NULL; end = s + len; while (s < end) { @@ -1130,9 +1132,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s, c = (c<<3) + *s++ - '0'; } if (c > 0377) { - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-3; /* Back up 3 chars, since we've - already incremented s. */ + if (*first_invalid_escape_char == -1) { + *first_invalid_escape_char = c; + /* Back up 3 chars, since we've already incremented s. */ + *first_invalid_escape_ptr = s - 3; } } *p++ = c; @@ -1173,9 +1176,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s, break; default: - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-1; /* Back up one char, since we've - already incremented s. */ + if (*first_invalid_escape_char == -1) { + *first_invalid_escape_char = (unsigned char)s[-1]; + /* Back up one char, since we've already incremented s. */ + *first_invalid_escape_ptr = s - 1; } *p++ = '\\'; s--; @@ -1195,18 +1199,19 @@ PyObject *PyBytes_DecodeEscape(const char *s, Py_ssize_t Py_UNUSED(unicode), const char *Py_UNUSED(recode_encoding)) { - const char* first_invalid_escape; - PyObject *result = _PyBytes_DecodeEscape(s, len, errors, - &first_invalid_escape); + int first_invalid_escape_char; + const char *first_invalid_escape_ptr; + PyObject *result = _PyBytes_DecodeEscape2(s, len, errors, + &first_invalid_escape_char, + &first_invalid_escape_ptr); if (result == NULL) return NULL; - if (first_invalid_escape != NULL) { - unsigned char c = *first_invalid_escape; - if ('4' <= c && c <= '7') { + if (first_invalid_escape_char != -1) { + if (first_invalid_escape_char > 0xff) { if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "b\"\\%.3s\" is an invalid octal escape sequence. " + "b\"\\%o\" is an invalid octal escape sequence. " "Such sequences will not work in the future. ", - first_invalid_escape) < 0) + first_invalid_escape_char) < 0) { Py_DECREF(result); return NULL; @@ -1216,7 +1221,7 @@ PyObject *PyBytes_DecodeEscape(const char *s, if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "b\"\\%c\" is an invalid escape sequence. " "Such sequences will not work in the future. ", - c) < 0) + first_invalid_escape_char) < 0) { Py_DECREF(result); return NULL; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index eb3e1c4..4e1aafb 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6621,13 +6621,15 @@ _PyUnicode_GetNameCAPI(void) /* --- Unicode Escape Codec ----------------------------------------------- */ PyObject * -_PyUnicode_DecodeUnicodeEscapeInternal(const char *s, +_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s, Py_ssize_t size, const char *errors, Py_ssize_t *consumed, - const char **first_invalid_escape) + int *first_invalid_escape_char, + const char **first_invalid_escape_ptr) { const char *starts = s; + const char *initial_starts = starts; _PyUnicodeWriter writer; const char *end; PyObject *errorHandler = NULL; @@ -6635,7 +6637,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, _PyUnicode_Name_CAPI *ucnhash_capi; // so we can remember if we've seen an invalid escape char or not - *first_invalid_escape = NULL; + *first_invalid_escape_char = -1; + *first_invalid_escape_ptr = NULL; if (size == 0) { if (consumed) { @@ -6723,9 +6726,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, } } if (ch > 0377) { - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-3; /* Back up 3 chars, since we've - already incremented s. */ + if (*first_invalid_escape_char == -1) { + *first_invalid_escape_char = ch; + if (starts == initial_starts) { + /* Back up 3 chars, since we've already incremented s. */ + *first_invalid_escape_ptr = s - 3; + } } } WRITE_CHAR(ch); @@ -6820,9 +6826,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, goto error; default: - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-1; /* Back up one char, since we've - already incremented s. */ + if (*first_invalid_escape_char == -1) { + *first_invalid_escape_char = c; + if (starts == initial_starts) { + /* Back up one char, since we've already incremented s. */ + *first_invalid_escape_ptr = s - 1; + } } WRITE_ASCII_CHAR('\\'); WRITE_CHAR(c); @@ -6867,19 +6876,20 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, const char *errors, Py_ssize_t *consumed) { - const char *first_invalid_escape; - PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors, + int first_invalid_escape_char; + const char *first_invalid_escape_ptr; + PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors, consumed, - &first_invalid_escape); + &first_invalid_escape_char, + &first_invalid_escape_ptr); if (result == NULL) return NULL; - if (first_invalid_escape != NULL) { - unsigned char c = *first_invalid_escape; - if ('4' <= c && c <= '7') { + if (first_invalid_escape_char != -1) { + if (first_invalid_escape_char > 0xff) { if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "\"\\%.3s\" is an invalid octal escape sequence. " + "\"\\%o\" is an invalid octal escape sequence. " "Such sequences will not work in the future. ", - first_invalid_escape) < 0) + first_invalid_escape_char) < 0) { Py_DECREF(result); return NULL; @@ -6889,7 +6899,7 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "\"\\%c\" is an invalid escape sequence. " "Such sequences will not work in the future. ", - c) < 0) + first_invalid_escape_char) < 0) { Py_DECREF(result); return NULL; |