diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2021-10-14 10:17:00 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-10-14 10:17:00 (GMT) |
commit | c96d1546b11b4c282a7e21737cb1f5d16349656d (patch) | |
tree | 5e6e49378a4207591316f08e6b7fd42d4e3ff40c /Objects | |
parent | e71662c1ae817e728233ce93882c5b20f4c31ebc (diff) | |
download | cpython-c96d1546b11b4c282a7e21737cb1f5d16349656d.zip cpython-c96d1546b11b4c282a7e21737cb1f5d16349656d.tar.gz cpython-c96d1546b11b4c282a7e21737cb1f5d16349656d.tar.bz2 |
bpo-45461: Fix IncrementalDecoder and StreamReader in the "unicode-escape" codec (GH-28939)
They now support splitting escape sequences between input chunks.
Add a third parameter, "final", to codecs.unicode_escape_decode().
It is True by default to match the former behavior.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 49 |
1 file changed, 37 insertions, 12 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 741cf9d..af3b333 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6342,9 +6342,10 @@ PyUnicode_AsUTF16String(PyObject *unicode) static _PyUnicode_Name_CAPI *ucnhash_capi = NULL; PyObject * -_PyUnicode_DecodeUnicodeEscape(const char *s, +_PyUnicode_DecodeUnicodeEscapeInternal(const char *s, Py_ssize_t size, const char *errors, + Py_ssize_t *consumed, const char **first_invalid_escape) { const char *starts = s; @@ -6357,6 +6358,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, *first_invalid_escape = NULL; if (size == 0) { + if (consumed) { + *consumed = 0; + } _Py_RETURN_UNICODE_EMPTY(); } /* Escaped strings will always be longer than the resulting @@ -6407,7 +6411,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, /* \ - Escapes */ if (s >= end) { message = "\\ at end of string"; - goto error; + goto incomplete; } c = (unsigned char) *s++; @@ -6461,7 +6465,10 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, count = 8; message = "truncated \\UXXXXXXXX escape"; hexescape: - for (ch = 0; count && s < end; ++s, --count) { + for (ch = 0; count; ++s, --count) { + if (s >= end) { + goto incomplete; + } c = (unsigned char)*s; ch <<= 4; if (c >= '0' && c <= '9') { @@ -6474,12 +6481,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, ch += c - ('A' - 10); } else { - break; + goto error; } } - if (count) { - goto error; - } /* when we get here, ch is a 32-bit unicode character */ if (ch > MAX_UNICODE) { @@ -6506,14 +6510,20 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, } message = "malformed \\N character escape"; - if (s < end && *s == '{') { + if (s >= end) { + goto incomplete; + } + if (*s == '{') { const char *start = ++s; size_t namelen; /* look for the closing brace */ while (s < end && *s != '}') s++; + if (s >= end) { + goto incomplete; + } namelen = s - start; - if (namelen && s < end) { + if (namelen) { /* found a name. 
look it up in the unicode database */ s++; ch = 0xffffffff; /* in case 'getcode' messes up */ @@ -6539,6 +6549,11 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, continue; } + incomplete: + if (consumed) { + *consumed = startinpos; + break; + } error: endinpos = s-starts; writer.min_length = end - s + writer.pos; @@ -6567,12 +6582,14 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, } PyObject * -PyUnicode_DecodeUnicodeEscape(const char *s, +_PyUnicode_DecodeUnicodeEscapeStateful(const char *s, Py_ssize_t size, - const char *errors) + const char *errors, + Py_ssize_t *consumed) { const char *first_invalid_escape; - PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors, + PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors, + consumed, &first_invalid_escape); if (result == NULL) return NULL; @@ -6587,6 +6604,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s, return result; } +PyObject * +PyUnicode_DecodeUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL); +} + /* Return a Unicode-Escape string version of the Unicode object. */ PyObject * |