diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2021-10-14 18:23:39 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-10-14 18:23:39 (GMT) |
commit | 4641afef661e6a22bc64194bd334b161c95edfe2 (patch) | |
tree | d053338921b36eb41bd0a1880c801a33cbec7878 /Objects | |
parent | 0bff4ccbfd3297b0adf690655d3e9ddb0033bc69 (diff) | |
download | cpython-4641afef661e6a22bc64194bd334b161c95edfe2.zip cpython-4641afef661e6a22bc64194bd334b161c95edfe2.tar.gz cpython-4641afef661e6a22bc64194bd334b161c95edfe2.tar.bz2 |
[3.10] bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944) (GH-28952)
They support now splitting escape sequences between input chunks.
Add the third parameter "final" in codecs.raw_unicode_escape_decode().
It is True by default to match the former behavior.
(cherry picked from commit 39aa98346d5dd8ac591a7cafb467af21c53f1e5d)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 64 |
1 files changed, 44 insertions, 20 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e660834..c728710 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6445,8 +6445,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_ASCII_CHAR(ch) \ @@ -6473,7 +6471,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, continue; } - startinpos = s - starts - 1; + Py_ssize_t startinpos = s - starts - 1; /* \ - Escapes */ if (s >= end) { message = "\\ at end of string"; @@ -6620,8 +6618,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, *consumed = startinpos; break; } - error: - endinpos = s-starts; + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6816,9 +6814,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, /* --- Raw Unicode Escape Codec ------------------------------------------- */ PyObject * -PyUnicode_DecodeRawUnicodeEscape(const char *s, - Py_ssize_t size, - const char *errors) +_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) { const char *starts = s; _PyUnicodeWriter writer; @@ -6827,6 +6826,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, PyObject *exc = NULL; if (size == 0) { + if (consumed) { + *consumed = 0; + } _Py_RETURN_UNICODE_EMPTY(); } @@ -6845,8 +6847,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_CHAR(ch) \ @@ -6861,11 +6861,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, } while(0) /* Non-escape characters are interpreted as Unicode ordinals */ - if (c != '\\' || s >= end) { + if (c != '\\' || (s >= end && !consumed)) { WRITE_CHAR(c); continue; } + Py_ssize_t startinpos = s - starts - 1; + /* \ - Escapes */ + if (s >= end) { + assert(consumed); + // Set message to silent compiler warning. + // Actually it is never used. + message = "\\ at end of string"; + goto incomplete; + } + c = (unsigned char) *s++; if (c == 'u') { count = 4; @@ -6881,10 +6891,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, WRITE_CHAR(c); continue; } - startinpos = s - starts - 2; /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ - for (ch = 0; count && s < end; ++s, --count) { + for (ch = 0; count; ++s, --count) { + if (s >= end) { + goto incomplete; + } c = (unsigned char)*s; ch <<= 4; if (c >= '0' && c <= '9') { @@ -6897,18 +6909,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ch += c - ('A' - 10); } else { - break; + goto error; } } - if (!count) { - if (ch <= MAX_UNICODE) { - WRITE_CHAR(ch); - continue; - } + if (ch > MAX_UNICODE) { message = "\\Uxxxxxxxx out of range"; + goto error; } + WRITE_CHAR(ch); + continue; - endinpos = s-starts; + incomplete: + if (consumed) { + *consumed = startinpos; + break; + } + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6930,7 +6947,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, Py_XDECREF(errorHandler); Py_XDECREF(exc); return NULL; +} +PyObject * +PyUnicode_DecodeRawUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL); } |