diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2021-10-14 17:04:19 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-10-14 17:04:19 (GMT) |
commit | 39aa98346d5dd8ac591a7cafb467af21c53f1e5d (patch) | |
tree | d363b62f299171467fce0cd3e1fe155c2ca41a09 /Objects | |
parent | d413c503636cde2a6ab0ada25dccb0134633a8e6 (diff) | |
download | cpython-39aa98346d5dd8ac591a7cafb467af21c53f1e5d.zip cpython-39aa98346d5dd8ac591a7cafb467af21c53f1e5d.tar.gz cpython-39aa98346d5dd8ac591a7cafb467af21c53f1e5d.tar.bz2 |
bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944)
They support now splitting escape sequences between input chunks.
Add the third parameter "final" in codecs.raw_unicode_escape_decode().
It is True by default to match the former behavior.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 64 |
1 files changed, 44 insertions, 20 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index af3b333..386052f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6379,8 +6379,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_ASCII_CHAR(ch) \ @@ -6407,7 +6405,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, continue; } - startinpos = s - starts - 1; + Py_ssize_t startinpos = s - starts - 1; /* \ - Escapes */ if (s >= end) { message = "\\ at end of string"; @@ -6554,8 +6552,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, *consumed = startinpos; break; } - error: - endinpos = s-starts; + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6735,9 +6733,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* --- Raw Unicode Escape Codec ------------------------------------------- */ PyObject * -PyUnicode_DecodeRawUnicodeEscape(const char *s, - Py_ssize_t size, - const char *errors) +_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) { const char *starts = s; _PyUnicodeWriter writer; @@ -6746,6 +6745,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, PyObject *exc = NULL; if (size == 0) { + if (consumed) { + *consumed = 0; + } _Py_RETURN_UNICODE_EMPTY(); } @@ -6764,8 +6766,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_CHAR(ch) \ @@ -6780,11 +6780,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, } while(0) /* Non-escape characters are interpreted as Unicode ordinals */ - if (c != '\\' || s >= end) { + if (c != '\\' || (s >= end && !consumed)) { WRITE_CHAR(c); continue; } + Py_ssize_t startinpos = s - starts - 1; + /* \ - Escapes */ + if (s >= end) { + assert(consumed); + // Set message to silent compiler warning. + // Actually it is never used. + message = "\\ at end of string"; + goto incomplete; + } + c = (unsigned char) *s++; if (c == 'u') { count = 4; @@ -6800,10 +6810,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, WRITE_CHAR(c); continue; } - startinpos = s - starts - 2; /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ - for (ch = 0; count && s < end; ++s, --count) { + for (ch = 0; count; ++s, --count) { + if (s >= end) { + goto incomplete; + } c = (unsigned char)*s; ch <<= 4; if (c >= '0' && c <= '9') { @@ -6816,18 +6828,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ch += c - ('A' - 10); } else { - break; + goto error; } } - if (!count) { - if (ch <= MAX_UNICODE) { - WRITE_CHAR(ch); - continue; - } + if (ch > MAX_UNICODE) { message = "\\Uxxxxxxxx out of range"; + goto error; } + WRITE_CHAR(ch); + continue; - endinpos = s-starts; + incomplete: + if (consumed) { + *consumed = startinpos; + break; + } + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6849,7 +6866,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, Py_XDECREF(errorHandler); Py_XDECREF(exc); return NULL; +} +PyObject * +PyUnicode_DecodeRawUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL); } |