From c96d1546b11b4c282a7e21737cb1f5d16349656d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 14 Oct 2021 13:17:00 +0300 Subject: bpo-45461: Fix IncrementalDecoder and StreamReader in the "unicode-escape" codec (GH-28939) They now support splitting escape sequences between input chunks. Add a third parameter, "final", to codecs.unicode_escape_decode(). It is True by default to match the former behavior. --- Include/cpython/unicodeobject.h | 10 ++++- Lib/encodings/unicode_escape.py | 9 ++-- Lib/test/test_codecs.py | 50 ++++++++++++++++++++-- .../2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst | 2 + Modules/_codecsmodule.c | 13 +++--- Modules/clinic/_codecsmodule.c.h | 18 +++++--- Objects/unicodeobject.c | 49 +++++++++++++++------ Parser/string_parser.c | 2 +- 8 files changed, 121 insertions(+), 32 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 0cbdbdb..bc5a3b4 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -777,12 +777,20 @@ PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( /* --- Unicode-Escape Codecs ---------------------------------------------- */ +/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */ +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful( + const char *string, /* Unicode-Escape encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ +); /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape chars. 
*/ -PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape( +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal( const char *string, /* Unicode-Escape encoded string */ Py_ssize_t length, /* size of string */ const char *errors, /* error handling */ + Py_ssize_t *consumed, /* bytes consumed */ const char **first_invalid_escape /* on return, points to first invalid escaped char in string. */ diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py index 817f932..9b1ce99b 100644 --- a/Lib/encodings/unicode_escape.py +++ b/Lib/encodings/unicode_escape.py @@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): return codecs.unicode_escape_encode(input, self.errors)[0] -class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return codecs.unicode_escape_decode(input, self.errors)[0] +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def _buffer_decode(self, input, errors, final): + return codecs.unicode_escape_decode(input, errors, final) class StreamWriter(Codec,codecs.StreamWriter): pass class StreamReader(Codec,codecs.StreamReader): - pass + def decode(self, input, errors='strict'): + return codecs.unicode_escape_decode(input, errors, False) ### encodings module API diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index f1a149f..288a300 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -114,7 +114,7 @@ class ReadTest(MixInCheckStateHandling): q = Queue(b"") r = codecs.getreader(self.encoding)(q) result = "" - for (c, partialresult) in zip(input.encode(self.encoding), partialresults): + for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True): q.write(bytes([c])) result += r.read() self.assertEqual(result, partialresult) @@ -125,7 +125,7 @@ class ReadTest(MixInCheckStateHandling): # do the check again, this time using an incremental decoder d = 
codecs.getincrementaldecoder(self.encoding)() result = "" - for (c, partialresult) in zip(input.encode(self.encoding), partialresults): + for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True): result += d.decode(bytes([c])) self.assertEqual(result, partialresult) # check that there's nothing left in the buffers @@ -135,7 +135,7 @@ class ReadTest(MixInCheckStateHandling): # Check whether the reset method works properly d.reset() result = "" - for (c, partialresult) in zip(input.encode(self.encoding), partialresults): + for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True): result += d.decode(bytes([c])) self.assertEqual(result, partialresult) # check that there's nothing left in the buffers @@ -2353,7 +2353,11 @@ class TypesTest(unittest.TestCase): (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) -class UnicodeEscapeTest(unittest.TestCase): +class UnicodeEscapeTest(ReadTest, unittest.TestCase): + encoding = "unicode-escape" + + test_lone_surrogates = None + def test_empty(self): self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0)) self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0)) @@ -2440,6 +2444,44 @@ class UnicodeEscapeTest(unittest.TestCase): self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) + def test_partial(self): + self.check_partial( + "\x00\t\n\r\\\xff\uffff\U00010000", + [ + '', + '', + '', + '\x00', + '\x00', + '\x00\t', + '\x00\t', + '\x00\t\n', + '\x00\t\n', + '\x00\t\n\r', + '\x00\t\n\r', + '\x00\t\n\r\\', + '\x00\t\n\r\\', + '\x00\t\n\r\\', + '\x00\t\n\r\\', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + 
'\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff\U00010000', + ] + ) class RawUnicodeEscapeTest(unittest.TestCase): def test_empty(self): diff --git a/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst b/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst new file mode 100644 index 0000000..c1c4ed1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst @@ -0,0 +1,2 @@ +Fix incremental decoder and stream reader in the "unicode-escape" codec. +Previously they failed if the escape sequence was split. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 2e8cb97..fc74127 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -489,17 +489,20 @@ _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data, _codecs.unicode_escape_decode data: Py_buffer(accept={str, buffer}) errors: str(accept={str, NoneType}) = None + final: bool(accept={int}) = True / [clinic start generated code]*/ static PyObject * _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, - const char *errors) -/*[clinic end generated code: output=3ca3c917176b82ab input=8328081a3a569bd6]*/ + const char *errors, int final) +/*[clinic end generated code: output=b284f97b12c635ee input=6154f039a9f7c639]*/ { - PyObject *decoded = PyUnicode_DecodeUnicodeEscape(data->buf, data->len, - errors); - return codec_tuple(decoded, data->len); + Py_ssize_t consumed = data->len; + PyObject *decoded = _PyUnicode_DecodeUnicodeEscapeStateful(data->buf, data->len, + errors, + final ? 
NULL : &consumed); + return codec_tuple(decoded, consumed); } /*[clinic input] diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index 43378f9..a7086dd 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -1063,7 +1063,7 @@ exit: } PyDoc_STRVAR(_codecs_unicode_escape_decode__doc__, -"unicode_escape_decode($module, data, errors=None, /)\n" +"unicode_escape_decode($module, data, errors=None, final=True, /)\n" "--\n" "\n"); @@ -1072,7 +1072,7 @@ PyDoc_STRVAR(_codecs_unicode_escape_decode__doc__, static PyObject * _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, - const char *errors); + const char *errors, int final); static PyObject * _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs) @@ -1080,8 +1080,9 @@ _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_ PyObject *return_value = NULL; Py_buffer data = {NULL, NULL}; const char *errors = NULL; + int final = 1; - if (!_PyArg_CheckPositional("unicode_escape_decode", nargs, 1, 2)) { + if (!_PyArg_CheckPositional("unicode_escape_decode", nargs, 1, 3)) { goto exit; } if (PyUnicode_Check(args[0])) { @@ -1122,8 +1123,15 @@ _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_ _PyArg_BadArgument("unicode_escape_decode", "argument 2", "str or None", args[1]); goto exit; } + if (nargs < 3) { + goto skip_optional; + } + final = _PyLong_AsInt(args[2]); + if (final == -1 && PyErr_Occurred()) { + goto exit; + } skip_optional: - return_value = _codecs_unicode_escape_decode_impl(module, &data, errors); + return_value = _codecs_unicode_escape_decode_impl(module, &data, errors, final); exit: /* Cleanup for data */ @@ -2801,4 +2809,4 @@ exit: #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=557c3b37e4c492ac 
input=a9049054013a1b77]*/ +/*[clinic end generated code: output=9e9fb1d5d81577e0 input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 741cf9d..af3b333 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6342,9 +6342,10 @@ PyUnicode_AsUTF16String(PyObject *unicode) static _PyUnicode_Name_CAPI *ucnhash_capi = NULL; PyObject * -_PyUnicode_DecodeUnicodeEscape(const char *s, +_PyUnicode_DecodeUnicodeEscapeInternal(const char *s, Py_ssize_t size, const char *errors, + Py_ssize_t *consumed, const char **first_invalid_escape) { const char *starts = s; @@ -6357,6 +6358,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, *first_invalid_escape = NULL; if (size == 0) { + if (consumed) { + *consumed = 0; + } _Py_RETURN_UNICODE_EMPTY(); } /* Escaped strings will always be longer than the resulting @@ -6407,7 +6411,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, /* \ - Escapes */ if (s >= end) { message = "\\ at end of string"; - goto error; + goto incomplete; } c = (unsigned char) *s++; @@ -6461,7 +6465,10 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, count = 8; message = "truncated \\UXXXXXXXX escape"; hexescape: - for (ch = 0; count && s < end; ++s, --count) { + for (ch = 0; count; ++s, --count) { + if (s >= end) { + goto incomplete; + } c = (unsigned char)*s; ch <<= 4; if (c >= '0' && c <= '9') { @@ -6474,12 +6481,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, ch += c - ('A' - 10); } else { - break; + goto error; } } - if (count) { - goto error; - } /* when we get here, ch is a 32-bit unicode character */ if (ch > MAX_UNICODE) { @@ -6506,14 +6510,20 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, } message = "malformed \\N character escape"; - if (s < end && *s == '{') { + if (s >= end) { + goto incomplete; + } + if (*s == '{') { const char *start = ++s; size_t namelen; /* look for the closing brace */ while (s < end && *s != '}') s++; + if (s >= end) { + goto incomplete; + } namelen = s - start; - if 
(namelen && s < end) { + if (namelen) { /* found a name. look it up in the unicode database */ s++; ch = 0xffffffff; /* in case 'getcode' messes up */ @@ -6539,6 +6549,11 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, continue; } + incomplete: + if (consumed) { + *consumed = startinpos; + break; + } error: endinpos = s-starts; writer.min_length = end - s + writer.pos; @@ -6567,12 +6582,14 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, } PyObject * -PyUnicode_DecodeUnicodeEscape(const char *s, +_PyUnicode_DecodeUnicodeEscapeStateful(const char *s, Py_ssize_t size, - const char *errors) + const char *errors, + Py_ssize_t *consumed) { const char *first_invalid_escape; - PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors, + PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors, + consumed, &first_invalid_escape); if (result == NULL) return NULL; @@ -6587,6 +6604,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s, return result; } +PyObject * +PyUnicode_DecodeUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL); +} + /* Return a Unicode-Escape string version of the Unicode object. */ PyObject * diff --git a/Parser/string_parser.c b/Parser/string_parser.c index cffe24e..c6fe99c 100644 --- a/Parser/string_parser.c +++ b/Parser/string_parser.c @@ -115,7 +115,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) s = buf; const char *first_invalid_escape; - v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape); + v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape); if (v != NULL && first_invalid_escape != NULL) { if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) { -- cgit v0.12