summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2021-10-14 10:17:00 (GMT)
committer: GitHub <noreply@github.com> 2021-10-14 10:17:00 (GMT)
commit: c96d1546b11b4c282a7e21737cb1f5d16349656d (patch)
tree: 5e6e49378a4207591316f08e6b7fd42d4e3ff40c
parent: e71662c1ae817e728233ce93882c5b20f4c31ebc (diff)
download: cpython-c96d1546b11b4c282a7e21737cb1f5d16349656d.zip
cpython-c96d1546b11b4c282a7e21737cb1f5d16349656d.tar.gz
cpython-c96d1546b11b4c282a7e21737cb1f5d16349656d.tar.bz2
bpo-45461: Fix IncrementalDecoder and StreamReader in the "unicode-escape" codec (GH-28939)
They now support escape sequences that are split between input chunks. Add a third parameter, "final", to codecs.unicode_escape_decode(). It is True by default to match the former behavior.
-rw-r--r-- Include/cpython/unicodeobject.h 10
-rw-r--r-- Lib/encodings/unicode_escape.py 9
-rw-r--r-- Lib/test/test_codecs.py 50
-rw-r--r-- Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst 2
-rw-r--r-- Modules/_codecsmodule.c 13
-rw-r--r-- Modules/clinic/_codecsmodule.c.h 18
-rw-r--r-- Objects/unicodeobject.c 49
-rw-r--r-- Parser/string_parser.c 2
8 files changed, 121 insertions, 32 deletions
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index 0cbdbdb..bc5a3b4 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -777,12 +777,20 @@ PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
/* --- Unicode-Escape Codecs ---------------------------------------------- */
+/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
+ const char *string, /* Unicode-Escape encoded string */
+ Py_ssize_t length, /* size of string */
+ const char *errors, /* error handling */
+ Py_ssize_t *consumed /* bytes consumed */
+);
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
chars. */
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
const char *string, /* Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
+ Py_ssize_t *consumed, /* bytes consumed */
const char **first_invalid_escape /* on return, points to first
invalid escaped char in
string. */
diff --git a/Lib/encodings/unicode_escape.py b/Lib/encodings/unicode_escape.py
index 817f932..9b1ce99b 100644
--- a/Lib/encodings/unicode_escape.py
+++ b/Lib/encodings/unicode_escape.py
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.unicode_escape_encode(input, self.errors)[0]
-class IncrementalDecoder(codecs.IncrementalDecoder):
- def decode(self, input, final=False):
- return codecs.unicode_escape_decode(input, self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ def _buffer_decode(self, input, errors, final):
+ return codecs.unicode_escape_decode(input, errors, final)
class StreamWriter(Codec,codecs.StreamWriter):
pass
class StreamReader(Codec,codecs.StreamReader):
- pass
+ def decode(self, input, errors='strict'):
+ return codecs.unicode_escape_decode(input, errors, False)
### encodings module API
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index f1a149f..288a300 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -114,7 +114,7 @@ class ReadTest(MixInCheckStateHandling):
q = Queue(b"")
r = codecs.getreader(self.encoding)(q)
result = ""
- for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
+ for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
q.write(bytes([c]))
result += r.read()
self.assertEqual(result, partialresult)
@@ -125,7 +125,7 @@ class ReadTest(MixInCheckStateHandling):
# do the check again, this time using an incremental decoder
d = codecs.getincrementaldecoder(self.encoding)()
result = ""
- for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
+ for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
result += d.decode(bytes([c]))
self.assertEqual(result, partialresult)
# check that there's nothing left in the buffers
@@ -135,7 +135,7 @@ class ReadTest(MixInCheckStateHandling):
# Check whether the reset method works properly
d.reset()
result = ""
- for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
+ for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
result += d.decode(bytes([c]))
self.assertEqual(result, partialresult)
# check that there's nothing left in the buffers
@@ -2353,7 +2353,11 @@ class TypesTest(unittest.TestCase):
(r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
-class UnicodeEscapeTest(unittest.TestCase):
+class UnicodeEscapeTest(ReadTest, unittest.TestCase):
+ encoding = "unicode-escape"
+
+ test_lone_surrogates = None
+
def test_empty(self):
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
@@ -2440,6 +2444,44 @@ class UnicodeEscapeTest(unittest.TestCase):
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
+ def test_partial(self):
+ self.check_partial(
+ "\x00\t\n\r\\\xff\uffff\U00010000",
+ [
+ '',
+ '',
+ '',
+ '\x00',
+ '\x00',
+ '\x00\t',
+ '\x00\t',
+ '\x00\t\n',
+ '\x00\t\n',
+ '\x00\t\n\r',
+ '\x00\t\n\r',
+ '\x00\t\n\r\\',
+ '\x00\t\n\r\\',
+ '\x00\t\n\r\\',
+ '\x00\t\n\r\\',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff\U00010000',
+ ]
+ )
class RawUnicodeEscapeTest(unittest.TestCase):
def test_empty(self):
diff --git a/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst b/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst
new file mode 100644
index 0000000..c1c4ed1
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst
@@ -0,0 +1,2 @@
+Fix incremental decoder and stream reader in the "unicode-escape" codec.
+Previously they failed if the escape sequence was split.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index 2e8cb97..fc74127 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -489,17 +489,20 @@ _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
_codecs.unicode_escape_decode
data: Py_buffer(accept={str, buffer})
errors: str(accept={str, NoneType}) = None
+ final: bool(accept={int}) = True
/
[clinic start generated code]*/
static PyObject *
_codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
- const char *errors)
-/*[clinic end generated code: output=3ca3c917176b82ab input=8328081a3a569bd6]*/
+ const char *errors, int final)
+/*[clinic end generated code: output=b284f97b12c635ee input=6154f039a9f7c639]*/
{
- PyObject *decoded = PyUnicode_DecodeUnicodeEscape(data->buf, data->len,
- errors);
- return codec_tuple(decoded, data->len);
+ Py_ssize_t consumed = data->len;
+ PyObject *decoded = _PyUnicode_DecodeUnicodeEscapeStateful(data->buf, data->len,
+ errors,
+ final ? NULL : &consumed);
+ return codec_tuple(decoded, consumed);
}
/*[clinic input]
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h
index 43378f9..a7086dd 100644
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@@ -1063,7 +1063,7 @@ exit:
}
PyDoc_STRVAR(_codecs_unicode_escape_decode__doc__,
-"unicode_escape_decode($module, data, errors=None, /)\n"
+"unicode_escape_decode($module, data, errors=None, final=True, /)\n"
"--\n"
"\n");
@@ -1072,7 +1072,7 @@ PyDoc_STRVAR(_codecs_unicode_escape_decode__doc__,
static PyObject *
_codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
- const char *errors);
+ const char *errors, int final);
static PyObject *
_codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
@@ -1080,8 +1080,9 @@ _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_
PyObject *return_value = NULL;
Py_buffer data = {NULL, NULL};
const char *errors = NULL;
+ int final = 1;
- if (!_PyArg_CheckPositional("unicode_escape_decode", nargs, 1, 2)) {
+ if (!_PyArg_CheckPositional("unicode_escape_decode", nargs, 1, 3)) {
goto exit;
}
if (PyUnicode_Check(args[0])) {
@@ -1122,8 +1123,15 @@ _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_
_PyArg_BadArgument("unicode_escape_decode", "argument 2", "str or None", args[1]);
goto exit;
}
+ if (nargs < 3) {
+ goto skip_optional;
+ }
+ final = _PyLong_AsInt(args[2]);
+ if (final == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
skip_optional:
- return_value = _codecs_unicode_escape_decode_impl(module, &data, errors);
+ return_value = _codecs_unicode_escape_decode_impl(module, &data, errors, final);
exit:
/* Cleanup for data */
@@ -2801,4 +2809,4 @@ exit:
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=557c3b37e4c492ac input=a9049054013a1b77]*/
+/*[clinic end generated code: output=9e9fb1d5d81577e0 input=a9049054013a1b77]*/
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 741cf9d..af3b333 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6342,9 +6342,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
PyObject *
-_PyUnicode_DecodeUnicodeEscape(const char *s,
+_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
Py_ssize_t size,
const char *errors,
+ Py_ssize_t *consumed,
const char **first_invalid_escape)
{
const char *starts = s;
@@ -6357,6 +6358,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
*first_invalid_escape = NULL;
if (size == 0) {
+ if (consumed) {
+ *consumed = 0;
+ }
_Py_RETURN_UNICODE_EMPTY();
}
/* Escaped strings will always be longer than the resulting
@@ -6407,7 +6411,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
/* \ - Escapes */
if (s >= end) {
message = "\\ at end of string";
- goto error;
+ goto incomplete;
}
c = (unsigned char) *s++;
@@ -6461,7 +6465,10 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
count = 8;
message = "truncated \\UXXXXXXXX escape";
hexescape:
- for (ch = 0; count && s < end; ++s, --count) {
+ for (ch = 0; count; ++s, --count) {
+ if (s >= end) {
+ goto incomplete;
+ }
c = (unsigned char)*s;
ch <<= 4;
if (c >= '0' && c <= '9') {
@@ -6474,12 +6481,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
ch += c - ('A' - 10);
}
else {
- break;
+ goto error;
}
}
- if (count) {
- goto error;
- }
/* when we get here, ch is a 32-bit unicode character */
if (ch > MAX_UNICODE) {
@@ -6506,14 +6510,20 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
}
message = "malformed \\N character escape";
- if (s < end && *s == '{') {
+ if (s >= end) {
+ goto incomplete;
+ }
+ if (*s == '{') {
const char *start = ++s;
size_t namelen;
/* look for the closing brace */
while (s < end && *s != '}')
s++;
+ if (s >= end) {
+ goto incomplete;
+ }
namelen = s - start;
- if (namelen && s < end) {
+ if (namelen) {
/* found a name. look it up in the unicode database */
s++;
ch = 0xffffffff; /* in case 'getcode' messes up */
@@ -6539,6 +6549,11 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
continue;
}
+ incomplete:
+ if (consumed) {
+ *consumed = startinpos;
+ break;
+ }
error:
endinpos = s-starts;
writer.min_length = end - s + writer.pos;
@@ -6567,12 +6582,14 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
}
PyObject *
-PyUnicode_DecodeUnicodeEscape(const char *s,
+_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
Py_ssize_t size,
- const char *errors)
+ const char *errors,
+ Py_ssize_t *consumed)
{
const char *first_invalid_escape;
- PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
+ PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
+ consumed,
&first_invalid_escape);
if (result == NULL)
return NULL;
@@ -6587,6 +6604,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
return result;
}
+PyObject *
+PyUnicode_DecodeUnicodeEscape(const char *s,
+ Py_ssize_t size,
+ const char *errors)
+{
+ return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
+}
+
/* Return a Unicode-Escape string version of the Unicode object. */
PyObject *
diff --git a/Parser/string_parser.c b/Parser/string_parser.c
index cffe24e..c6fe99c 100644
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@@ -115,7 +115,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
s = buf;
const char *first_invalid_escape;
- v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
+ v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
if (v != NULL && first_invalid_escape != NULL) {
if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {