summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2021-10-14 17:04:19 (GMT)
committer: GitHub <noreply@github.com> 2021-10-14 17:04:19 (GMT)
commit: 39aa98346d5dd8ac591a7cafb467af21c53f1e5d (patch)
tree: d363b62f299171467fce0cd3e1fe155c2ca41a09
parent: d413c503636cde2a6ab0ada25dccb0134633a8e6 (diff)
download: cpython-39aa98346d5dd8ac591a7cafb467af21c53f1e5d.zip
cpython-39aa98346d5dd8ac591a7cafb467af21c53f1e5d.tar.gz
cpython-39aa98346d5dd8ac591a7cafb467af21c53f1e5d.tar.bz2
bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944)
They now support splitting escape sequences between input chunks. Add a third parameter, "final", to codecs.raw_unicode_escape_decode(). It is True by default to match the former behavior.
-rw-r--r-- Include/cpython/unicodeobject.h 10
-rw-r--r-- Lib/encodings/raw_unicode_escape.py 9
-rw-r--r-- Lib/test/test_codecs.py 35
-rw-r--r-- Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst 2
-rw-r--r-- Modules/_codecsmodule.c 13
-rw-r--r-- Modules/clinic/_codecsmodule.c.h 18
-rw-r--r-- Objects/unicodeobject.c 64
7 files changed, 116 insertions, 35 deletions
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index bc5a3b4..ab4aebf 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -796,6 +796,16 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
string. */
);
+/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
+
+/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
+ const char *string, /* Unicode-Escape encoded string */
+ Py_ssize_t length, /* size of string */
+ const char *errors, /* error handling */
+ Py_ssize_t *consumed /* bytes consumed */
+);
+
/* --- Latin-1 Codecs ----------------------------------------------------- */
PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py
index 2b919b4..46c8e07 100644
--- a/Lib/encodings/raw_unicode_escape.py
+++ b/Lib/encodings/raw_unicode_escape.py
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.raw_unicode_escape_encode(input, self.errors)[0]
-class IncrementalDecoder(codecs.IncrementalDecoder):
- def decode(self, input, final=False):
- return codecs.raw_unicode_escape_decode(input, self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ def _buffer_decode(self, input, errors, final):
+ return codecs.raw_unicode_escape_decode(input, errors, final)
class StreamWriter(Codec,codecs.StreamWriter):
pass
class StreamReader(Codec,codecs.StreamReader):
- pass
+ def decode(self, input, errors='strict'):
+ return codecs.raw_unicode_escape_decode(input, errors, False)
### encodings module API
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 288a300..506b51c 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -2483,7 +2483,11 @@ class UnicodeEscapeTest(ReadTest, unittest.TestCase):
]
)
-class RawUnicodeEscapeTest(unittest.TestCase):
+class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
+ encoding = "raw-unicode-escape"
+
+ test_lone_surrogates = None
+
def test_empty(self):
self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
@@ -2532,6 +2536,35 @@ class RawUnicodeEscapeTest(unittest.TestCase):
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
+ def test_partial(self):
+ self.check_partial(
+ "\x00\t\n\r\\\xff\uffff\U00010000",
+ [
+ '\x00',
+ '\x00\t',
+ '\x00\t\n',
+ '\x00\t\n\r',
+ '\x00\t\n\r',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff',
+ '\x00\t\n\r\\\xff\uffff\U00010000',
+ ]
+ )
+
class EscapeEncodeTest(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst b/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst
new file mode 100644
index 0000000..f2c0ae4
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst
@@ -0,0 +1,2 @@
+Fix incremental decoder and stream reader in the "raw-unicode-escape" codec.
+Previously they failed if the escape sequence was split.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index fc74127..50afc09 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -509,17 +509,20 @@ _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
_codecs.raw_unicode_escape_decode
data: Py_buffer(accept={str, buffer})
errors: str(accept={str, NoneType}) = None
+ final: bool(accept={int}) = True
/
[clinic start generated code]*/
static PyObject *
_codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
- const char *errors)
-/*[clinic end generated code: output=c98eeb56028070a6 input=d2f5159ce3b3392f]*/
+ const char *errors, int final)
+/*[clinic end generated code: output=11dbd96301e2879e input=2d166191beb3235a]*/
{
- PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
- errors);
- return codec_tuple(decoded, data->len);
+ Py_ssize_t consumed = data->len;
+ PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
+ errors,
+ final ? NULL : &consumed);
+ return codec_tuple(decoded, consumed);
}
/*[clinic input]
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h
index a7086dd..855ac77 100644
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@@ -1143,7 +1143,7 @@ exit:
}
PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__,
-"raw_unicode_escape_decode($module, data, errors=None, /)\n"
+"raw_unicode_escape_decode($module, data, errors=None, final=True, /)\n"
"--\n"
"\n");
@@ -1152,7 +1152,7 @@ PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__,
static PyObject *
_codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
- const char *errors);
+ const char *errors, int final);
static PyObject *
_codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
@@ -1160,8 +1160,9 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss
PyObject *return_value = NULL;
Py_buffer data = {NULL, NULL};
const char *errors = NULL;
+ int final = 1;
- if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 2)) {
+ if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 3)) {
goto exit;
}
if (PyUnicode_Check(args[0])) {
@@ -1202,8 +1203,15 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss
_PyArg_BadArgument("raw_unicode_escape_decode", "argument 2", "str or None", args[1]);
goto exit;
}
+ if (nargs < 3) {
+ goto skip_optional;
+ }
+ final = _PyLong_AsInt(args[2]);
+ if (final == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
skip_optional:
- return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors);
+ return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors, final);
exit:
/* Cleanup for data */
@@ -2809,4 +2817,4 @@ exit:
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=9e9fb1d5d81577e0 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=814dae36b6f885cb input=a9049054013a1b77]*/
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index af3b333..386052f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6379,8 +6379,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
- Py_ssize_t startinpos;
- Py_ssize_t endinpos;
const char *message;
#define WRITE_ASCII_CHAR(ch) \
@@ -6407,7 +6405,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
continue;
}
- startinpos = s - starts - 1;
+ Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */
if (s >= end) {
message = "\\ at end of string";
@@ -6554,8 +6552,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
*consumed = startinpos;
break;
}
- error:
- endinpos = s-starts;
+ error:;
+ Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
@@ -6735,9 +6733,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* --- Raw Unicode Escape Codec ------------------------------------------- */
PyObject *
-PyUnicode_DecodeRawUnicodeEscape(const char *s,
- Py_ssize_t size,
- const char *errors)
+_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
{
const char *starts = s;
_PyUnicodeWriter writer;
@@ -6746,6 +6745,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *exc = NULL;
if (size == 0) {
+ if (consumed) {
+ *consumed = 0;
+ }
_Py_RETURN_UNICODE_EMPTY();
}
@@ -6764,8 +6766,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
- Py_ssize_t startinpos;
- Py_ssize_t endinpos;
const char *message;
#define WRITE_CHAR(ch) \
@@ -6780,11 +6780,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
} while(0)
/* Non-escape characters are interpreted as Unicode ordinals */
- if (c != '\\' || s >= end) {
+ if (c != '\\' || (s >= end && !consumed)) {
WRITE_CHAR(c);
continue;
}
+ Py_ssize_t startinpos = s - starts - 1;
+ /* \ - Escapes */
+ if (s >= end) {
+ assert(consumed);
+ // Set message to silence the compiler warning.
+ // Actually it is never used.
+ message = "\\ at end of string";
+ goto incomplete;
+ }
+
c = (unsigned char) *s++;
if (c == 'u') {
count = 4;
@@ -6800,10 +6810,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
WRITE_CHAR(c);
continue;
}
- startinpos = s - starts - 2;
/* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
- for (ch = 0; count && s < end; ++s, --count) {
+ for (ch = 0; count; ++s, --count) {
+ if (s >= end) {
+ goto incomplete;
+ }
c = (unsigned char)*s;
ch <<= 4;
if (c >= '0' && c <= '9') {
@@ -6816,18 +6828,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
ch += c - ('A' - 10);
}
else {
- break;
+ goto error;
}
}
- if (!count) {
- if (ch <= MAX_UNICODE) {
- WRITE_CHAR(ch);
- continue;
- }
+ if (ch > MAX_UNICODE) {
message = "\\Uxxxxxxxx out of range";
+ goto error;
}
+ WRITE_CHAR(ch);
+ continue;
- endinpos = s-starts;
+ incomplete:
+ if (consumed) {
+ *consumed = startinpos;
+ break;
+ }
+ error:;
+ Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
@@ -6849,7 +6866,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
+}
+PyObject *
+PyUnicode_DecodeRawUnicodeEscape(const char *s,
+ Py_ssize_t size,
+ const char *errors)
+{
+ return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
}