summary | refs | log | tree | commit | diff | stats
path: root/Objects
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2021-10-14 18:23:39 (GMT)
committer GitHub <noreply@github.com> 2021-10-14 18:23:39 (GMT)
commit 4641afef661e6a22bc64194bd334b161c95edfe2 (patch)
tree d053338921b36eb41bd0a1880c801a33cbec7878 /Objects
parent 0bff4ccbfd3297b0adf690655d3e9ddb0033bc69 (diff)
download cpython-4641afef661e6a22bc64194bd334b161c95edfe2.zip
cpython-4641afef661e6a22bc64194bd334b161c95edfe2.tar.gz
cpython-4641afef661e6a22bc64194bd334b161c95edfe2.tar.bz2
[3.10] bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944) (GH-28952)
They now support escape sequences that are split between input chunks. Add a third parameter, "final", to codecs.raw_unicode_escape_decode(). It is True by default to match the former behavior.

(cherry picked from commit 39aa98346d5dd8ac591a7cafb467af21c53f1e5d)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Diffstat (limited to 'Objects')
-rw-r--r-- Objects/unicodeobject.c 64
1 file changed, 44 insertions, 20 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e660834..c728710 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6445,8 +6445,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
- Py_ssize_t startinpos;
- Py_ssize_t endinpos;
const char *message;
#define WRITE_ASCII_CHAR(ch) \
@@ -6473,7 +6471,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
continue;
}
- startinpos = s - starts - 1;
+ Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */
if (s >= end) {
message = "\\ at end of string";
@@ -6620,8 +6618,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
*consumed = startinpos;
break;
}
- error:
- endinpos = s-starts;
+ error:;
+ Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
@@ -6816,9 +6814,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
/* --- Raw Unicode Escape Codec ------------------------------------------- */
PyObject *
-PyUnicode_DecodeRawUnicodeEscape(const char *s,
- Py_ssize_t size,
- const char *errors)
+_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
{
const char *starts = s;
_PyUnicodeWriter writer;
@@ -6827,6 +6826,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *exc = NULL;
if (size == 0) {
+ if (consumed) {
+ *consumed = 0;
+ }
_Py_RETURN_UNICODE_EMPTY();
}
@@ -6845,8 +6847,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
unsigned char c = (unsigned char) *s++;
Py_UCS4 ch;
int count;
- Py_ssize_t startinpos;
- Py_ssize_t endinpos;
const char *message;
#define WRITE_CHAR(ch) \
@@ -6861,11 +6861,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
} while(0)
/* Non-escape characters are interpreted as Unicode ordinals */
- if (c != '\\' || s >= end) {
+ if (c != '\\' || (s >= end && !consumed)) {
WRITE_CHAR(c);
continue;
}
+ Py_ssize_t startinpos = s - starts - 1;
+ /* \ - Escapes */
+ if (s >= end) {
+ assert(consumed);
+ // Set message to silence a compiler warning.
+ // It is never actually used.
+ message = "\\ at end of string";
+ goto incomplete;
+ }
+
c = (unsigned char) *s++;
if (c == 'u') {
count = 4;
@@ -6881,10 +6891,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
WRITE_CHAR(c);
continue;
}
- startinpos = s - starts - 2;
/* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
- for (ch = 0; count && s < end; ++s, --count) {
+ for (ch = 0; count; ++s, --count) {
+ if (s >= end) {
+ goto incomplete;
+ }
c = (unsigned char)*s;
ch <<= 4;
if (c >= '0' && c <= '9') {
@@ -6897,18 +6909,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
ch += c - ('A' - 10);
}
else {
- break;
+ goto error;
}
}
- if (!count) {
- if (ch <= MAX_UNICODE) {
- WRITE_CHAR(ch);
- continue;
- }
+ if (ch > MAX_UNICODE) {
message = "\\Uxxxxxxxx out of range";
+ goto error;
}
+ WRITE_CHAR(ch);
+ continue;
- endinpos = s-starts;
+ incomplete:
+ if (consumed) {
+ *consumed = startinpos;
+ break;
+ }
+ error:;
+ Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler,
@@ -6930,7 +6947,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
+}
+PyObject *
+PyUnicode_DecodeRawUnicodeEscape(const char *s,
+ Py_ssize_t size,
+ const char *errors)
+{
+ return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
}