diff options
author | Victor Stinner <vstinner@redhat.com> | 2018-08-29 20:21:32 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-08-29 20:21:32 (GMT) |
commit | 3d4226a832cabc630402589cc671cc4035d504e5 (patch) | |
tree | a1c5b1c51cbbca3aedd52593c979a5c15d72dd52 /Objects | |
parent | c5989cd87659acbfd4d19dc00dbe99c3a0fc9bd2 (diff) | |
download | cpython-3d4226a832cabc630402589cc671cc4035d504e5.zip cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.gz cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.bz2 |
bpo-34523: Support surrogatepass in locale codecs (GH-8995)
Add support for the "surrogatepass" error handler in
PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault()
for the UTF-8 encoding.
Changes:
* _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the
surrogatepass error handler (_Py_ERROR_SURROGATEPASS).
* _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use
the _Py_error_handler enum instead of "int surrogateescape" to pass
the error handler. These functions now return -3 if the error
handler is unknown.
* Add unit tests for _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx()
in test_codecs.
* Rename get_error_handler() to _Py_GetErrorHandler() and expose it
as a private function.
* _freeze_importlib no longer needs the config.filesystem_errors="strict"
workaround.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/stringlib/codecs.h | 2 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 173 |
2 files changed, 102 insertions, 73 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index f019d9a..0abb4c8 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, Py_ssize_t startpos, endpos, newpos; Py_ssize_t k; if (error_handler == _Py_ERROR_UNKNOWN) { - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); } startpos = i-1; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 60adcd9..a797f83 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -318,20 +318,8 @@ static int convert_uc(PyObject *obj, void *addr); #include "clinic/unicodeobject.c.h" -typedef enum { - _Py_ERROR_UNKNOWN=0, - _Py_ERROR_STRICT, - _Py_ERROR_SURROGATEESCAPE, - _Py_ERROR_REPLACE, - _Py_ERROR_IGNORE, - _Py_ERROR_BACKSLASHREPLACE, - _Py_ERROR_SURROGATEPASS, - _Py_ERROR_XMLCHARREFREPLACE, - _Py_ERROR_OTHER -} _Py_error_handler; - -static _Py_error_handler -get_error_handler(const char *errors) +_Py_error_handler +_Py_GetErrorHandler(const char *errors) { if (errors == NULL || strcmp(errors, "strict") == 0) { return _Py_ERROR_STRICT; @@ -3327,34 +3315,12 @@ PyUnicode_AsEncodedObject(PyObject *unicode, return NULL; } -static int -locale_error_handler(const char *errors, int *surrogateescape) -{ - _Py_error_handler error_handler = get_error_handler(errors); - switch (error_handler) - { - case _Py_ERROR_STRICT: - *surrogateescape = 0; - return 0; - case _Py_ERROR_SURROGATEESCAPE: - *surrogateescape = 1; - return 0; - default: - PyErr_Format(PyExc_ValueError, - "only 'strict' and 'surrogateescape' error handlers " - "are supported, not '%s'", - errors); - return -1; - } -} static PyObject * unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale) { - int surrogateescape; - if (locale_error_handler(errors, &surrogateescape) < 0) - return NULL; + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); Py_ssize_t wlen; wchar_t *wstr = 
PyUnicode_AsWideCharString(unicode, &wlen); @@ -3373,7 +3339,7 @@ unicode_encode_locale(PyObject *unicode, const char *errors, size_t error_pos; const char *reason; int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason, - current_locale, surrogateescape); + current_locale, error_handler); if (res != 0) { if (res == -2) { PyObject *exc; @@ -3388,6 +3354,9 @@ unicode_encode_locale(PyObject *unicode, const char *errors, } return NULL; } + else if (res == -3) { + PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + } else { PyErr_NoMemory(); PyMem_Free(wstr); @@ -3571,9 +3540,7 @@ static PyObject* unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, int current_locale) { - int surrogateescape; - if (locale_error_handler(errors, &surrogateescape) < 0) - return NULL; + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); if (str[len] != '\0' || (size_t)len != strlen(str)) { PyErr_SetString(PyExc_ValueError, "embedded null byte"); @@ -3584,7 +3551,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, size_t wlen; const char *reason; int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, - current_locale, surrogateescape); + current_locale, error_handler); if (res != 0) { if (res == -2) { PyObject *exc; @@ -3598,6 +3565,9 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, Py_DECREF(exc); } } + else if (res == -3) { + PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + } else { PyErr_NoMemory(); } @@ -4863,7 +4833,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, } if (error_handler == _Py_ERROR_UNKNOWN) - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); switch (error_handler) { case _Py_ERROR_IGNORE: @@ -4932,13 +4902,29 @@ onError: is not NULL, write the decoding error message into *reason. 
*/ int _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, - const char **reason, int surrogateescape) + const char **reason, _Py_error_handler errors) { const char *orig_s = s; const char *e; wchar_t *unicode; Py_ssize_t outpos; + int surrogateescape = 0; + int surrogatepass = 0; + switch (errors) + { + case _Py_ERROR_STRICT: + break; + case _Py_ERROR_SURROGATEESCAPE: + surrogateescape = 1; + break; + case _Py_ERROR_SURROGATEPASS: + surrogatepass = 1; + break; + default: + return -3; + } + /* Note: size will always be longer than the resulting Unicode character count */ if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { @@ -4971,31 +4957,47 @@ _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, #endif } else { - if (!ch && s == e) + if (!ch && s == e) { break; - if (!surrogateescape) { - PyMem_RawFree(unicode ); - if (reason != NULL) { - switch (ch) { - case 0: - *reason = "unexpected end of data"; - break; - case 1: - *reason = "invalid start byte"; - break; - /* 2, 3, 4 */ - default: - *reason = "invalid continuation byte"; - break; - } + } + + if (surrogateescape) { + unicode[outpos++] = 0xDC00 + (unsigned char)*s++; + } + else { + /* Is it a valid three-byte code? 
*/ + if (surrogatepass + && (e - s) >= 3 + && (s[0] & 0xf0) == 0xe0 + && (s[1] & 0xc0) == 0x80 + && (s[2] & 0xc0) == 0x80) + { + ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); + s += 3; + unicode[outpos++] = ch; } - if (wlen != NULL) { - *wlen = s - orig_s; + else { + PyMem_RawFree(unicode ); + if (reason != NULL) { + switch (ch) { + case 0: + *reason = "unexpected end of data"; + break; + case 1: + *reason = "invalid start byte"; + break; + /* 2, 3, 4 */ + default: + *reason = "invalid continuation byte"; + break; + } + } + if (wlen != NULL) { + *wlen = s - orig_s; + } + return -2; } - return -2; } - /* surrogateescape */ - unicode[outpos++] = 0xDC00 + (unsigned char)*s++; } } unicode[outpos] = L'\0'; @@ -5030,13 +5032,29 @@ _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen) On memory allocation failure, return -1. */ int _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, - const char **reason, int raw_malloc, int surrogateescape) + const char **reason, int raw_malloc, _Py_error_handler errors) { const Py_ssize_t max_char_size = 4; Py_ssize_t len = wcslen(text); assert(len >= 0); + int surrogateescape = 0; + int surrogatepass = 0; + switch (errors) + { + case _Py_ERROR_STRICT: + break; + case _Py_ERROR_SURROGATEESCAPE: + surrogateescape = 1; + break; + case _Py_ERROR_SURROGATEPASS: + surrogatepass = 1; + break; + default: + return -3; + } + if (len > PY_SSIZE_T_MAX / max_char_size - 1) { return -1; } @@ -5053,8 +5071,19 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, char *p = bytes; Py_ssize_t i; - for (i = 0; i < len; i++) { + for (i = 0; i < len; ) { + Py_ssize_t ch_pos = i; Py_UCS4 ch = text[i]; + i++; +#if Py_UNICODE_SIZE == 2 + if (Py_UNICODE_IS_HIGH_SURROGATE(ch) + && i < len + && Py_UNICODE_IS_LOW_SURROGATE(text[i])) + { + ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]); + i++; + } +#endif if (ch < 0x80) { /* Encode ASCII */ @@ -5066,11 +5095,11 @@ _Py_EncodeUTF8Ex(const wchar_t 
*text, char **str, size_t *error_pos, *p++ = (char)(0xc0 | (ch >> 6)); *p++ = (char)(0x80 | (ch & 0x3f)); } - else if (Py_UNICODE_IS_SURROGATE(ch)) { + else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) { /* surrogateescape error handler */ if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) { if (error_pos != NULL) { - *error_pos = (size_t)i; + *error_pos = (size_t)ch_pos; } if (reason != NULL) { *reason = "encoding error"; @@ -6741,7 +6770,7 @@ unicode_encode_ucs1(PyObject *unicode, /* cache callback name lookup (if not done yet, i.e. it's the first error) */ if (error_handler == _Py_ERROR_UNKNOWN) - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); switch (error_handler) { case _Py_ERROR_STRICT: @@ -6945,7 +6974,7 @@ PyUnicode_DecodeASCII(const char *s, /* byte outsize range 0x00..0x7f: call the error handler */ if (error_handler == _Py_ERROR_UNKNOWN) - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); switch (error_handler) { @@ -8404,7 +8433,7 @@ charmap_encoding_error( /* cache callback name lookup * (if not done yet, i.e. it's the first error) */ if (*error_handler == _Py_ERROR_UNKNOWN) - *error_handler = get_error_handler(errors); + *error_handler = _Py_GetErrorHandler(errors); switch (*error_handler) { case _Py_ERROR_STRICT: |