diff options
author | Victor Stinner <vstinner@redhat.com> | 2018-08-29 20:21:32 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-08-29 20:21:32 (GMT) |
commit | 3d4226a832cabc630402589cc671cc4035d504e5 (patch) | |
tree | a1c5b1c51cbbca3aedd52593c979a5c15d72dd52 /Python | |
parent | c5989cd87659acbfd4d19dc00dbe99c3a0fc9bd2 (diff) | |
download | cpython-3d4226a832cabc630402589cc671cc4035d504e5.zip cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.gz cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.bz2 |
bpo-34523: Support surrogatepass in locale codecs (GH-8995)
Add support for the "surrogatepass" error handler in
PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault()
for the UTF-8 encoding.
Changes:
* _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the
surrogatepass error handler (_Py_ERROR_SURROGATEPASS).
* _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use
the _Py_error_handler enum instead of "int surrogateescape" to pass
the error handler. These functions now return -3 if the error
handler is unknown.
* Add unit tests on _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx()
in test_codecs.
* Rename get_error_handler() to _Py_GetErrorHandler() and expose it
as a private function.
* _freeze_importlib doesn't need config.filesystem_errors="strict"
workaround anymore.
Diffstat (limited to 'Python')
-rw-r--r-- | Python/fileutils.c | 112 |
1 files changed, 87 insertions, 25 deletions
diff --git a/Python/fileutils.c b/Python/fileutils.c index 9a3c334..0486f86 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -32,6 +32,24 @@ extern int winerror_to_errno(int); int _Py_open_cloexec_works = -1; #endif + +static int +get_surrogateescape(_Py_error_handler errors, int *surrogateescape) +{ + switch (errors) + { + case _Py_ERROR_STRICT: + *surrogateescape = 0; + return 0; + case _Py_ERROR_SURROGATEESCAPE: + *surrogateescape = 1; + return 0; + default: + return -1; + } +} + + PyObject * _Py_device_encoding(int fd) { @@ -215,12 +233,17 @@ _Py_GetForceASCII(void) static int encode_ascii(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int raw_malloc, int surrogateescape) + int raw_malloc, _Py_error_handler errors) { char *result = NULL, *out; size_t len, i; wchar_t ch; + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + len = wcslen(text); /* +1 for NULL byte */ @@ -278,13 +301,18 @@ _Py_GetForceASCII(void) #if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII) static int decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen, - const char **reason, int surrogateescape) + const char **reason, _Py_error_handler errors) { wchar_t *res; unsigned char *in; wchar_t *out; size_t argsize = strlen(arg) + 1; + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) { return -1; } @@ -325,7 +353,7 @@ decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen, static int decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, - const char **reason, int surrogateescape) + const char **reason, _Py_error_handler errors) { wchar_t *res; size_t argsize; @@ -336,6 +364,11 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, mbstate_t mbs; #endif + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + #ifdef HAVE_BROKEN_MBSTOWCS /* Some platforms have a broken implementation of * mbstowcs which does not count the characters that @@ -456,7 +489,7 @@ decode_error: /* Cannot use C locale for escaping; manually escape as if charset is ASCII (i.e. escape all bytes > 128. This will still roundtrip correctly in the locale's charset, which must be an ASCII superset. */ - return decode_ascii(arg, wstr, wlen, reason, surrogateescape); + return decode_ascii(arg, wstr, wlen, reason, errors); #endif /* HAVE_MBRTOWC */ } @@ -479,33 +512,35 @@ decode_error: invalid byte sequence in the input string into *wlen. If reason is not NULL, write the decoding error message into *reason. + Return -3 if the error handler 'errors' is not supported. + Use the Py_EncodeLocaleEx() function to encode the character string back to a byte string. */ int _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, const char **reason, - int current_locale, int surrogateescape) + int current_locale, _Py_error_handler errors) { if (current_locale) { #ifdef __ANDROID__ return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, - surrogateescape); + errors); #else - return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); + return decode_current_locale(arg, wstr, wlen, reason, errors); #endif } #if defined(__APPLE__) || defined(__ANDROID__) return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, - surrogateescape); + errors); #else int use_utf8 = (Py_UTF8Mode == 1); #ifdef MS_WINDOWS use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; #endif if (use_utf8) { - return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, - reason, surrogateescape); + return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, + errors); } #ifdef USE_FORCE_ASCII @@ -515,11 +550,11 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, if (force_ascii) { /* force ASCII encoding to workaround mbstowcs() issue */ - return decode_ascii(arg, wstr, wlen, reason, surrogateescape); + return decode_ascii(arg, wstr, wlen, reason, errors); } #endif - return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); + return decode_current_locale(arg, wstr, wlen, reason, errors); #endif /* __APPLE__ or __ANDROID__ */ } @@ -547,8 +582,11 @@ wchar_t* Py_DecodeLocale(const char* arg, size_t *wlen) { wchar_t *wstr; - int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1); + int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, + NULL, 0, + _Py_ERROR_SURROGATEESCAPE); if (res != 0) { + assert(res != -3); if (wlen != NULL) { *wlen = (size_t)res; } @@ -561,13 +599,18 @@ Py_DecodeLocale(const char* arg, size_t *wlen) static int encode_current_locale(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int raw_malloc, int surrogateescape) + int raw_malloc, _Py_error_handler errors) { const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + /* The function works in two steps: 1. compute the length of the output buffer in bytes (size) 2. outputs the bytes */ @@ -646,32 +689,50 @@ encode_error: return -2; } + +/* Encode a string to the locale encoding. + + Parameters: + + * raw_malloc: if non-zero, allocate memory using PyMem_RawMalloc() instead + of PyMem_Malloc(). + * current_locale: if non-zero, use the current LC_CTYPE, otherwise use + Python filesystem encoding. + * errors: error handler like "strict" or "surrogateescape". + + Return value: + + 0: success, *str is set to a newly allocated decoded string. + -1: memory allocation failure + -2: encoding error, set *error_pos and *reason (if set). + -3: the error handler 'errors' is not supported. + */ static int encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int raw_malloc, int current_locale, int surrogateescape) + int raw_malloc, int current_locale, _Py_error_handler errors) { if (current_locale) { #ifdef __ANDROID__ return _Py_EncodeUTF8Ex(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); #else return encode_current_locale(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); #endif } #if defined(__APPLE__) || defined(__ANDROID__) return _Py_EncodeUTF8Ex(text, str, error_pos, reason, - raw_malloc, surrogateescape); -#else /* __APPLE__ */ + raw_malloc, errors); +#else int use_utf8 = (Py_UTF8Mode == 1); #ifdef MS_WINDOWS use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; #endif if (use_utf8) { return _Py_EncodeUTF8Ex(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); } #ifdef USE_FORCE_ASCII @@ -681,12 +742,12 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, if (force_ascii) { return encode_ascii(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); } #endif return encode_current_locale(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); #endif /* __APPLE__ or __ANDROID__ */ } @@ -696,7 +757,8 @@ encode_locale(const wchar_t *text, size_t *error_pos, { char *str; int res = encode_locale_ex(text, &str, error_pos, NULL, - raw_malloc, current_locale, 1); + raw_malloc, current_locale, + _Py_ERROR_SURROGATEESCAPE); if (res != -2 && error_pos) { *error_pos = (size_t)-1; } @@ -737,10 +799,10 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos) int _Py_EncodeLocaleEx(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int current_locale, int surrogateescape) + int current_locale, _Py_error_handler errors) { return encode_locale_ex(text, str, error_pos, reason, 1, - current_locale, surrogateescape); + current_locale, errors); } |