diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2018-01-15 09:45:49 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-01-15 09:45:49 (GMT) |
commit | 7ed7aead9503102d2ed316175f198104e0cd674c (patch) | |
tree | 0b70b3b7d2eed5ea92552c1b93953d0333f5a869 /Python/fileutils.c | |
parent | ee3b83547c6b0cac1da2cb44aaaea533a1d1bbc8 (diff) | |
download | cpython-7ed7aead9503102d2ed316175f198104e0cd674c.zip cpython-7ed7aead9503102d2ed316175f198104e0cd674c.tar.gz cpython-7ed7aead9503102d2ed316175f198104e0cd674c.tar.bz2 |
bpo-29240: Fix locale encodings in UTF-8 Mode (#5170)
Modify locale.localeconv(), time.tzname, os.strerror() and other
functions to ignore the UTF-8 Mode: always use the current locale
encoding.
Changes:
* Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx(). On decoding or
encoding error, they return the position of the error and an error
message which are used to raise Unicode errors in
PyUnicode_DecodeLocale() and PyUnicode_EncodeLocale().
* Replace _Py_DecodeCurrentLocale() with _Py_DecodeLocaleEx().
* PyUnicode_DecodeLocale() now uses _Py_DecodeLocaleEx() for all
cases, especially for the strict error handler.
* Add _Py_DecodeUTF8Ex(): return more information on decoding error
and supports the strict error handler.
* Rename _Py_EncodeUTF8_surrogateescape() to _Py_EncodeUTF8Ex().
* Replace _Py_EncodeCurrentLocale() with _Py_EncodeLocaleEx().
* Ignore the UTF-8 mode to encode/decode localeconv(), strerror()
and time zone name.
* Remove PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize()
and PyUnicode_EncodeLocale() now ignore the UTF-8 mode: always use
the "current" locale.
* Remove _PyUnicode_DecodeCurrentLocale(),
_PyUnicode_DecodeCurrentLocaleAndSize() and
_PyUnicode_EncodeCurrentLocale().
Diffstat (limited to 'Python/fileutils.c')
-rw-r--r-- | Python/fileutils.c | 385 |
1 files changed, 246 insertions, 139 deletions
diff --git a/Python/fileutils.c b/Python/fileutils.c index 9275494..a50075e 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -20,9 +20,6 @@ extern int winerror_to_errno(int); #include <fcntl.h> #endif /* HAVE_FCNTL_H */ -extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text, - size_t *error_pos, int raw_malloc); - #ifdef O_CLOEXEC /* Does open() support the O_CLOEXEC flag? Possible values: @@ -69,7 +66,10 @@ _Py_device_encoding(int fd) Py_RETURN_NONE; } -#if !defined(__APPLE__) && !defined(MS_WINDOWS) +#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) + +#define USE_FORCE_ASCII + extern int _Py_normalize_encoding(const char *, char *, size_t); /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale. @@ -90,7 +90,7 @@ extern int _Py_normalize_encoding(const char *, char *, size_t); 1: the workaround is used: Py_EncodeLocale() uses encode_ascii_surrogateescape() and Py_DecodeLocale() uses - decode_ascii_surrogateescape() + decode_ascii() 0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and Py_DecodeLocale() uses mbstowcs() -1: unknown, need to call check_force_ascii() to get the value @@ -180,16 +180,15 @@ error: return 1; } -static char* -encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_malloc) +static int +encode_ascii(const wchar_t *text, char **str, + size_t *error_pos, const char **reason, + int raw_malloc, int surrogateescape) { char *result = NULL, *out; size_t len, i; wchar_t ch; - if (error_pos != NULL) - *error_pos = (size_t)-1; - len = wcslen(text); /* +1 for NULL byte */ @@ -199,8 +198,9 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal else { result = PyMem_Malloc(len + 1); } - if (result == NULL) - return NULL; + if (result == NULL) { + return -1; + } out = result; for (i=0; i<len; i++) { @@ -210,60 +210,84 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal /* ASCII character */ *out++ = (char)ch; } - else if (0xdc80 <= ch && ch <= 0xdcff) { + else if (surrogateescape && 0xdc80 <= ch && ch <= 0xdcff) { /* UTF-8b surrogate */ *out++ = (char)(ch - 0xdc00); } else { - if (error_pos != NULL) { - *error_pos = i; - } if (raw_malloc) { PyMem_RawFree(result); } else { PyMem_Free(result); } - return NULL; + if (error_pos != NULL) { + *error_pos = i; + } + if (reason) { + *reason = "encoding error"; + } + return -2; } } *out = '\0'; - return result; + *str = result; + return 0; } -#endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */ +#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */ + -#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC)) -static wchar_t* -decode_ascii_surrogateescape(const char *arg, size_t *size) +#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII) +static int +decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen, + const char **reason, int surrogateescape) { wchar_t *res; unsigned char *in; wchar_t *out; size_t argsize = strlen(arg) + 1; - if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t)) - return NULL; - res = PyMem_RawMalloc(argsize*sizeof(wchar_t)); - if (!res) - return NULL; + if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) { + return -1; + } + res = PyMem_RawMalloc(argsize * sizeof(wchar_t)); + if (!res) { + return -1; + } - in = (unsigned char*)arg; out = res; - while(*in) - if(*in < 128) - *out++ = *in++; - else - *out++ = 0xdc00 + *in++; + for (in = (unsigned char*)arg; *in; in++) { + unsigned char ch = *in; + if (ch < 128) { + *out++ = ch; + } + else { + if (!surrogateescape) { + PyMem_RawFree(res); + if (wlen) { + *wlen = in - (unsigned char*)arg; + } + if (reason) { + *reason = "decoding error"; + } + return -2; + } + *out++ = 0xdc00 + ch; + } + } *out = 0; - if (size != NULL) - *size = out - res; - return res; + + if (wlen != NULL) { + *wlen = out - res; + } + *wstr = res; + return 0; } -#endif +#endif /* !HAVE_MBRTOWC */ -#if !defined(__APPLE__) && !defined(__ANDROID__) -static wchar_t* -decode_current_locale(const char* arg, size_t *size) +static int +decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, + const char **reason, int surrogateescape) { wchar_t *res; size_t argsize; @@ -284,15 +308,15 @@ decode_current_locale(const char* arg, size_t *size) argsize = mbstowcs(NULL, arg, 0); #endif if (argsize != (size_t)-1) { - if (argsize == PY_SSIZE_T_MAX) - goto oom; - argsize += 1; - if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t)) - goto oom; - res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t)); - if (!res) - goto oom; - count = mbstowcs(res, arg, argsize); + if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { + return -1; + } + res = (wchar_t *)PyMem_RawMalloc((argsize + 1) * sizeof(wchar_t)); + if (!res) { + return -1; + } + + count = mbstowcs(res, arg, argsize + 1); if (count != (size_t)-1) { wchar_t *tmp; /* Only use the result if it contains no @@ -301,13 +325,16 @@ decode_current_locale(const char* arg, size_t *size) !Py_UNICODE_IS_SURROGATE(*tmp); tmp++) ; if (*tmp == 0) { - if (size != NULL) - *size = count; - return res; + if (wlen != NULL) { + *wlen = count; + } + *wstr = res; + return 0; } } PyMem_RawFree(res); } + /* Conversion failed. Fall back to escaping with surrogateescape. */ #ifdef HAVE_MBRTOWC /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */ @@ -315,30 +342,37 @@ decode_current_locale(const char* arg, size_t *size) /* Overallocate; as multi-byte characters are in the argument, the actual output could use less memory. */ argsize = strlen(arg) + 1; - if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t)) - goto oom; - res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t)); - if (!res) - goto oom; + if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) { + return -1; + } + res = (wchar_t*)PyMem_RawMalloc(argsize * sizeof(wchar_t)); + if (!res) { + return -1; + } + in = (unsigned char*)arg; out = res; memset(&mbs, 0, sizeof mbs); while (argsize) { size_t converted = mbrtowc(out, (char*)in, argsize, &mbs); - if (converted == 0) + if (converted == 0) { /* Reached end of string; null char stored. */ break; + } + if (converted == (size_t)-2) { /* Incomplete character. This should never happen, since we provide everything that we have - unless there is a bug in the C library, or I misunderstood how mbrtowc works. */ - PyMem_RawFree(res); - if (size != NULL) - *size = (size_t)-2; - return NULL; + goto decode_error; } + if (converted == (size_t)-1) { + if (!surrogateescape) { + goto decode_error; + } + /* Conversion error. Escape as UTF-8b, and start over in the initial shift state. */ *out++ = 0xdc00 + *in++; @@ -346,12 +380,18 @@ decode_current_locale(const char* arg, size_t *size) memset(&mbs, 0, sizeof mbs); continue; } + if (Py_UNICODE_IS_SURROGATE(*out)) { + if (!surrogateescape) { + goto decode_error; + } + /* Surrogate character. Escape the original byte sequence with surrogateescape. */ argsize -= converted; - while (converted--) + while (converted--) { *out++ = 0xdc00 + *in++; + } continue; } /* successfully converted some bytes */ @@ -359,55 +399,80 @@ decode_current_locale(const char* arg, size_t *size) argsize -= converted; out++; } - if (size != NULL) - *size = out - res; + if (wlen != NULL) { + *wlen = out - res; + } + *wstr = res; + return 0; + +decode_error: + PyMem_RawFree(res); + if (wlen) { + *wlen = in - (unsigned char*)arg; + } + if (reason) { + *reason = "decoding error"; + } + return -2; #else /* HAVE_MBRTOWC */ /* Cannot use C locale for escaping; manually escape as if charset is ASCII (i.e. escape all bytes > 128. This will still roundtrip correctly in the locale's charset, which must be an ASCII superset. */ - res = decode_ascii_surrogateescape(arg, size); - if (res == NULL) - goto oom; + return decode_ascii(arg, wstr, wlen, reason, surrogateescape); #endif /* HAVE_MBRTOWC */ - return res; - -oom: - if (size != NULL) { - *size = (size_t)-1; - } - return NULL; } -#endif -static wchar_t* -decode_locale(const char* arg, size_t *size, int ignore_utf8_mode) +/* Decode a byte string from the locale encoding. + + Use the strict error handler if 'surrogateescape' is zero. Use the + surrogateescape error handler if 'surrogateescape' is non-zero: undecodable + bytes are decoded as characters in range U+DC80..U+DCFF. If a byte sequence + can be decoded as a surrogate character, escape the bytes using the + surrogateescape error handler instead of decoding them. + + On sucess, return 0 and write the newly allocated wide character string into + *wstr (use PyMem_RawFree() to free the memory). If wlen is not NULL, write + the number of wide characters excluding the null character into *wlen. + + On memory allocation failure, return -1. + + On decoding error, return -2. If wlen is not NULL, write the start of + invalid byte sequence in the input string into *wlen. If reason is not NULL, + write the decoding error message into *reason. + + Use the Py_EncodeLocaleEx() function to encode the character string back to + a byte string. */ +int +_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, + const char **reason, + int current_locale, int surrogateescape) { + if (current_locale) { + return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); + } + #if defined(__APPLE__) || defined(__ANDROID__) - return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); + return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, + surrogateescape); #else - if (!ignore_utf8_mode && Py_UTF8Mode == 1) { - return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); + if (Py_UTF8Mode == 1) { + return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, + surrogateescape); } -#ifndef MS_WINDOWS - if (force_ascii == -1) +#ifdef USE_FORCE_ASCII + if (force_ascii == -1) { force_ascii = check_force_ascii(); + } if (force_ascii) { /* force ASCII encoding to workaround mbstowcs() issue */ - wchar_t *wstr = decode_ascii_surrogateescape(arg, size); - if (wstr == NULL) { - if (size != NULL) { - *size = (size_t)-1; - } - return NULL; - } - return wstr; + return decode_ascii(arg, wstr, wlen, reason, surrogateescape); } #endif - return decode_current_locale(arg, size); + return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); #endif /* __APPLE__ or __ANDROID__ */ } @@ -432,23 +497,24 @@ decode_locale(const char* arg, size_t *size, int ignore_utf8_mode) Use the Py_EncodeLocale() function to encode the character string back to a byte string. */ wchar_t* -Py_DecodeLocale(const char* arg, size_t *size) +Py_DecodeLocale(const char* arg, size_t *wlen) { - return decode_locale(arg, size, 0); -} - - -/* Similar to Py_DecodeLocale() but ignore the UTF-8 mode */ -wchar_t* -_Py_DecodeCurrentLocale(const char* arg, size_t *size) -{ - return decode_locale(arg, size, 1); + wchar_t *wstr; + int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1); + if (res != 0) { + if (wlen != NULL) { + *wlen = (size_t)res; + } + return NULL; + } + return wstr; } -#if !defined(__APPLE__) && !defined(__ANDROID__) -static char* -encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) +static int +encode_current_locale(const wchar_t *text, char **str, + size_t *error_pos, const char **reason, + int raw_malloc, int surrogateescape) { const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; @@ -464,38 +530,37 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) for (i=0; i < len; i++) { c = text[i]; if (c >= 0xdc80 && c <= 0xdcff) { + if (!surrogateescape) { + goto encode_error; + } /* UTF-8b surrogate */ if (bytes != NULL) { *bytes++ = c - 0xdc00; size--; } - else + else { size++; + } continue; } else { buf[0] = c; - if (bytes != NULL) + if (bytes != NULL) { converted = wcstombs(bytes, buf, size); - else + } + else { converted = wcstombs(NULL, buf, 0); + } if (converted == (size_t)-1) { - if (raw_malloc) { - PyMem_RawFree(result); - } - else { - PyMem_Free(result); - } - if (error_pos != NULL) - *error_pos = i; - return NULL; + goto encode_error; } if (bytes != NULL) { bytes += converted; size -= converted; } - else + else { size += converted; + } } } if (result != NULL) { @@ -511,40 +576,80 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) result = PyMem_Malloc(size); } if (result == NULL) { - if (error_pos != NULL) { - *error_pos = (size_t)-1; - } - return NULL; + return -1; } bytes = result; } - return result; + *str = result; + return 0; + +encode_error: + if (raw_malloc) { + PyMem_RawFree(result); + } + else { + PyMem_Free(result); + } + if (error_pos != NULL) { + *error_pos = i; + } + if (reason) { + *reason = "encoding error"; + } + return -2; } -#endif -static char* -encode_locale(const wchar_t *text, size_t *error_pos, - int raw_malloc, int ignore_utf8_mode) +static int +encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, + const char **reason, + int raw_malloc, int current_locale, int surrogateescape) { + if (current_locale) { + return encode_current_locale(text, str, error_pos, reason, + raw_malloc, surrogateescape); + } + #if defined(__APPLE__) || defined(__ANDROID__) - return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc); + return _Py_EncodeUTF8Ex(text, str, error_pos, reason, + raw_malloc, surrogateescape); #else /* __APPLE__ */ - if (!ignore_utf8_mode && Py_UTF8Mode == 1) { - return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc); + if (Py_UTF8Mode == 1) { + return _Py_EncodeUTF8Ex(text, str, error_pos, reason, + raw_malloc, surrogateescape); } -#ifndef MS_WINDOWS - if (force_ascii == -1) +#ifdef USE_FORCE_ASCII + if (force_ascii == -1) { force_ascii = check_force_ascii(); + } - if (force_ascii) - return encode_ascii_surrogateescape(text, error_pos, raw_malloc); + if (force_ascii) { + return encode_ascii(text, str, error_pos, reason, + raw_malloc, surrogateescape); + } #endif - return encode_current_locale(text, error_pos, raw_malloc); + return encode_current_locale(text, str, error_pos, reason, + raw_malloc, surrogateescape); #endif /* __APPLE__ or __ANDROID__ */ } +static char* +encode_locale(const wchar_t *text, size_t *error_pos, + int raw_malloc, int current_locale) +{ + char *str; + int res = encode_locale_ex(text, &str, error_pos, NULL, + raw_malloc, current_locale, 1); + if (res != -2 && error_pos) { + *error_pos = (size_t)-1; + } + if (res != 0) { + return NULL; + } + return str; +} + /* Encode a wide character string to the locale encoding with the surrogateescape error handler: surrogate characters in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF. @@ -573,11 +678,13 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos) } -/* Similar to _Py_EncodeLocaleRaw() but ignore the UTF-8 Mode */ -char* -_Py_EncodeCurrentLocale(const wchar_t *text, size_t *error_pos) +int +_Py_EncodeLocaleEx(const wchar_t *text, char **str, + size_t *error_pos, const char **reason, + int current_locale, int surrogateescape) { - return encode_locale(text, error_pos, 1, 1); + return encode_locale_ex(text, str, error_pos, reason, 1, + current_locale, surrogateescape); } |