diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2017-12-13 11:29:09 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-12-13 11:29:09 (GMT) |
commit | 91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch) | |
tree | ff002e0532736a97f3ddd367c1491e7b04611816 /Python | |
parent | c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff) | |
download | cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2 |
bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)
* Add -X utf8 command line option, PYTHONUTF8 environment variable
and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup, automatically enable the
UTF-8 Mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls
_winapi.GetACP() to get the ANSI code page.
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8
mode. As a side effect, open() now uses the UTF-8 encoding by
default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding
in the UTF-8 Mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8
* Skip some tests relying on the current locale if the UTF-8 mode is
enabled.
* Add test_utf8mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to
also return the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now
always copy flag values, rather than only copying if the new value
is greater than the old value.
Diffstat (limited to 'Python')
-rw-r--r-- | Python/bltinmodule.c | 3 | ||||
-rw-r--r-- | Python/fileutils.c | 174 | ||||
-rw-r--r-- | Python/pylifecycle.c | 29 | ||||
-rw-r--r-- | Python/sysmodule.c | 6 |
4 files changed, 127 insertions, 85 deletions
diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c index 81774dc..23d7aa4 100644 --- a/Python/bltinmodule.c +++ b/Python/bltinmodule.c @@ -29,6 +29,9 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */ int Py_HasFileSystemDefaultEncoding = 0; #endif const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape"; +/* UTF-8 mode (PEP 540): if non-zero, use the UTF-8 encoding, and change stdin + and stdout error handler to "surrogateescape". */ +int Py_UTF8Mode = 0; _Py_IDENTIFIER(__builtins__); _Py_IDENTIFIER(__dict__); diff --git a/Python/fileutils.c b/Python/fileutils.c index eab58c5..03cc379 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -20,9 +20,8 @@ extern int winerror_to_errno(int); #include <fcntl.h> #endif /* HAVE_FCNTL_H */ -#if defined(__APPLE__) || defined(__ANDROID__) -extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size); -#endif +extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, + size_t *p_wlen); #ifdef O_CLOEXEC /* Does open() support the O_CLOEXEC flag? Possible values: @@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size) } #endif - -/* Decode a byte string from the locale encoding with the - surrogateescape error handler: undecodable bytes are decoded as characters - in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate - character, escape the bytes using the surrogateescape error handler instead - of decoding them. - - Return a pointer to a newly allocated wide character string, use - PyMem_RawFree() to free the memory. If size is not NULL, write the number of - wide characters excluding the null character into *size - - Return NULL on decoding error or memory allocation error. If *size* is not - NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on - decoding error. - - Decoding errors should never happen, unless there is a bug in the C - library. 
- - Use the Py_EncodeLocale() function to encode the character string back to a - byte string. */ -wchar_t* -Py_DecodeLocale(const char* arg, size_t *size) +static wchar_t* +decode_locale(const char* arg, size_t *size) { -#if defined(__APPLE__) || defined(__ANDROID__) - wchar_t *wstr; - wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg)); - if (size != NULL) { - if (wstr != NULL) - *size = wcslen(wstr); - else - *size = (size_t)-1; - } - return wstr; -#else wchar_t *res; size_t argsize; size_t count; @@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size) mbstate_t mbs; #endif -#ifndef MS_WINDOWS - if (force_ascii == -1) - force_ascii = check_force_ascii(); - - if (force_ascii) { - /* force ASCII encoding to workaround mbstowcs() issue */ - res = decode_ascii_surrogateescape(arg, size); - if (res == NULL) - goto oom; - return res; - } -#endif - #ifdef HAVE_BROKEN_MBSTOWCS /* Some platforms have a broken implementation of * mbstowcs which does not count the characters that @@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size) goto oom; #endif /* HAVE_MBRTOWC */ return res; + oom: - if (size != NULL) + if (size != NULL) { *size = (size_t)-1; + } return NULL; -#endif /* __APPLE__ or __ANDROID__ */ } -/* Encode a wide character string to the locale encoding with the - surrogateescape error handler: surrogate characters in the range - U+DC80..U+DCFF are converted to bytes 0x80..0xFF. - Return a pointer to a newly allocated byte string, use PyMem_Free() to free - the memory. Return NULL on encoding or memory allocation error. +/* Decode a byte string from the locale encoding with the + surrogateescape error handler: undecodable bytes are decoded as characters + in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate + character, escape the bytes using the surrogateescape error handler instead + of decoding them. 
- If error_pos is not NULL, *error_pos is set to the index of the invalid - character on encoding error, or set to (size_t)-1 otherwise. + Return a pointer to a newly allocated wide character string, use + PyMem_RawFree() to free the memory. If size is not NULL, write the number of + wide characters excluding the null character into *size - Use the Py_DecodeLocale() function to decode the bytes string back to a wide - character string. */ -char* -Py_EncodeLocale(const wchar_t *text, size_t *error_pos) + Return NULL on decoding error or memory allocation error. If *size* is not + NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on + decoding error. + + Decoding errors should never happen, unless there is a bug in the C + library. + + Use the Py_EncodeLocale() function to encode the character string back to a + byte string. */ +wchar_t* +Py_DecodeLocale(const char* arg, size_t *size) { #if defined(__APPLE__) || defined(__ANDROID__) + return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); +#else + if (Py_UTF8Mode) { + return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); + } + +#ifndef MS_WINDOWS + if (force_ascii == -1) + force_ascii = check_force_ascii(); + + if (force_ascii) { + /* force ASCII encoding to workaround mbstowcs() issue */ + wchar_t *wstr = decode_ascii_surrogateescape(arg, size); + if (wstr == NULL) { + if (size != NULL) { + *size = (size_t)-1; + } + return NULL; + } + return wstr; + } +#endif + + return decode_locale(arg, size); +#endif /* __APPLE__ or __ANDROID__ */ +} + +static char* +_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos) +{ Py_ssize_t len; PyObject *unicode, *bytes = NULL; char *cpath; unicode = PyUnicode_FromWideChar(text, wcslen(text)); - if (unicode == NULL) + if (unicode == NULL) { return NULL; + } bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape"); Py_DECREF(unicode); if (bytes == NULL) { PyErr_Clear(); - if (error_pos != NULL) + if (error_pos != NULL) { *error_pos = 
(size_t)-1; + } return NULL; } @@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos) if (cpath == NULL) { PyErr_Clear(); Py_DECREF(bytes); - if (error_pos != NULL) + if (error_pos != NULL) { *error_pos = (size_t)-1; + } return NULL; } memcpy(cpath, PyBytes_AsString(bytes), len + 1); Py_DECREF(bytes); return cpath; -#else /* __APPLE__ */ +} + +static char* +encode_locale(const wchar_t *text, size_t *error_pos) +{ const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; -#ifndef MS_WINDOWS - if (force_ascii == -1) - force_ascii = check_force_ascii(); - - if (force_ascii) - return encode_ascii_surrogateescape(text, error_pos); -#endif - /* The function works in two steps: 1. compute the length of the output buffer in bytes (size) 2. outputs the bytes */ @@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos) bytes = result; } return result; +} + +/* Encode a wide character string to the locale encoding with the + surrogateescape error handler: surrogate characters in the range + U+DC80..U+DCFF are converted to bytes 0x80..0xFF. + + Return a pointer to a newly allocated byte string, use PyMem_Free() to free + the memory. Return NULL on encoding or memory allocation error. + + If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set + to the index of the invalid character on encoding error. + + Use the Py_DecodeLocale() function to decode the bytes string back to a wide + character string. 
*/ +char* +Py_EncodeLocale(const wchar_t *text, size_t *error_pos) +{ +#if defined(__APPLE__) || defined(__ANDROID__) + return _Py_EncodeLocaleUTF8(text, error_pos); +#else /* __APPLE__ */ + if (Py_UTF8Mode) { + return _Py_EncodeLocaleUTF8(text, error_pos); + } + +#ifndef MS_WINDOWS + if (force_ascii == -1) + force_ascii = check_force_ascii(); + + if (force_ascii) + return encode_ascii_surrogateescape(text, error_pos); +#endif + + return encode_locale(text, error_pos); #endif /* __APPLE__ or __ANDROID__ */ } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index f284855..2bac23d 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -54,7 +54,7 @@ extern grammar _PyParser_Grammar; /* From graminit.c */ static _PyInitError add_main_module(PyInterpreterState *interp); static _PyInitError initfsencoding(PyInterpreterState *interp); static _PyInitError initsite(void); -static _PyInitError init_sys_streams(void); +static _PyInitError init_sys_streams(PyInterpreterState *interp); static _PyInitError initsigs(void); static void call_py_exitfuncs(void); static void wait_for_thread_shutdown(void); @@ -925,7 +925,7 @@ _Py_InitializeMainInterpreter(const _PyMainInterpreterConfig *config) return err; } - err = init_sys_streams(); + err = init_sys_streams(interp); if (_Py_INIT_FAILED(err)) { return err; } @@ -1410,7 +1410,7 @@ new_interpreter(PyThreadState **tstate_p) return err; } - err = init_sys_streams(); + err = init_sys_streams(interp); if (_Py_INIT_FAILED(err)) { return err; } @@ -1558,7 +1558,13 @@ initfsencoding(PyInterpreterState *interp) Py_FileSystemDefaultEncodeErrors = "surrogatepass"; } #else - if (Py_FileSystemDefaultEncoding == NULL) { + if (Py_FileSystemDefaultEncoding == NULL && + interp->core_config.utf8_mode) + { + Py_FileSystemDefaultEncoding = "utf-8"; + Py_HasFileSystemDefaultEncoding = 1; + } + else if (Py_FileSystemDefaultEncoding == NULL) { Py_FileSystemDefaultEncoding = get_locale_encoding(); if (Py_FileSystemDefaultEncoding == NULL) 
{ return _Py_INIT_ERR("Unable to get the locale encoding"); @@ -1749,7 +1755,7 @@ error: /* Initialize sys.stdin, stdout, stderr and builtins.open */ static _PyInitError -init_sys_streams(void) +init_sys_streams(PyInterpreterState *interp) { PyObject *iomod = NULL, *wrapper; PyObject *bimod = NULL; @@ -1794,10 +1800,10 @@ init_sys_streams(void) encoding = _Py_StandardStreamEncoding; errors = _Py_StandardStreamErrors; if (!encoding || !errors) { - pythonioencoding = Py_GETENV("PYTHONIOENCODING"); - if (pythonioencoding) { + char *opt = Py_GETENV("PYTHONIOENCODING"); + if (opt && opt[0] != '\0') { char *err; - pythonioencoding = _PyMem_Strdup(pythonioencoding); + pythonioencoding = _PyMem_Strdup(opt); if (pythonioencoding == NULL) { PyErr_NoMemory(); goto error; @@ -1814,7 +1820,12 @@ init_sys_streams(void) encoding = pythonioencoding; } } - if (!errors && !(pythonioencoding && *pythonioencoding)) { + else if (interp->core_config.utf8_mode) { + encoding = "utf-8"; + errors = "surrogateescape"; + } + + if (!errors && !pythonioencoding) { /* Choose the default error handler based on the current locale */ errors = get_default_standard_stream_error_handler(); } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index f10099b..141e189 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -1814,6 +1814,7 @@ static PyStructSequence_Field flags_fields[] = { {"hash_randomization", "-R"}, {"isolated", "-I"}, {"dev_mode", "-X dev"}, + {"utf8_mode", "-X utf8"}, {0} }; @@ -1821,7 +1822,7 @@ static PyStructSequence_Desc flags_desc = { "sys.flags", /* name */ flags__doc__, /* doc */ flags_fields, /* fields */ - 14 + 15 }; static PyObject* @@ -1853,8 +1854,9 @@ make_flags(void) SetFlag(Py_QuietFlag); SetFlag(Py_HashRandomizationFlag); SetFlag(Py_IsolatedFlag); -#undef SetFlag PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(core_config->dev_mode)); + SetFlag(Py_UTF8Mode); +#undef SetFlag if (PyErr_Occurred()) { Py_DECREF(seq); |