bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)

* Add -X utf8 command line option, PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag. * If the LC_CTYPE locale is "C" at startup: enable automatically the UTF-8 mode. * Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page * locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode. * Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 Mode. * Update subprocess._args_from_interpreter_flags() to handle -X utf8 * Skip some tests relying on the current locale if the UTF-8 mode is enabled. * Add test_utf8mode.py. * _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to return also the length (number of wide characters). * pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value.
author: Victor Stinner <victor.stinner@gmail.com> 2017-12-13 11:29:09 (GMT)
committer: GitHub <noreply@github.com> 2017-12-13 11:29:09 (GMT)
commit: 91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch)
tree: ff002e0532736a97f3ddd367c1491e7b04611816 /Python
parent: c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff)
download: cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2
4 files changed, 127 insertions, 85 deletions
diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c
index 81774dc..23d7aa4 100644
--- a/Python/bltinmodule.c
+++ b/Python/bltinmodule.c
@@ -29,6 +29,9 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */
 int Py_HasFileSystemDefaultEncoding = 0;
 #endif
 const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape";
+/* UTF-8 mode (PEP 540): if non-zero, use the UTF-8 encoding, and change stdin
+   and stdout error handler to "surrogateescape". */
+int Py_UTF8Mode = 0;
 
 _Py_IDENTIFIER(__builtins__);
 _Py_IDENTIFIER(__dict__);
diff --git a/Python/fileutils.c b/Python/fileutils.c
index eab58c5..03cc379 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -20,9 +20,8 @@ extern int winerror_to_errno(int);
 #include <fcntl.h>
 #endif /* HAVE_FCNTL_H */
 
-#if defined(__APPLE__) || defined(__ANDROID__)
-extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
-#endif
+extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
+                                               size_t *p_wlen);
 
 #ifdef O_CLOEXEC
 /* Does open() support the O_CLOEXEC flag? Possible values:
@@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
 }
 #endif
 
-
-/* Decode a byte string from the locale encoding with the
-   surrogateescape error handler: undecodable bytes are decoded as characters
-   in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
-   character, escape the bytes using the surrogateescape error handler instead
-   of decoding them.
-
-   Return a pointer to a newly allocated wide character string, use
-   PyMem_RawFree() to free the memory. If size is not NULL, write the number of
-   wide characters excluding the null character into *size
-
-   Return NULL on decoding error or memory allocation error. If *size* is not
-   NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
-   decoding error.
-
-   Decoding errors should never happen, unless there is a bug in the C
-   library.
-
-   Use the Py_EncodeLocale() function to encode the character string back to a
-   byte string. */
-wchar_t*
-Py_DecodeLocale(const char* arg, size_t *size)
+static wchar_t*
+decode_locale(const char* arg, size_t *size)
 {
-#if defined(__APPLE__) || defined(__ANDROID__)
-    wchar_t *wstr;
-    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
-    if (size != NULL) {
-        if (wstr != NULL)
-            *size = wcslen(wstr);
-        else
-            *size = (size_t)-1;
-    }
-    return wstr;
-#else
     wchar_t *res;
     size_t argsize;
     size_t count;
@@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
     mbstate_t mbs;
 #endif
 
-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
-        force_ascii = check_force_ascii();
-
-    if (force_ascii) {
-        /* force ASCII encoding to workaround mbstowcs() issue */
-        res = decode_ascii_surrogateescape(arg, size);
-        if (res == NULL)
-            goto oom;
-        return res;
-    }
-#endif
-
 #ifdef HAVE_BROKEN_MBSTOWCS
     /* Some platforms have a broken implementation of
      * mbstowcs which does not count the characters that
@@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size)
         goto oom;
 #endif   /* HAVE_MBRTOWC */
     return res;
+
 oom:
-    if (size != NULL)
+    if (size != NULL) {
         *size = (size_t)-1;
+    }
     return NULL;
-#endif   /* __APPLE__ or __ANDROID__ */
 }
 
-/* Encode a wide character string to the locale encoding with the
-   surrogateescape error handler: surrogate characters in the range
-   U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
 
-   Return a pointer to a newly allocated byte string, use PyMem_Free() to free
-   the memory. Return NULL on encoding or memory allocation error.
+/* Decode a byte string from the locale encoding with the
+   surrogateescape error handler: undecodable bytes are decoded as characters
+   in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
+   character, escape the bytes using the surrogateescape error handler instead
+   of decoding them.
 
-   If error_pos is not NULL, *error_pos is set to the index of the invalid
-   character on encoding error, or set to (size_t)-1 otherwise.
+   Return a pointer to a newly allocated wide character string, use
+   PyMem_RawFree() to free the memory. If size is not NULL, write the number of
+   wide characters excluding the null character into *size
 
-   Use the Py_DecodeLocale() function to decode the bytes string back to a wide
-   character string. */
-char*
-Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
+   Return NULL on decoding error or memory allocation error. If *size* is not
+   NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
+   decoding error.
+
+   Decoding errors should never happen, unless there is a bug in the C
+   library.
+
+   Use the Py_EncodeLocale() function to encode the character string back to a
+   byte string. */
+wchar_t*
+Py_DecodeLocale(const char* arg, size_t *size)
 {
 #if defined(__APPLE__) || defined(__ANDROID__)
+    return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+#else
+    if (Py_UTF8Mode) {
+        return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+    }
+
+#ifndef MS_WINDOWS
+    if (force_ascii == -1)
+        force_ascii = check_force_ascii();
+
+    if (force_ascii) {
+        /* force ASCII encoding to workaround mbstowcs() issue */
+        wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
+        if (wstr == NULL) {
+            if (size != NULL) {
+                *size = (size_t)-1;
+            }
+            return NULL;
+        }
+        return wstr;
+    }
+#endif
+
+    return decode_locale(arg, size);
+#endif   /* __APPLE__ or __ANDROID__ */
+}
+
+static char*
+_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
+{
     Py_ssize_t len;
     PyObject *unicode, *bytes = NULL;
     char *cpath;
 
     unicode = PyUnicode_FromWideChar(text, wcslen(text));
-    if (unicode == NULL)
+    if (unicode == NULL) {
         return NULL;
+    }
 
     bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
     Py_DECREF(unicode);
     if (bytes == NULL) {
         PyErr_Clear();
-        if (error_pos != NULL)
+        if (error_pos != NULL) {
             *error_pos = (size_t)-1;
+        }
         return NULL;
     }
 
@@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
     if (cpath == NULL) {
         PyErr_Clear();
         Py_DECREF(bytes);
-        if (error_pos != NULL)
+        if (error_pos != NULL) {
             *error_pos = (size_t)-1;
+        }
         return NULL;
     }
     memcpy(cpath, PyBytes_AsString(bytes), len + 1);
     Py_DECREF(bytes);
     return cpath;
-#else   /* __APPLE__ */
+}
+
+static char*
+encode_locale(const wchar_t *text, size_t *error_pos)
+{
     const size_t len = wcslen(text);
     char *result = NULL, *bytes = NULL;
     size_t i, size, converted;
     wchar_t c, buf[2];
 
-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
-        force_ascii = check_force_ascii();
-
-    if (force_ascii)
-        return encode_ascii_surrogateescape(text, error_pos);
-#endif
-
     /* The function works in two steps:
        1. compute the length of the output buffer in bytes (size)
        2. outputs the bytes */
@@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
         bytes = result;
     }
     return result;
+}
+
+/* Encode a wide character string to the locale encoding with the
+   surrogateescape error handler: surrogate characters in the range
+   U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
+
+   Return a pointer to a newly allocated byte string, use PyMem_Free() to free
+   the memory. Return NULL on encoding or memory allocation error.
+
+   If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set
+   to the index of the invalid character on encoding error.
+
+   Use the Py_DecodeLocale() function to decode the bytes string back to a wide
+   character string. */
+char*
+Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
+{
+#if defined(__APPLE__) || defined(__ANDROID__)
+    return _Py_EncodeLocaleUTF8(text, error_pos);
+#else   /* __APPLE__ */
+    if (Py_UTF8Mode) {
+        return _Py_EncodeLocaleUTF8(text, error_pos);
+    }
+
+#ifndef MS_WINDOWS
+    if (force_ascii == -1)
+        force_ascii = check_force_ascii();
+
+    if (force_ascii)
+        return encode_ascii_surrogateescape(text, error_pos);
+#endif
+
+    return encode_locale(text, error_pos);
 #endif   /* __APPLE__ or __ANDROID__ */
 }
 
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index f284855..2bac23d 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -54,7 +54,7 @@ extern grammar _PyParser_Grammar; /* From graminit.c */
 static _PyInitError add_main_module(PyInterpreterState *interp);
 static _PyInitError initfsencoding(PyInterpreterState *interp);
 static _PyInitError initsite(void);
-static _PyInitError init_sys_streams(void);
+static _PyInitError init_sys_streams(PyInterpreterState *interp);
 static _PyInitError initsigs(void);
 static void call_py_exitfuncs(void);
 static void wait_for_thread_shutdown(void);
@@ -925,7 +925,7 @@ _Py_InitializeMainInterpreter(const _PyMainInterpreterConfig *config)
         return err;
     }
 
-    err = init_sys_streams();
+    err = init_sys_streams(interp);
     if (_Py_INIT_FAILED(err)) {
         return err;
     }
@@ -1410,7 +1410,7 @@ new_interpreter(PyThreadState **tstate_p)
             return err;
         }
 
-        err = init_sys_streams();
+        err = init_sys_streams(interp);
         if (_Py_INIT_FAILED(err)) {
             return err;
         }
@@ -1558,7 +1558,13 @@ initfsencoding(PyInterpreterState *interp)
         Py_FileSystemDefaultEncodeErrors = "surrogatepass";
     }
 #else
-    if (Py_FileSystemDefaultEncoding == NULL) {
+    if (Py_FileSystemDefaultEncoding == NULL &&
+        interp->core_config.utf8_mode)
+    {
+        Py_FileSystemDefaultEncoding = "utf-8";
+        Py_HasFileSystemDefaultEncoding = 1;
+    }
+    else if (Py_FileSystemDefaultEncoding == NULL) {
         Py_FileSystemDefaultEncoding = get_locale_encoding();
         if (Py_FileSystemDefaultEncoding == NULL) {
             return _Py_INIT_ERR("Unable to get the locale encoding");
@@ -1749,7 +1755,7 @@ error:
 
 /* Initialize sys.stdin, stdout, stderr and builtins.open */
 static _PyInitError
-init_sys_streams(void)
+init_sys_streams(PyInterpreterState *interp)
 {
     PyObject *iomod = NULL, *wrapper;
     PyObject *bimod = NULL;
@@ -1794,10 +1800,10 @@ init_sys_streams(void)
     encoding = _Py_StandardStreamEncoding;
     errors = _Py_StandardStreamErrors;
     if (!encoding || !errors) {
-        pythonioencoding = Py_GETENV("PYTHONIOENCODING");
-        if (pythonioencoding) {
+        char *opt = Py_GETENV("PYTHONIOENCODING");
+        if (opt && opt[0] != '\0') {
             char *err;
-            pythonioencoding = _PyMem_Strdup(pythonioencoding);
+            pythonioencoding = _PyMem_Strdup(opt);
             if (pythonioencoding == NULL) {
                 PyErr_NoMemory();
                 goto error;
@@ -1814,7 +1820,12 @@ init_sys_streams(void)
                 encoding = pythonioencoding;
             }
         }
-        if (!errors && !(pythonioencoding && *pythonioencoding)) {
+        else if (interp->core_config.utf8_mode) {
+            encoding = "utf-8";
+            errors = "surrogateescape";
+        }
+
+        if (!errors && !pythonioencoding) {
             /* Choose the default error handler based on the current locale */
             errors = get_default_standard_stream_error_handler();
         }
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index f10099b..141e189 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1814,6 +1814,7 @@ static PyStructSequence_Field flags_fields[] = {
     {"hash_randomization",      "-R"},
     {"isolated",                "-I"},
     {"dev_mode",                "-X dev"},
+    {"utf8_mode",               "-X utf8"},
     {0}
 };
 
@@ -1821,7 +1822,7 @@ static PyStructSequence_Desc flags_desc = {
     "sys.flags",        /* name */
     flags__doc__,       /* doc */
     flags_fields,       /* fields */
-    14
+    15
 };
 
 static PyObject*
@@ -1853,8 +1854,9 @@ make_flags(void)
     SetFlag(Py_QuietFlag);
     SetFlag(Py_HashRandomizationFlag);
     SetFlag(Py_IsolatedFlag);
-#undef SetFlag
     PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(core_config->dev_mode));
+    SetFlag(Py_UTF8Mode);
+#undef SetFlag
 
     if (PyErr_Occurred()) {
         Py_DECREF(seq);
author	Victor Stinner <victor.stinner@gmail.com>	2017-12-13 11:29:09 (GMT)
committer	GitHub <noreply@github.com>	2017-12-13 11:29:09 (GMT)
commit	91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch)
tree	ff002e0532736a97f3ddd367c1491e7b04611816 /Python
parent	c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff)
download	cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2