bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)

* Add the -X utf8 command line option, the PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup: automatically enable the UTF-8 mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page.
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8.
* Skip some tests relying on the current locale if the UTF-8 mode is enabled.
* Add test_utf8mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to also return the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value.
author: Victor Stinner <victor.stinner@gmail.com> 2017-12-13 11:29:09 (GMT)
committer: GitHub <noreply@github.com> 2017-12-13 11:29:09 (GMT)
commit: 91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch)
tree: ff002e0532736a97f3ddd367c1491e7b04611816 /Objects/unicodeobject.c
parent: c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff)
download: cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2
1 files changed, 21 insertions, 9 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 8d4fea8..c7480a0 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5079,16 +5079,17 @@ onError:
     return NULL;
 }
 
-#if defined(__APPLE__) || defined(__ANDROID__)
 
-/* Simplified UTF-8 decoder using surrogateescape error handler,
-   used to decode the command line arguments on Mac OS X and Android.
+/* UTF-8 decoder using the surrogateescape error handler.
 
-   Return a pointer to a newly allocated wide character string (use
-   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
+   On success, return a pointer to a newly allocated wide character string (use
+   PyMem_RawFree() to free the memory) and write the output length (in number
+   of wchar_t units) into *p_wlen (if p_wlen is set).
 
+   On failure (memory allocation error, or size too large to allocate),
+   return NULL and write (size_t)-1 into *p_wlen (if p_wlen is set). */
 wchar_t*
-_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
+_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
 {
     const char *e;
     wchar_t *unicode;
@@ -5096,11 +5097,20 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
 
     /* Note: size will always be longer than the resulting Unicode
        character count */
-    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
+    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
+        if (p_wlen) {
+            *p_wlen = (size_t)-1;
+        }
         return NULL;
+    }
+
     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
-    if (!unicode)
+    if (!unicode) {
+        if (p_wlen) {
+            *p_wlen = (size_t)-1;
+        }
         return NULL;
+    }
 
     /* Unpack UTF-8 encoded data */
     e = s + size;
@@ -5130,10 +5140,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
         }
     }
     unicode[outpos] = L'\0';
+    if (p_wlen) {
+        *p_wlen = outpos;
+    }
     return unicode;
 }
 
-#endif /* __APPLE__ or __ANDROID__ */
 
 /* Primary internal function which creates utf8 encoded bytes objects.
author	Victor Stinner <victor.stinner@gmail.com>	2017-12-13 11:29:09 (GMT)
committer	GitHub <noreply@github.com>	2017-12-13 11:29:09 (GMT)
commit	91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch)
tree	ff002e0532736a97f3ddd367c1491e7b04611816 /Objects/unicodeobject.c
parent	c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff)
download	cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2