summaryrefslogtreecommitdiffstats
path: root/Python
diff options
context:
space:
mode:
author: Victor Stinner <victor.stinner@gmail.com> 2017-12-13 11:29:09 (GMT)
committer: GitHub <noreply@github.com> 2017-12-13 11:29:09 (GMT)
commit: 91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch)
tree: ff002e0532736a97f3ddd367c1491e7b04611816 /Python
parent: c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff)
download: cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2
bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)
* Add -X utf8 command line option, PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup: enable automatically the UTF-8 mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page.
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 Mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8.
* Skip some tests relying on the current locale if the UTF-8 mode is enabled.
* Add test_utf8mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to return also the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value.
Diffstat (limited to 'Python')
-rw-r--r-- Python/bltinmodule.c 3
-rw-r--r-- Python/fileutils.c 174
-rw-r--r-- Python/pylifecycle.c 29
-rw-r--r-- Python/sysmodule.c 6
4 files changed, 127 insertions, 85 deletions
diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c
index 81774dc..23d7aa4 100644
--- a/Python/bltinmodule.c
+++ b/Python/bltinmodule.c
@@ -29,6 +29,9 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */
int Py_HasFileSystemDefaultEncoding = 0;
#endif
const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape";
+/* UTF-8 mode (PEP 540): if non-zero, use the UTF-8 encoding, and change stdin
+ and stdout error handler to "surrogateescape". */
+int Py_UTF8Mode = 0;
_Py_IDENTIFIER(__builtins__);
_Py_IDENTIFIER(__dict__);
diff --git a/Python/fileutils.c b/Python/fileutils.c
index eab58c5..03cc379 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -20,9 +20,8 @@ extern int winerror_to_errno(int);
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
-#if defined(__APPLE__) || defined(__ANDROID__)
-extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
-#endif
+extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
+ size_t *p_wlen);
#ifdef O_CLOEXEC
/* Does open() support the O_CLOEXEC flag? Possible values:
@@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
}
#endif
-
-/* Decode a byte string from the locale encoding with the
- surrogateescape error handler: undecodable bytes are decoded as characters
- in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
- character, escape the bytes using the surrogateescape error handler instead
- of decoding them.
-
- Return a pointer to a newly allocated wide character string, use
- PyMem_RawFree() to free the memory. If size is not NULL, write the number of
- wide characters excluding the null character into *size
-
- Return NULL on decoding error or memory allocation error. If *size* is not
- NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
- decoding error.
-
- Decoding errors should never happen, unless there is a bug in the C
- library.
-
- Use the Py_EncodeLocale() function to encode the character string back to a
- byte string. */
-wchar_t*
-Py_DecodeLocale(const char* arg, size_t *size)
+static wchar_t*
+decode_locale(const char* arg, size_t *size)
{
-#if defined(__APPLE__) || defined(__ANDROID__)
- wchar_t *wstr;
- wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
- if (size != NULL) {
- if (wstr != NULL)
- *size = wcslen(wstr);
- else
- *size = (size_t)-1;
- }
- return wstr;
-#else
wchar_t *res;
size_t argsize;
size_t count;
@@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
mbstate_t mbs;
#endif
-#ifndef MS_WINDOWS
- if (force_ascii == -1)
- force_ascii = check_force_ascii();
-
- if (force_ascii) {
- /* force ASCII encoding to workaround mbstowcs() issue */
- res = decode_ascii_surrogateescape(arg, size);
- if (res == NULL)
- goto oom;
- return res;
- }
-#endif
-
#ifdef HAVE_BROKEN_MBSTOWCS
/* Some platforms have a broken implementation of
* mbstowcs which does not count the characters that
@@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size)
goto oom;
#endif /* HAVE_MBRTOWC */
return res;
+
oom:
- if (size != NULL)
+ if (size != NULL) {
*size = (size_t)-1;
+ }
return NULL;
-#endif /* __APPLE__ or __ANDROID__ */
}
-/* Encode a wide character string to the locale encoding with the
- surrogateescape error handler: surrogate characters in the range
- U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
- Return a pointer to a newly allocated byte string, use PyMem_Free() to free
- the memory. Return NULL on encoding or memory allocation error.
+/* Decode a byte string from the locale encoding with the
+ surrogateescape error handler: undecodable bytes are decoded as characters
+ in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
+ character, escape the bytes using the surrogateescape error handler instead
+ of decoding them.
- If error_pos is not NULL, *error_pos is set to the index of the invalid
- character on encoding error, or set to (size_t)-1 otherwise.
+ Return a pointer to a newly allocated wide character string, use
+ PyMem_RawFree() to free the memory. If size is not NULL, write the number of
+ wide characters excluding the null character into *size
- Use the Py_DecodeLocale() function to decode the bytes string back to a wide
- character string. */
-char*
-Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
+ Return NULL on decoding error or memory allocation error. If *size* is not
+ NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
+ decoding error.
+
+ Decoding errors should never happen, unless there is a bug in the C
+ library.
+
+ Use the Py_EncodeLocale() function to encode the character string back to a
+ byte string. */
+wchar_t*
+Py_DecodeLocale(const char* arg, size_t *size)
{
#if defined(__APPLE__) || defined(__ANDROID__)
+ return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+#else
+ if (Py_UTF8Mode) {
+ return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+ }
+
+#ifndef MS_WINDOWS
+ if (force_ascii == -1)
+ force_ascii = check_force_ascii();
+
+ if (force_ascii) {
+ /* force ASCII encoding to workaround mbstowcs() issue */
+ wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
+ if (wstr == NULL) {
+ if (size != NULL) {
+ *size = (size_t)-1;
+ }
+ return NULL;
+ }
+ return wstr;
+ }
+#endif
+
+ return decode_locale(arg, size);
+#endif /* __APPLE__ or __ANDROID__ */
+}
+
+static char*
+_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
+{
Py_ssize_t len;
PyObject *unicode, *bytes = NULL;
char *cpath;
unicode = PyUnicode_FromWideChar(text, wcslen(text));
- if (unicode == NULL)
+ if (unicode == NULL) {
return NULL;
+ }
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Py_DECREF(unicode);
if (bytes == NULL) {
PyErr_Clear();
- if (error_pos != NULL)
+ if (error_pos != NULL) {
*error_pos = (size_t)-1;
+ }
return NULL;
}
@@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
if (cpath == NULL) {
PyErr_Clear();
Py_DECREF(bytes);
- if (error_pos != NULL)
+ if (error_pos != NULL) {
*error_pos = (size_t)-1;
+ }
return NULL;
}
memcpy(cpath, PyBytes_AsString(bytes), len + 1);
Py_DECREF(bytes);
return cpath;
-#else /* __APPLE__ */
+}
+
+static char*
+encode_locale(const wchar_t *text, size_t *error_pos)
+{
const size_t len = wcslen(text);
char *result = NULL, *bytes = NULL;
size_t i, size, converted;
wchar_t c, buf[2];
-#ifndef MS_WINDOWS
- if (force_ascii == -1)
- force_ascii = check_force_ascii();
-
- if (force_ascii)
- return encode_ascii_surrogateescape(text, error_pos);
-#endif
-
/* The function works in two steps:
1. compute the length of the output buffer in bytes (size)
2. outputs the bytes */
@@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
bytes = result;
}
return result;
+}
+
+/* Encode a wide character string to the locale encoding with the
+ surrogateescape error handler: surrogate characters in the range
+ U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
+
+ Return a pointer to a newly allocated byte string, use PyMem_Free() to free
+ the memory. Return NULL on encoding or memory allocation error.
+
+ If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set
+ to the index of the invalid character on encoding error.
+
+ Use the Py_DecodeLocale() function to decode the bytes string back to a wide
+ character string. */
+char*
+Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
+{
+#if defined(__APPLE__) || defined(__ANDROID__)
+ return _Py_EncodeLocaleUTF8(text, error_pos);
+#else /* __APPLE__ */
+ if (Py_UTF8Mode) {
+ return _Py_EncodeLocaleUTF8(text, error_pos);
+ }
+
+#ifndef MS_WINDOWS
+ if (force_ascii == -1)
+ force_ascii = check_force_ascii();
+
+ if (force_ascii)
+ return encode_ascii_surrogateescape(text, error_pos);
+#endif
+
+ return encode_locale(text, error_pos);
#endif /* __APPLE__ or __ANDROID__ */
}
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index f284855..2bac23d 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -54,7 +54,7 @@ extern grammar _PyParser_Grammar; /* From graminit.c */
static _PyInitError add_main_module(PyInterpreterState *interp);
static _PyInitError initfsencoding(PyInterpreterState *interp);
static _PyInitError initsite(void);
-static _PyInitError init_sys_streams(void);
+static _PyInitError init_sys_streams(PyInterpreterState *interp);
static _PyInitError initsigs(void);
static void call_py_exitfuncs(void);
static void wait_for_thread_shutdown(void);
@@ -925,7 +925,7 @@ _Py_InitializeMainInterpreter(const _PyMainInterpreterConfig *config)
return err;
}
- err = init_sys_streams();
+ err = init_sys_streams(interp);
if (_Py_INIT_FAILED(err)) {
return err;
}
@@ -1410,7 +1410,7 @@ new_interpreter(PyThreadState **tstate_p)
return err;
}
- err = init_sys_streams();
+ err = init_sys_streams(interp);
if (_Py_INIT_FAILED(err)) {
return err;
}
@@ -1558,7 +1558,13 @@ initfsencoding(PyInterpreterState *interp)
Py_FileSystemDefaultEncodeErrors = "surrogatepass";
}
#else
- if (Py_FileSystemDefaultEncoding == NULL) {
+ if (Py_FileSystemDefaultEncoding == NULL &&
+ interp->core_config.utf8_mode)
+ {
+ Py_FileSystemDefaultEncoding = "utf-8";
+ Py_HasFileSystemDefaultEncoding = 1;
+ }
+ else if (Py_FileSystemDefaultEncoding == NULL) {
Py_FileSystemDefaultEncoding = get_locale_encoding();
if (Py_FileSystemDefaultEncoding == NULL) {
return _Py_INIT_ERR("Unable to get the locale encoding");
@@ -1749,7 +1755,7 @@ error:
/* Initialize sys.stdin, stdout, stderr and builtins.open */
static _PyInitError
-init_sys_streams(void)
+init_sys_streams(PyInterpreterState *interp)
{
PyObject *iomod = NULL, *wrapper;
PyObject *bimod = NULL;
@@ -1794,10 +1800,10 @@ init_sys_streams(void)
encoding = _Py_StandardStreamEncoding;
errors = _Py_StandardStreamErrors;
if (!encoding || !errors) {
- pythonioencoding = Py_GETENV("PYTHONIOENCODING");
- if (pythonioencoding) {
+ char *opt = Py_GETENV("PYTHONIOENCODING");
+ if (opt && opt[0] != '\0') {
char *err;
- pythonioencoding = _PyMem_Strdup(pythonioencoding);
+ pythonioencoding = _PyMem_Strdup(opt);
if (pythonioencoding == NULL) {
PyErr_NoMemory();
goto error;
@@ -1814,7 +1820,12 @@ init_sys_streams(void)
encoding = pythonioencoding;
}
}
- if (!errors && !(pythonioencoding && *pythonioencoding)) {
+ else if (interp->core_config.utf8_mode) {
+ encoding = "utf-8";
+ errors = "surrogateescape";
+ }
+
+ if (!errors && !pythonioencoding) {
/* Choose the default error handler based on the current locale */
errors = get_default_standard_stream_error_handler();
}
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index f10099b..141e189 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1814,6 +1814,7 @@ static PyStructSequence_Field flags_fields[] = {
{"hash_randomization", "-R"},
{"isolated", "-I"},
{"dev_mode", "-X dev"},
+ {"utf8_mode", "-X utf8"},
{0}
};
@@ -1821,7 +1822,7 @@ static PyStructSequence_Desc flags_desc = {
"sys.flags", /* name */
flags__doc__, /* doc */
flags_fields, /* fields */
- 14
+ 15
};
static PyObject*
@@ -1853,8 +1854,9 @@ make_flags(void)
SetFlag(Py_QuietFlag);
SetFlag(Py_HashRandomizationFlag);
SetFlag(Py_IsolatedFlag);
-#undef SetFlag
PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(core_config->dev_mode));
+ SetFlag(Py_UTF8Mode);
+#undef SetFlag
if (PyErr_Occurred()) {
Py_DECREF(seq);