From 91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 13 Dec 2017 12:29:09 +0100 Subject: bpo-29240: PEP 540: Add a new UTF-8 Mode (#855) * Add -X utf8 command line option, PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag. * If the LC_CTYPE locale is "C" at startup: automatically enable the UTF-8 mode. * Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page. * locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode. * Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 Mode. * Update subprocess._args_from_interpreter_flags() to handle -X utf8. * Skip some tests relying on the current locale if the UTF-8 mode is enabled. * Add test_utf8_mode.py. * _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to also return the length (number of wide characters). * pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value. 
--- Doc/c-api/sys.rst | 13 +- Doc/library/locale.rst | 7 + Doc/library/sys.rst | 13 +- Doc/using/cmdline.rst | 13 +- Doc/whatsnew/3.7.rst | 21 +++ Include/fileobject.h | 4 + Include/pystate.h | 1 + Lib/_bootlocale.py | 6 + Lib/encodings/__init__.py | 5 +- Lib/locale.py | 6 + Lib/subprocess.py | 2 +- Lib/test/test_builtin.py | 1 + Lib/test/test_c_locale_coercion.py | 2 +- Lib/test/test_codecs.py | 10 +- Lib/test/test_io.py | 2 + Lib/test/test_sys.py | 8 +- Lib/test/test_utf8_mode.py | 206 +++++++++++++++++++++ .../2017-12-05-23-10-58.bpo-29240.qpJP5l.rst | 1 + Modules/_winapi.c | 19 +- Modules/clinic/_winapi.c.h | 20 +- Modules/main.c | 146 +++++++++------ Objects/unicodeobject.c | 30 ++- Programs/python.c | 23 ++- Python/bltinmodule.c | 3 + Python/fileutils.c | 174 +++++++++-------- Python/pylifecycle.c | 29 ++- Python/sysmodule.c | 6 +- 27 files changed, 593 insertions(+), 178 deletions(-) create mode 100644 Lib/test/test_utf8_mode.py create mode 100644 Misc/NEWS.d/next/Core and Builtins/2017-12-05-23-10-58.bpo-29240.qpJP5l.rst diff --git a/Doc/c-api/sys.rst b/Doc/c-api/sys.rst index 95d9d65..20bc7bd 100644 --- a/Doc/c-api/sys.rst +++ b/Doc/c-api/sys.rst @@ -127,6 +127,9 @@ Operating System Utilities .. versionadded:: 3.5 + .. versionchanged:: 3.7 + The function now uses the UTF-8 encoding in the UTF-8 mode. + .. c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos) @@ -138,12 +141,15 @@ Operating System Utilities to free the memory. Return ``NULL`` on encoding error or memory allocation error - If error_pos is not ``NULL``, ``*error_pos`` is set to the index of the - invalid character on encoding error, or set to ``(size_t)-1`` otherwise. + If error_pos is not ``NULL``, ``*error_pos`` is set to ``(size_t)-1`` on + success, or set to the index of the invalid character on encoding error. Use the :c:func:`Py_DecodeLocale` function to decode the bytes string back to a wide character string. + .. 
versionchanged:: 3.7 + The function now uses the UTF-8 encoding in the UTF-8 mode. + .. seealso:: The :c:func:`PyUnicode_EncodeFSDefault` and @@ -151,6 +157,9 @@ Operating System Utilities .. versionadded:: 3.5 + .. versionchanged:: 3.7 + The function now supports the UTF-8 mode. + .. _systemfunctions: diff --git a/Doc/library/locale.rst b/Doc/library/locale.rst index e8567a7..7da94a2 100644 --- a/Doc/library/locale.rst +++ b/Doc/library/locale.rst @@ -316,6 +316,13 @@ The :mod:`locale` module defines the following exception and functions: preferences, so this function is not thread-safe. If invoking setlocale is not necessary or desired, *do_setlocale* should be set to ``False``. + On Android or in the UTF-8 mode (:option:`-X` ``utf8`` option), always + return ``'UTF-8'``, the locale and the *do_setlocale* argument are ignored. + + .. versionchanged:: 3.7 + The function now always returns ``UTF-8`` on Android or if the UTF-8 mode + is enabled. + .. function:: normalize(localename) diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst index 9e47681..957d02b 100644 --- a/Doc/library/sys.rst +++ b/Doc/library/sys.rst @@ -313,6 +313,9 @@ always available. has caught :exc:`SystemExit` (such as an error flushing buffered data in the standard streams), the exit status is changed to 120. + .. versionchanged:: 3.7 + Added ``utf8_mode`` attribute for the new :option:`-X` ``utf8`` flag. + .. data:: flags @@ -335,6 +338,7 @@ always available. :const:`quiet` :option:`-q` :const:`hash_randomization` :option:`-R` :const:`dev_mode` :option:`-X` ``dev`` + :const:`utf8_mode` :option:`-X` ``utf8`` ============================= ============================= .. versionchanged:: 3.2 @@ -347,7 +351,8 @@ always available. Removed obsolete ``division_warning`` attribute. .. versionchanged:: 3.7 - Added ``dev_mode`` attribute for the new :option:`-X` ``dev`` flag. 
+ Added ``dev_mode`` attribute for the new :option:`-X` ``dev`` flag + and ``utf8_mode`` attribute for the new :option:`-X` ``utf8`` flag. .. data:: float_info @@ -492,6 +497,8 @@ always available. :func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that the correct encoding and errors mode are used. + * In the UTF-8 mode, the encoding is ``utf-8`` on any platform. + * On Mac OS X, the encoding is ``'utf-8'``. * On Unix, the encoding is the locale encoding. @@ -506,6 +513,10 @@ always available. Windows is no longer guaranteed to return ``'mbcs'``. See :pep:`529` and :func:`_enablelegacywindowsfsencoding` for more information. + .. versionchanged:: 3.7 + Return 'utf-8' in the UTF-8 mode. + + .. function:: getfilesystemencodeerrors() Return the name of the error mode used to convert between Unicode filenames diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index e32f77e..5cb9071 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -439,6 +439,9 @@ Miscellaneous options * Set the :attr:`~sys.flags.dev_mode` attribute of :attr:`sys.flags` to ``True`` + * ``-X utf8`` enables the UTF-8 mode, whereas ``-X utf8=0`` disables the + UTF-8 mode. + It also allows passing arbitrary values and retrieving them through the :data:`sys._xoptions` dictionary. @@ -455,7 +458,7 @@ Miscellaneous options The ``-X showalloccount`` option. .. versionadded:: 3.7 - The ``-X importtime`` and ``-X dev`` options. + The ``-X importtime``, ``-X dev`` and ``-X utf8`` options. Options you shouldn't use @@ -816,6 +819,14 @@ conflict. .. versionadded:: 3.7 +.. envvar:: PYTHONUTF8 + + If set to ``1``, enable the UTF-8 mode. If set to ``0``, disable the UTF-8 + mode. Any other non-empty string causes an error. + + .. 
versionadded:: 3.7 + + Debug-mode variables ~~~~~~~~~~~~~~~~~~~~ diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 58bfaef..81a88a0 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -185,6 +185,23 @@ resolution on Linux and Windows. PEP written and implemented by Victor Stinner +PEP 540: Add a new UTF-8 mode +----------------------------- + +Add a new UTF-8 mode to ignore the locale, use the UTF-8 encoding, and change +:data:`sys.stdin` and :data:`sys.stdout` error handlers to ``surrogateescape``. +This mode is enabled by default in the POSIX locale, but otherwise disabled by +default. + +The new :option:`-X` ``utf8`` command line option and :envvar:`PYTHONUTF8` +environment variable are added to control the UTF-8 mode. + +.. seealso:: + + :pep:`540` -- Add a new UTF-8 mode + PEP written and implemented by Victor Stinner + + New Development Mode: -X dev ---------------------------- @@ -353,6 +370,10 @@ Added another argument *monetary* in :meth:`format_string` of :mod:`locale`. If *monetary* is true, the conversion uses monetary thousands separator and grouping strings. (Contributed by Garvit in :issue:`10379`.) +The :func:`locale.getpreferredencoding` function now always returns ``'UTF-8'`` +on Android or in the UTF-8 mode (:option:`-X` ``utf8`` option), the locale and +the *do_setlocale* argument are ignored. 
+ math ---- diff --git a/Include/fileobject.h b/Include/fileobject.h index 0b1678e..89e8dd6 100644 --- a/Include/fileobject.h +++ b/Include/fileobject.h @@ -28,6 +28,10 @@ PyAPI_DATA(const char *) Py_FileSystemDefaultEncodeErrors; #endif PyAPI_DATA(int) Py_HasFileSystemDefaultEncoding; +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000 +PyAPI_DATA(int) Py_UTF8Mode; +#endif + /* Internal API The std printer acts as a preliminary sys.stderr until the new io diff --git a/Include/pystate.h b/Include/pystate.h index d149aeb..c7ea179 100644 --- a/Include/pystate.h +++ b/Include/pystate.h @@ -38,6 +38,7 @@ typedef struct { int show_alloc_count; /* -X showalloccount */ int dump_refs; /* PYTHONDUMPREFS */ int malloc_stats; /* PYTHONMALLOCSTATS */ + int utf8_mode; /* -X utf8 or PYTHONUTF8 environment variable */ } _PyCoreConfig; #define _PyCoreConfig_INIT (_PyCoreConfig){.use_hash_seed = -1} diff --git a/Lib/_bootlocale.py b/Lib/_bootlocale.py index 0c61b0d..3273a3b 100644 --- a/Lib/_bootlocale.py +++ b/Lib/_bootlocale.py @@ -9,6 +9,8 @@ import _locale if sys.platform.startswith("win"): def getpreferredencoding(do_setlocale=True): + if sys.flags.utf8_mode: + return 'UTF-8' return _locale._getdefaultlocale()[1] else: try: @@ -21,6 +23,8 @@ else: return 'UTF-8' else: def getpreferredencoding(do_setlocale=True): + if sys.flags.utf8_mode: + return 'UTF-8' # This path for legacy systems needs the more complex # getdefaultlocale() function, import the full locale module. 
import locale @@ -28,6 +32,8 @@ else: else: def getpreferredencoding(do_setlocale=True): assert not do_setlocale + if sys.flags.utf8_mode: + return 'UTF-8' result = _locale.nl_langinfo(_locale.CODESET) if not result and sys.platform == 'darwin': # nl_langinfo can return an empty string diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index aa2fb7c..025b7a8 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -158,8 +158,9 @@ codecs.register(search_function) if sys.platform == 'win32': def _alias_mbcs(encoding): try: - import _bootlocale - if encoding == _bootlocale.getpreferredencoding(False): + import _winapi + ansi_code_page = "cp%s" % _winapi.GetACP() + if encoding == ansi_code_page: import encodings.mbcs return encodings.mbcs.getregentry() except ImportError: diff --git a/Lib/locale.py b/Lib/locale.py index f1d157d..18079e7 100644 --- a/Lib/locale.py +++ b/Lib/locale.py @@ -617,6 +617,8 @@ if sys.platform.startswith("win"): # On Win32, this will return the ANSI code page def getpreferredencoding(do_setlocale = True): """Return the charset that the user is likely using.""" + if sys.flags.utf8_mode: + return 'UTF-8' import _bootlocale return _bootlocale.getpreferredencoding(False) else: @@ -634,6 +636,8 @@ else: def getpreferredencoding(do_setlocale = True): """Return the charset that the user is likely using, by looking at environment variables.""" + if sys.flags.utf8_mode: + return 'UTF-8' res = getdefaultlocale()[1] if res is None: # LANG not set, default conservatively to ASCII @@ -643,6 +647,8 @@ else: def getpreferredencoding(do_setlocale = True): """Return the charset that the user is likely using, according to the system configuration.""" + if sys.flags.utf8_mode: + return 'UTF-8' import _bootlocale if do_setlocale: oldloc = setlocale(LC_CTYPE) diff --git a/Lib/subprocess.py b/Lib/subprocess.py index 301433c..65b4086 100644 --- a/Lib/subprocess.py +++ b/Lib/subprocess.py @@ -280,7 +280,7 @@ def 
_args_from_interpreter_flags(): if dev_mode: args.extend(('-X', 'dev')) for opt in ('faulthandler', 'tracemalloc', 'importtime', - 'showalloccount', 'showrefcount'): + 'showalloccount', 'showrefcount', 'utf8'): if opt in xoptions: value = xoptions[opt] if value is True: diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py index 0a61c05..9329318 100644 --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -1022,6 +1022,7 @@ class BuiltinTest(unittest.TestCase): self.assertRaises(ValueError, open, 'a\x00b') self.assertRaises(ValueError, open, b'a\x00b') + @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled") def test_open_default_encoding(self): old_environ = dict(os.environ) try: diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index 2a22739..c0845d7 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -130,7 +130,7 @@ class EncodingDetails(_EncodingDetails): that. """ result, py_cmd = run_python_until_end( - "-c", cls.CHILD_PROCESS_SCRIPT, + "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT, __isolated=True, **env_vars ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index eb21a39..a59a5e2 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -5,6 +5,7 @@ import locale import sys import unittest import encodings +from unittest import mock from test import support @@ -3180,16 +3181,9 @@ class CodePageTest(unittest.TestCase): def test_mbcs_alias(self): # Check that looking up our 'default' codepage will return # mbcs when we don't have a more specific one available - import _bootlocale - def _get_fake_codepage(*a): - return 'cp123' - old_getpreferredencoding = _bootlocale.getpreferredencoding - _bootlocale.getpreferredencoding = _get_fake_codepage - try: + with mock.patch('_winapi.GetACP', return_value=123): codec = codecs.lookup('cp123') self.assertEqual(codec.name, 'mbcs') - finally: - _bootlocale.getpreferredencoding = 
old_getpreferredencoding class ASCIITest(unittest.TestCase): diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 6bb4127..6674831 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -2580,6 +2580,7 @@ class TextIOWrapperTest(unittest.TestCase): t.reconfigure(line_buffering=None) self.assertEqual(t.line_buffering, True) + @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled") def test_default_encoding(self): old_environ = dict(os.environ) try: @@ -2599,6 +2600,7 @@ class TextIOWrapperTest(unittest.TestCase): os.environ.update(old_environ) @support.cpython_only + @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled") def test_device_encoding(self): # Issue 15989 import _testcapi diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 6346094..6933b41 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -527,7 +527,7 @@ class SysModuleTest(unittest.TestCase): "inspect", "interactive", "optimize", "dont_write_bytecode", "no_user_site", "no_site", "ignore_environment", "verbose", "bytes_warning", "quiet", "hash_randomization", "isolated", - "dev_mode") + "dev_mode", "utf8_mode") for attr in attrs: self.assertTrue(hasattr(sys.flags, attr), attr) attr_type = bool if attr == "dev_mode" else int @@ -535,6 +535,8 @@ class SysModuleTest(unittest.TestCase): self.assertTrue(repr(sys.flags)) self.assertEqual(len(sys.flags), len(attrs)) + self.assertIn(sys.flags.utf8_mode, {0, 1, 2}) + def assert_raise_on_new_sys_type(self, sys_attr): # Users are intentionally prevented from creating new instances of # sys.flags, sys.version_info, and sys.getwindowsversion. 
@@ -710,8 +712,8 @@ class SysModuleTest(unittest.TestCase): # have no any effect out = self.c_locale_get_error_handler(encoding=':') self.assertEqual(out, - 'stdin: surrogateescape\n' - 'stdout: surrogateescape\n' + 'stdin: strict\n' + 'stdout: strict\n' 'stderr: backslashreplace\n') out = self.c_locale_get_error_handler(encoding='') self.assertEqual(out, diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py new file mode 100644 index 0000000..275a6ea --- /dev/null +++ b/Lib/test/test_utf8_mode.py @@ -0,0 +1,206 @@ +""" +Test the implementation of the PEP 540: the UTF-8 Mode. +""" + +import locale +import os +import sys +import textwrap +import unittest +from test.support.script_helper import assert_python_ok, assert_python_failure + + +MS_WINDOWS = (sys.platform == 'win32') + + +class UTF8ModeTests(unittest.TestCase): + # Override PYTHONUTF8 and PYTHONLEGACYWINDOWSFSENCODING environment + # variables by default + DEFAULT_ENV = {'PYTHONUTF8': '', 'PYTHONLEGACYWINDOWSFSENCODING': ''} + + def posix_locale(self): + loc = locale.setlocale(locale.LC_CTYPE, None) + return (loc == 'C') + + def get_output(self, *args, failure=False, **kw): + kw = dict(self.DEFAULT_ENV, **kw) + if failure: + out = assert_python_failure(*args, **kw) + out = out[2] + else: + out = assert_python_ok(*args, **kw) + out = out[1] + return out.decode().rstrip("\n\r") + + @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale') + def test_posix_locale(self): + code = 'import sys; print(sys.flags.utf8_mode)' + + out = self.get_output('-c', code, LC_ALL='C') + self.assertEqual(out, '1') + + def test_xoption(self): + code = 'import sys; print(sys.flags.utf8_mode)' + + out = self.get_output('-X', 'utf8', '-c', code) + self.assertEqual(out, '1') + + # undocumented but accepted syntax: -X utf8=1 + out = self.get_output('-X', 'utf8=1', '-c', code) + self.assertEqual(out, '1') + + out = self.get_output('-X', 'utf8=0', '-c', code) + self.assertEqual(out, '0') + + if MS_WINDOWS: + # 
PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 + # and has the priority over -X utf8 + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONLEGACYWINDOWSFSENCODING='1') + self.assertEqual(out, '0') + + def test_env_var(self): + code = 'import sys; print(sys.flags.utf8_mode)' + + out = self.get_output('-c', code, PYTHONUTF8='1') + self.assertEqual(out, '1') + + out = self.get_output('-c', code, PYTHONUTF8='0') + self.assertEqual(out, '0') + + # -X utf8 has the priority over PYTHONUTF8 + out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1') + self.assertEqual(out, '0') + + if MS_WINDOWS: + # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode + # and has the priority over PYTHONUTF8 + out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1', + PYTHONLEGACYWINDOWSFSENCODING='1') + self.assertEqual(out, '0') + + # Cannot test with the POSIX locale, since the POSIX locale enables + # the UTF-8 mode + if not self.posix_locale(): + # PYTHONUTF8 should be ignored if -E is used + out = self.get_output('-E', '-c', code, PYTHONUTF8='1') + self.assertEqual(out, '0') + + # invalid mode + out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True) + self.assertIn('invalid PYTHONUTF8 environment variable value', + out.rstrip()) + + def test_filesystemencoding(self): + code = textwrap.dedent(''' + import sys + print("{}/{}".format(sys.getfilesystemencoding(), + sys.getfilesystemencodeerrors())) + ''') + + if MS_WINDOWS: + expected = 'utf-8/surrogatepass' + else: + expected = 'utf-8/surrogateescape' + + out = self.get_output('-X', 'utf8', '-c', code) + self.assertEqual(out, expected) + + if MS_WINDOWS: + # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode + # and has the priority over -X utf8 and PYTHONUTF8 + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONUTF8='strict', + PYTHONLEGACYWINDOWSFSENCODING='1') + self.assertEqual(out, 'mbcs/replace') + + def test_stdio(self): + code = textwrap.dedent(''' + import sys + print(f"stdin: 
{sys.stdin.encoding}/{sys.stdin.errors}") + print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}") + print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}") + ''') + + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONIOENCODING='') + self.assertEqual(out.splitlines(), + ['stdin: utf-8/surrogateescape', + 'stdout: utf-8/surrogateescape', + 'stderr: utf-8/backslashreplace']) + + # PYTHONIOENCODING has the priority over PYTHONUTF8 + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONIOENCODING="latin1") + self.assertEqual(out.splitlines(), + ['stdin: latin1/strict', + 'stdout: latin1/strict', + 'stderr: latin1/backslashreplace']) + + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONIOENCODING=":namereplace") + self.assertEqual(out.splitlines(), + ['stdin: UTF-8/namereplace', + 'stdout: UTF-8/namereplace', + 'stderr: UTF-8/backslashreplace']) + + def test_io(self): + code = textwrap.dedent(''' + import sys + filename = sys.argv[1] + with open(filename) as fp: + print(f"{fp.encoding}/{fp.errors}") + ''') + filename = __file__ + + out = self.get_output('-c', code, filename, PYTHONUTF8='1') + self.assertEqual(out, 'UTF-8/strict') + + def _check_io_encoding(self, module, encoding=None, errors=None): + filename = __file__ + + # Encoding explicitly set + args = [] + if encoding: + args.append(f'encoding={encoding!r}') + if errors: + args.append(f'errors={errors!r}') + code = textwrap.dedent(''' + import sys + from %s import open + filename = sys.argv[1] + with open(filename, %s) as fp: + print(f"{fp.encoding}/{fp.errors}") + ''') % (module, ', '.join(args)) + out = self.get_output('-c', code, filename, + PYTHONUTF8='1') + + if not encoding: + encoding = 'UTF-8' + if not errors: + errors = 'strict' + self.assertEqual(out, f'{encoding}/{errors}') + + def check_io_encoding(self, module): + self._check_io_encoding(module, encoding="latin1") + self._check_io_encoding(module, errors="namereplace") + self._check_io_encoding(module, + encoding="latin1", 
errors="namereplace") + + def test_io_encoding(self): + self.check_io_encoding('io') + + def test_io_encoding(self): + self.check_io_encoding('_pyio') + + def test_locale_getpreferredencoding(self): + code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))' + out = self.get_output('-X', 'utf8', '-c', code) + self.assertEqual(out, 'UTF-8 UTF-8') + + out = self.get_output('-X', 'utf8', '-c', code, LC_ALL='C') + self.assertEqual(out, 'UTF-8 UTF-8') + + +if __name__ == "__main__": + unittest.main() diff --git a/Misc/NEWS.d/next/Core and Builtins/2017-12-05-23-10-58.bpo-29240.qpJP5l.rst b/Misc/NEWS.d/next/Core and Builtins/2017-12-05-23-10-58.bpo-29240.qpJP5l.rst new file mode 100644 index 0000000..dbd9d61 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2017-12-05-23-10-58.bpo-29240.qpJP5l.rst @@ -0,0 +1 @@ +Add a new UTF-8 mode: implementation of the :pep:`540`. diff --git a/Modules/_winapi.c b/Modules/_winapi.c index 0a1d139..604c05d 100644 --- a/Modules/_winapi.c +++ b/Modules/_winapi.c @@ -1490,6 +1490,20 @@ _winapi_WriteFile_impl(PyObject *module, HANDLE handle, PyObject *buffer, } +/*[clinic input] +_winapi.GetACP + +Get the current Windows ANSI code page identifier. 
+[clinic start generated code]*/ + +static PyObject * +_winapi_GetACP_impl(PyObject *module) +/*[clinic end generated code: output=f7ee24bf705dbb88 input=1433c96d03a05229]*/ +{ + return PyLong_FromUnsignedLong(GetACP()); +} + + static PyMethodDef winapi_functions[] = { _WINAPI_CLOSEHANDLE_METHODDEF _WINAPI_CONNECTNAMEDPIPE_METHODDEF @@ -1515,6 +1529,7 @@ static PyMethodDef winapi_functions[] = { _WINAPI_WAITFORMULTIPLEOBJECTS_METHODDEF _WINAPI_WAITFORSINGLEOBJECT_METHODDEF _WINAPI_WRITEFILE_METHODDEF + _WINAPI_GETACP_METHODDEF {NULL, NULL} }; @@ -1595,14 +1610,14 @@ PyInit__winapi(void) WINAPI_CONSTANT(F_DWORD, WAIT_OBJECT_0); WINAPI_CONSTANT(F_DWORD, WAIT_ABANDONED_0); WINAPI_CONSTANT(F_DWORD, WAIT_TIMEOUT); - + WINAPI_CONSTANT(F_DWORD, ABOVE_NORMAL_PRIORITY_CLASS); WINAPI_CONSTANT(F_DWORD, BELOW_NORMAL_PRIORITY_CLASS); WINAPI_CONSTANT(F_DWORD, HIGH_PRIORITY_CLASS); WINAPI_CONSTANT(F_DWORD, IDLE_PRIORITY_CLASS); WINAPI_CONSTANT(F_DWORD, NORMAL_PRIORITY_CLASS); WINAPI_CONSTANT(F_DWORD, REALTIME_PRIORITY_CLASS); - + WINAPI_CONSTANT(F_DWORD, CREATE_NO_WINDOW); WINAPI_CONSTANT(F_DWORD, DETACHED_PROCESS); WINAPI_CONSTANT(F_DWORD, CREATE_DEFAULT_ERROR_MODE); diff --git a/Modules/clinic/_winapi.c.h b/Modules/clinic/_winapi.c.h index 01bba36..e5781efb 100644 --- a/Modules/clinic/_winapi.c.h +++ b/Modules/clinic/_winapi.c.h @@ -889,4 +889,22 @@ _winapi_WriteFile(PyObject *module, PyObject **args, Py_ssize_t nargs, PyObject exit: return return_value; } -/*[clinic end generated code: output=fba2ad7bf1a87e4a input=a9049054013a1b77]*/ + +PyDoc_STRVAR(_winapi_GetACP__doc__, +"GetACP($module, /)\n" +"--\n" +"\n" +"Get the current Windows ANSI code page identifier."); + +#define _WINAPI_GETACP_METHODDEF \ + {"GetACP", (PyCFunction)_winapi_GetACP, METH_NOARGS, _winapi_GetACP__doc__}, + +static PyObject * +_winapi_GetACP_impl(PyObject *module); + +static PyObject * +_winapi_GetACP(PyObject *module, PyObject *Py_UNUSED(ignored)) +{ + return _winapi_GetACP_impl(module); +} +/*[clinic 
end generated code: output=fd91c1ec286f0bf3 input=a9049054013a1b77]*/ diff --git a/Modules/main.c b/Modules/main.c index ac8a38c..9ce111c 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -1114,50 +1114,32 @@ pymain_set_argv(_PyMain *pymain) } -static void -pymain_get_flag(int flag, int *value) -{ - if (flag) { - *value = flag; - } -} - -static void -pymain_set_flag(int *flag, int value) -{ - /* Helper to set flag variables from command line options - * - uses the higher of the two values if they're both set - * - otherwise leaves the flag unset - */ - if (*flag < value) { - *flag = value; - } -} - - /* Get Py_xxx global configuration variables */ static void pymain_get_global_config(_PyMain *pymain) { _Py_CommandLineDetails *cmdline = &pymain->cmdline; - pymain_get_flag(Py_BytesWarningFlag, &cmdline->bytes_warning); - pymain_get_flag(Py_DebugFlag, &cmdline->debug); - pymain_get_flag(Py_InspectFlag, &cmdline->inspect); - pymain_get_flag(Py_InteractiveFlag, &cmdline->interactive); - pymain_get_flag(Py_IsolatedFlag, &cmdline->isolated); - pymain_get_flag(Py_OptimizeFlag, &cmdline->optimization_level); - pymain_get_flag(Py_DontWriteBytecodeFlag, &cmdline->dont_write_bytecode); - pymain_get_flag(Py_NoUserSiteDirectory, &cmdline->no_user_site_directory); - pymain_get_flag(Py_NoSiteFlag, &cmdline->no_site_import); - pymain_get_flag(Py_UnbufferedStdioFlag, &cmdline->use_unbuffered_io); - pymain_get_flag(Py_VerboseFlag, &cmdline->verbosity); - pymain_get_flag(Py_QuietFlag, &cmdline->quiet_flag); + + cmdline->bytes_warning = Py_BytesWarningFlag; + cmdline->debug = Py_DebugFlag; + cmdline->inspect = Py_InspectFlag; + cmdline->interactive = Py_InteractiveFlag; + cmdline->isolated = Py_IsolatedFlag; + cmdline->optimization_level = Py_OptimizeFlag; + cmdline->dont_write_bytecode = Py_DontWriteBytecodeFlag; + cmdline->no_user_site_directory = Py_NoUserSiteDirectory; + cmdline->no_site_import = Py_NoSiteFlag; + cmdline->use_unbuffered_io = Py_UnbufferedStdioFlag; + 
cmdline->verbosity = Py_VerboseFlag; + cmdline->quiet_flag = Py_QuietFlag; #ifdef MS_WINDOWS - pymain_get_flag(Py_LegacyWindowsFSEncodingFlag, &cmdline->legacy_windows_fs_encoding); - pymain_get_flag(Py_LegacyWindowsStdioFlag, &cmdline->legacy_windows_stdio); + cmdline->legacy_windows_fs_encoding = Py_LegacyWindowsFSEncodingFlag; + cmdline->legacy_windows_stdio = Py_LegacyWindowsStdioFlag; #endif + cmdline->check_hash_pycs_mode = _Py_CheckHashBasedPycsMode ; - pymain_get_flag(Py_IgnoreEnvironmentFlag, &pymain->core_config.ignore_environment); + pymain->core_config.ignore_environment = Py_IgnoreEnvironmentFlag; + pymain->core_config.utf8_mode = Py_UTF8Mode; } @@ -1166,26 +1148,27 @@ static void pymain_set_global_config(_PyMain *pymain) { _Py_CommandLineDetails *cmdline = &pymain->cmdline; - pymain_set_flag(&Py_BytesWarningFlag, cmdline->bytes_warning); - pymain_set_flag(&Py_DebugFlag, cmdline->debug); - pymain_set_flag(&Py_InspectFlag, cmdline->inspect); - pymain_set_flag(&Py_InteractiveFlag, cmdline->interactive); - pymain_set_flag(&Py_IsolatedFlag, cmdline->isolated); - pymain_set_flag(&Py_OptimizeFlag, cmdline->optimization_level); - pymain_set_flag(&Py_DontWriteBytecodeFlag, cmdline->dont_write_bytecode); - pymain_set_flag(&Py_NoUserSiteDirectory, cmdline->no_user_site_directory); - pymain_set_flag(&Py_NoSiteFlag, cmdline->no_site_import); - pymain_set_flag(&Py_UnbufferedStdioFlag, cmdline->use_unbuffered_io); - pymain_set_flag(&Py_VerboseFlag, cmdline->verbosity); - pymain_set_flag(&Py_QuietFlag, cmdline->quiet_flag); - if (cmdline->check_hash_pycs_mode) - _Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode; + + Py_BytesWarningFlag = cmdline->bytes_warning; + Py_DebugFlag = cmdline->debug; + Py_InspectFlag = cmdline->inspect; + Py_InteractiveFlag = cmdline->interactive; + Py_IsolatedFlag = cmdline->isolated; + Py_OptimizeFlag = cmdline->optimization_level; + Py_DontWriteBytecodeFlag = cmdline->dont_write_bytecode; + Py_NoUserSiteDirectory = 
cmdline->no_user_site_directory; + Py_NoSiteFlag = cmdline->no_site_import; + Py_UnbufferedStdioFlag = cmdline->use_unbuffered_io; + Py_VerboseFlag = cmdline->verbosity; + Py_QuietFlag = cmdline->quiet_flag; + _Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode; #ifdef MS_WINDOWS - pymain_set_flag(&Py_LegacyWindowsFSEncodingFlag, cmdline->legacy_windows_fs_encoding); - pymain_set_flag(&Py_LegacyWindowsStdioFlag, cmdline->legacy_windows_stdio); + Py_LegacyWindowsFSEncodingFlag = cmdline->legacy_windows_fs_encoding; + Py_LegacyWindowsStdioFlag = cmdline->legacy_windows_stdio; #endif - pymain_set_flag(&Py_IgnoreEnvironmentFlag, pymain->core_config.ignore_environment); + Py_IgnoreEnvironmentFlag = pymain->core_config.ignore_environment; + Py_UTF8Mode = pymain->core_config.utf8_mode; } @@ -1609,6 +1592,57 @@ _PyMainInterpreterConfig_ReadEnv(_PyMainInterpreterConfig *config) } +static int +pymain_init_utf8_mode(_PyMain *pymain) +{ + _PyCoreConfig *core_config = &pymain->core_config; + +#ifdef MS_WINDOWS + if (pymain->cmdline.legacy_windows_fs_encoding) { + core_config->utf8_mode = 0; + return 0; + } +#endif + + wchar_t *xopt = pymain_get_xoption(pymain, L"utf8"); + if (xopt) { + wchar_t *sep = wcschr(xopt, L'='); + if (sep) { + xopt = sep + 1; + if (wcscmp(xopt, L"1") == 0) { + core_config->utf8_mode = 1; + } + else if (wcscmp(xopt, L"0") == 0) { + core_config->utf8_mode = 0; + } + else { + pymain->err = _Py_INIT_USER_ERR("invalid -X utf8 option value"); + return -1; + } + } + else { + core_config->utf8_mode = 1; + } + return 0; + } + + char *opt = pymain_get_env_var("PYTHONUTF8"); + if (opt) { + if (strcmp(opt, "1") == 0) { + core_config->utf8_mode = 1; + } + else if (strcmp(opt, "0") == 0) { + core_config->utf8_mode = 0; + } + else { + pymain->err = _Py_INIT_USER_ERR("invalid PYTHONUTF8 environment " + "variable value"); + return -1; + } + return 0; + } + return 0; +} static int @@ -1674,6 +1708,9 @@ pymain_parse_envvars(_PyMain *pymain) 
pymain->core_config.malloc_stats = 1; } + if (pymain_init_utf8_mode(pymain) < 0) { + return -1; + } return 0; } @@ -1702,6 +1739,7 @@ pymain_parse_cmdline_envvars_impl(_PyMain *pymain) if (pymain_parse_envvars(pymain) < 0) { return -1; } + /* FIXME: if utf8_mode value changed, parse again cmdline */ _PyInitError err = _PyMainInterpreterConfig_Read(&pymain->config); if (_Py_INIT_FAILED(err)) { @@ -1730,6 +1768,7 @@ pymain_parse_cmdline_envvars(_PyMain *pymain) static int pymain_init_python(_PyMain *pymain) { + pymain_set_global_config(pymain); pymain_init_stdio(pymain); @@ -1788,6 +1827,7 @@ pymain_init(_PyMain *pymain) return -1; } + pymain->core_config.utf8_mode = Py_UTF8Mode; pymain->core_config._disable_importlib = 0; pymain->config.install_signal_handlers = 1; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8d4fea8..c7480a0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5079,16 +5079,17 @@ onError: return NULL; } -#if defined(__APPLE__) || defined(__ANDROID__) -/* Simplified UTF-8 decoder using surrogateescape error handler, - used to decode the command line arguments on Mac OS X and Android. +/* UTF-8 decoder using the surrogateescape error handler. - Return a pointer to a newly allocated wide character string (use - PyMem_RawFree() to free the memory), or NULL on memory allocation error. */ + On success, return a pointer to a newly allocated wide character string (use + PyMem_RawFree() to free the memory) and write the output length (in number + of wchar_t units) into *p_wlen (if p_wlen is set). + On memory allocation failure, return NULL and write (size_t)-1 into *p_wlen + (if p_wlen is set). 
*/ wchar_t* -_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) +_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) { const char *e; wchar_t *unicode; @@ -5096,11 +5097,20 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) /* Note: size will always be longer than the resulting Unicode character count */ - if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) + if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { + if (p_wlen) { + *p_wlen = (size_t)-1; + } return NULL; + } + unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); - if (!unicode) + if (!unicode) { + if (p_wlen) { + *p_wlen = (size_t)-1; + } return NULL; + } /* Unpack UTF-8 encoded data */ e = s + size; @@ -5130,10 +5140,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) } } unicode[outpos] = L'\0'; + if (p_wlen) { + *p_wlen = outpos; + } return unicode; } -#endif /* __APPLE__ or __ANDROID__ */ /* Primary internal function which creates utf8 encoded bytes objects. 
diff --git a/Programs/python.c b/Programs/python.c index 22d55bb..aef7122 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -17,6 +17,15 @@ wmain(int argc, wchar_t **argv) #else +static void _Py_NO_RETURN +fatal_error(const char *msg) +{ + fprintf(stderr, "Fatal Python error: %s\n", msg); + fflush(stderr); + exit(1); +} + + int main(int argc, char **argv) { @@ -28,9 +37,7 @@ main(int argc, char **argv) _PyInitError err = _PyRuntime_Initialize(); if (_Py_INIT_FAILED(err)) { - fprintf(stderr, "Fatal Python error: %s\n", err.msg); - fflush(stderr); - exit(1); + fatal_error(err.msg); } /* Force default allocator, to be able to release memory above @@ -40,7 +47,7 @@ main(int argc, char **argv) argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); if (!argv_copy || !argv_copy2) { - fprintf(stderr, "out of memory\n"); + fatal_error("out of memory"); return 1; } @@ -55,7 +62,7 @@ main(int argc, char **argv) oldloc = _PyMem_RawStrdup(setlocale(LC_ALL, NULL)); if (!oldloc) { - fprintf(stderr, "out of memory\n"); + fatal_error("out of memory"); return 1; } @@ -73,6 +80,7 @@ main(int argc, char **argv) * details. 
*/ if (_Py_LegacyLocaleDetected()) { + Py_UTF8Mode = 1; _Py_CoerceLegacyLocale(); } @@ -81,10 +89,7 @@ main(int argc, char **argv) argv_copy[i] = Py_DecodeLocale(argv[i], NULL); if (!argv_copy[i]) { PyMem_RawFree(oldloc); - fprintf(stderr, "Fatal Python error: " - "unable to decode the command line argument #%i\n", - i + 1); - return 1; + fatal_error("unable to decode the command line arguments"); } argv_copy2[i] = argv_copy[i]; } diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c index 81774dc..23d7aa4 100644 --- a/Python/bltinmodule.c +++ b/Python/bltinmodule.c @@ -29,6 +29,9 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */ int Py_HasFileSystemDefaultEncoding = 0; #endif const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape"; +/* UTF-8 mode (PEP 540): if non-zero, use the UTF-8 encoding, and change stdin + and stdout error handler to "surrogateescape". */ +int Py_UTF8Mode = 0; _Py_IDENTIFIER(__builtins__); _Py_IDENTIFIER(__dict__); diff --git a/Python/fileutils.c b/Python/fileutils.c index eab58c5..03cc379 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -20,9 +20,8 @@ extern int winerror_to_errno(int); #include #endif /* HAVE_FCNTL_H */ -#if defined(__APPLE__) || defined(__ANDROID__) -extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size); -#endif +extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, + size_t *p_wlen); #ifdef O_CLOEXEC /* Does open() support the O_CLOEXEC flag? Possible values: @@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size) } #endif - -/* Decode a byte string from the locale encoding with the - surrogateescape error handler: undecodable bytes are decoded as characters - in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate - character, escape the bytes using the surrogateescape error handler instead - of decoding them. 
- - Return a pointer to a newly allocated wide character string, use - PyMem_RawFree() to free the memory. If size is not NULL, write the number of - wide characters excluding the null character into *size - - Return NULL on decoding error or memory allocation error. If *size* is not - NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on - decoding error. - - Decoding errors should never happen, unless there is a bug in the C - library. - - Use the Py_EncodeLocale() function to encode the character string back to a - byte string. */ -wchar_t* -Py_DecodeLocale(const char* arg, size_t *size) +static wchar_t* +decode_locale(const char* arg, size_t *size) { -#if defined(__APPLE__) || defined(__ANDROID__) - wchar_t *wstr; - wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg)); - if (size != NULL) { - if (wstr != NULL) - *size = wcslen(wstr); - else - *size = (size_t)-1; - } - return wstr; -#else wchar_t *res; size_t argsize; size_t count; @@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size) mbstate_t mbs; #endif -#ifndef MS_WINDOWS - if (force_ascii == -1) - force_ascii = check_force_ascii(); - - if (force_ascii) { - /* force ASCII encoding to workaround mbstowcs() issue */ - res = decode_ascii_surrogateescape(arg, size); - if (res == NULL) - goto oom; - return res; - } -#endif - #ifdef HAVE_BROKEN_MBSTOWCS /* Some platforms have a broken implementation of * mbstowcs which does not count the characters that @@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size) goto oom; #endif /* HAVE_MBRTOWC */ return res; + oom: - if (size != NULL) + if (size != NULL) { *size = (size_t)-1; + } return NULL; -#endif /* __APPLE__ or __ANDROID__ */ } -/* Encode a wide character string to the locale encoding with the - surrogateescape error handler: surrogate characters in the range - U+DC80..U+DCFF are converted to bytes 0x80..0xFF. - Return a pointer to a newly allocated byte string, use PyMem_Free() to free - the memory. 
Return NULL on encoding or memory allocation error. +/* Decode a byte string from the locale encoding with the + surrogateescape error handler: undecodable bytes are decoded as characters + in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate + character, escape the bytes using the surrogateescape error handler instead + of decoding them. - If error_pos is not NULL, *error_pos is set to the index of the invalid - character on encoding error, or set to (size_t)-1 otherwise. + Return a pointer to a newly allocated wide character string, use + PyMem_RawFree() to free the memory. If size is not NULL, write the number of + wide characters excluding the null character into *size - Use the Py_DecodeLocale() function to decode the bytes string back to a wide - character string. */ -char* -Py_EncodeLocale(const wchar_t *text, size_t *error_pos) + Return NULL on decoding error or memory allocation error. If *size* is not + NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on + decoding error. + + Decoding errors should never happen, unless there is a bug in the C + library. + + Use the Py_EncodeLocale() function to encode the character string back to a + byte string. 
*/ +wchar_t* +Py_DecodeLocale(const char* arg, size_t *size) { #if defined(__APPLE__) || defined(__ANDROID__) + return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); +#else + if (Py_UTF8Mode) { + return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); + } + +#ifndef MS_WINDOWS + if (force_ascii == -1) + force_ascii = check_force_ascii(); + + if (force_ascii) { + /* force ASCII encoding to workaround mbstowcs() issue */ + wchar_t *wstr = decode_ascii_surrogateescape(arg, size); + if (wstr == NULL) { + if (size != NULL) { + *size = (size_t)-1; + } + return NULL; + } + return wstr; + } +#endif + + return decode_locale(arg, size); +#endif /* __APPLE__ or __ANDROID__ */ +} + +static char* +_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos) +{ Py_ssize_t len; PyObject *unicode, *bytes = NULL; char *cpath; unicode = PyUnicode_FromWideChar(text, wcslen(text)); - if (unicode == NULL) + if (unicode == NULL) { return NULL; + } bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape"); Py_DECREF(unicode); if (bytes == NULL) { PyErr_Clear(); - if (error_pos != NULL) + if (error_pos != NULL) { *error_pos = (size_t)-1; + } return NULL; } @@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos) if (cpath == NULL) { PyErr_Clear(); Py_DECREF(bytes); - if (error_pos != NULL) + if (error_pos != NULL) { *error_pos = (size_t)-1; + } return NULL; } memcpy(cpath, PyBytes_AsString(bytes), len + 1); Py_DECREF(bytes); return cpath; -#else /* __APPLE__ */ +} + +static char* +encode_locale(const wchar_t *text, size_t *error_pos) +{ const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; -#ifndef MS_WINDOWS - if (force_ascii == -1) - force_ascii = check_force_ascii(); - - if (force_ascii) - return encode_ascii_surrogateescape(text, error_pos); -#endif - /* The function works in two steps: 1. compute the length of the output buffer in bytes (size) 2. 
outputs the bytes */ @@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos) bytes = result; } return result; +} + +/* Encode a wide character string to the locale encoding with the + surrogateescape error handler: surrogate characters in the range + U+DC80..U+DCFF are converted to bytes 0x80..0xFF. + + Return a pointer to a newly allocated byte string, use PyMem_Free() to free + the memory. Return NULL on encoding or memory allocation error. + + If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set + to the index of the invalid character on encoding error. + + Use the Py_DecodeLocale() function to decode the bytes string back to a wide + character string. */ +char* +Py_EncodeLocale(const wchar_t *text, size_t *error_pos) +{ +#if defined(__APPLE__) || defined(__ANDROID__) + return _Py_EncodeLocaleUTF8(text, error_pos); +#else /* __APPLE__ */ + if (Py_UTF8Mode) { + return _Py_EncodeLocaleUTF8(text, error_pos); + } + +#ifndef MS_WINDOWS + if (force_ascii == -1) + force_ascii = check_force_ascii(); + + if (force_ascii) + return encode_ascii_surrogateescape(text, error_pos); +#endif + + return encode_locale(text, error_pos); #endif /* __APPLE__ or __ANDROID__ */ } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index f284855..2bac23d 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -54,7 +54,7 @@ extern grammar _PyParser_Grammar; /* From graminit.c */ static _PyInitError add_main_module(PyInterpreterState *interp); static _PyInitError initfsencoding(PyInterpreterState *interp); static _PyInitError initsite(void); -static _PyInitError init_sys_streams(void); +static _PyInitError init_sys_streams(PyInterpreterState *interp); static _PyInitError initsigs(void); static void call_py_exitfuncs(void); static void wait_for_thread_shutdown(void); @@ -925,7 +925,7 @@ _Py_InitializeMainInterpreter(const _PyMainInterpreterConfig *config) return err; } - err = init_sys_streams(); + err = init_sys_streams(interp); 
if (_Py_INIT_FAILED(err)) { return err; } @@ -1410,7 +1410,7 @@ new_interpreter(PyThreadState **tstate_p) return err; } - err = init_sys_streams(); + err = init_sys_streams(interp); if (_Py_INIT_FAILED(err)) { return err; } @@ -1558,7 +1558,13 @@ initfsencoding(PyInterpreterState *interp) Py_FileSystemDefaultEncodeErrors = "surrogatepass"; } #else - if (Py_FileSystemDefaultEncoding == NULL) { + if (Py_FileSystemDefaultEncoding == NULL && + interp->core_config.utf8_mode) + { + Py_FileSystemDefaultEncoding = "utf-8"; + Py_HasFileSystemDefaultEncoding = 1; + } + else if (Py_FileSystemDefaultEncoding == NULL) { Py_FileSystemDefaultEncoding = get_locale_encoding(); if (Py_FileSystemDefaultEncoding == NULL) { return _Py_INIT_ERR("Unable to get the locale encoding"); @@ -1749,7 +1755,7 @@ error: /* Initialize sys.stdin, stdout, stderr and builtins.open */ static _PyInitError -init_sys_streams(void) +init_sys_streams(PyInterpreterState *interp) { PyObject *iomod = NULL, *wrapper; PyObject *bimod = NULL; @@ -1794,10 +1800,10 @@ init_sys_streams(void) encoding = _Py_StandardStreamEncoding; errors = _Py_StandardStreamErrors; if (!encoding || !errors) { - pythonioencoding = Py_GETENV("PYTHONIOENCODING"); - if (pythonioencoding) { + char *opt = Py_GETENV("PYTHONIOENCODING"); + if (opt && opt[0] != '\0') { char *err; - pythonioencoding = _PyMem_Strdup(pythonioencoding); + pythonioencoding = _PyMem_Strdup(opt); if (pythonioencoding == NULL) { PyErr_NoMemory(); goto error; @@ -1814,7 +1820,12 @@ init_sys_streams(void) encoding = pythonioencoding; } } - if (!errors && !(pythonioencoding && *pythonioencoding)) { + else if (interp->core_config.utf8_mode) { + encoding = "utf-8"; + errors = "surrogateescape"; + } + + if (!errors && !pythonioencoding) { /* Choose the default error handler based on the current locale */ errors = get_default_standard_stream_error_handler(); } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index f10099b..141e189 100644 --- a/Python/sysmodule.c +++ 
b/Python/sysmodule.c @@ -1814,6 +1814,7 @@ static PyStructSequence_Field flags_fields[] = { {"hash_randomization", "-R"}, {"isolated", "-I"}, {"dev_mode", "-X dev"}, + {"utf8_mode", "-X utf8"}, {0} }; @@ -1821,7 +1822,7 @@ static PyStructSequence_Desc flags_desc = { "sys.flags", /* name */ flags__doc__, /* doc */ flags_fields, /* fields */ - 14 + 15 }; static PyObject* @@ -1853,8 +1854,9 @@ make_flags(void) SetFlag(Py_QuietFlag); SetFlag(Py_HashRandomizationFlag); SetFlag(Py_IsolatedFlag); -#undef SetFlag PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(core_config->dev_mode)); + SetFlag(Py_UTF8Mode); +#undef SetFlag if (PyErr_Occurred()) { Py_DECREF(seq); -- cgit v0.12