diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2017-12-13 11:29:09 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-12-13 11:29:09 (GMT) |
commit | 91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch) | |
tree | ff002e0532736a97f3ddd367c1491e7b04611816 /Lib | |
parent | c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff) | |
download | cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2 |
bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)
* Add -X utf8 command line option, PYTHONUTF8 environment variable
and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup, automatically enable the
UTF-8 mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls
_winapi.GetACP() to get the ANSI code page.
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8
mode. As a side effect, open() now uses the UTF-8 encoding by
default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding
in the UTF-8 Mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8.
* Skip some tests relying on the current locale if the UTF-8 mode is
enabled.
* Add test_utf8_mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to
also return the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now
always copy flag values, rather than only copying if the new value
is greater than the old value.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/_bootlocale.py | 6 | ||||
-rw-r--r-- | Lib/encodings/__init__.py | 5 | ||||
-rw-r--r-- | Lib/locale.py | 6 | ||||
-rw-r--r-- | Lib/subprocess.py | 2 | ||||
-rw-r--r-- | Lib/test/test_builtin.py | 1 | ||||
-rw-r--r-- | Lib/test/test_c_locale_coercion.py | 2 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 10 | ||||
-rw-r--r-- | Lib/test/test_io.py | 2 | ||||
-rw-r--r-- | Lib/test/test_sys.py | 8 | ||||
-rw-r--r-- | Lib/test/test_utf8_mode.py | 206 |
10 files changed, 233 insertions, 15 deletions
diff --git a/Lib/_bootlocale.py b/Lib/_bootlocale.py index 0c61b0d..3273a3b 100644 --- a/Lib/_bootlocale.py +++ b/Lib/_bootlocale.py @@ -9,6 +9,8 @@ import _locale if sys.platform.startswith("win"): def getpreferredencoding(do_setlocale=True): + if sys.flags.utf8_mode: + return 'UTF-8' return _locale._getdefaultlocale()[1] else: try: @@ -21,6 +23,8 @@ else: return 'UTF-8' else: def getpreferredencoding(do_setlocale=True): + if sys.flags.utf8_mode: + return 'UTF-8' # This path for legacy systems needs the more complex # getdefaultlocale() function, import the full locale module. import locale @@ -28,6 +32,8 @@ else: else: def getpreferredencoding(do_setlocale=True): assert not do_setlocale + if sys.flags.utf8_mode: + return 'UTF-8' result = _locale.nl_langinfo(_locale.CODESET) if not result and sys.platform == 'darwin': # nl_langinfo can return an empty string diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index aa2fb7c..025b7a8 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -158,8 +158,9 @@ codecs.register(search_function) if sys.platform == 'win32': def _alias_mbcs(encoding): try: - import _bootlocale - if encoding == _bootlocale.getpreferredencoding(False): + import _winapi + ansi_code_page = "cp%s" % _winapi.GetACP() + if encoding == ansi_code_page: import encodings.mbcs return encodings.mbcs.getregentry() except ImportError: diff --git a/Lib/locale.py b/Lib/locale.py index f1d157d..18079e7 100644 --- a/Lib/locale.py +++ b/Lib/locale.py @@ -617,6 +617,8 @@ if sys.platform.startswith("win"): # On Win32, this will return the ANSI code page def getpreferredencoding(do_setlocale = True): """Return the charset that the user is likely using.""" + if sys.flags.utf8_mode: + return 'UTF-8' import _bootlocale return _bootlocale.getpreferredencoding(False) else: @@ -634,6 +636,8 @@ else: def getpreferredencoding(do_setlocale = True): """Return the charset that the user is likely using, by looking at environment variables.""" + 
if sys.flags.utf8_mode: + return 'UTF-8' res = getdefaultlocale()[1] if res is None: # LANG not set, default conservatively to ASCII @@ -643,6 +647,8 @@ else: def getpreferredencoding(do_setlocale = True): """Return the charset that the user is likely using, according to the system configuration.""" + if sys.flags.utf8_mode: + return 'UTF-8' import _bootlocale if do_setlocale: oldloc = setlocale(LC_CTYPE) diff --git a/Lib/subprocess.py b/Lib/subprocess.py index 301433c..65b4086 100644 --- a/Lib/subprocess.py +++ b/Lib/subprocess.py @@ -280,7 +280,7 @@ def _args_from_interpreter_flags(): if dev_mode: args.extend(('-X', 'dev')) for opt in ('faulthandler', 'tracemalloc', 'importtime', - 'showalloccount', 'showrefcount'): + 'showalloccount', 'showrefcount', 'utf8'): if opt in xoptions: value = xoptions[opt] if value is True: diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py index 0a61c05..9329318 100644 --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -1022,6 +1022,7 @@ class BuiltinTest(unittest.TestCase): self.assertRaises(ValueError, open, 'a\x00b') self.assertRaises(ValueError, open, b'a\x00b') + @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled") def test_open_default_encoding(self): old_environ = dict(os.environ) try: diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index 2a22739..c0845d7 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -130,7 +130,7 @@ class EncodingDetails(_EncodingDetails): that. 
""" result, py_cmd = run_python_until_end( - "-c", cls.CHILD_PROCESS_SCRIPT, + "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT, __isolated=True, **env_vars ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index eb21a39..a59a5e2 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -5,6 +5,7 @@ import locale import sys import unittest import encodings +from unittest import mock from test import support @@ -3180,16 +3181,9 @@ class CodePageTest(unittest.TestCase): def test_mbcs_alias(self): # Check that looking up our 'default' codepage will return # mbcs when we don't have a more specific one available - import _bootlocale - def _get_fake_codepage(*a): - return 'cp123' - old_getpreferredencoding = _bootlocale.getpreferredencoding - _bootlocale.getpreferredencoding = _get_fake_codepage - try: + with mock.patch('_winapi.GetACP', return_value=123): codec = codecs.lookup('cp123') self.assertEqual(codec.name, 'mbcs') - finally: - _bootlocale.getpreferredencoding = old_getpreferredencoding class ASCIITest(unittest.TestCase): diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 6bb4127..6674831 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -2580,6 +2580,7 @@ class TextIOWrapperTest(unittest.TestCase): t.reconfigure(line_buffering=None) self.assertEqual(t.line_buffering, True) + @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled") def test_default_encoding(self): old_environ = dict(os.environ) try: @@ -2599,6 +2600,7 @@ class TextIOWrapperTest(unittest.TestCase): os.environ.update(old_environ) @support.cpython_only + @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled") def test_device_encoding(self): # Issue 15989 import _testcapi diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 6346094..6933b41 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -527,7 +527,7 @@ class SysModuleTest(unittest.TestCase): "inspect", "interactive", "optimize", "dont_write_bytecode", 
"no_user_site", "no_site", "ignore_environment", "verbose", "bytes_warning", "quiet", "hash_randomization", "isolated", - "dev_mode") + "dev_mode", "utf8_mode") for attr in attrs: self.assertTrue(hasattr(sys.flags, attr), attr) attr_type = bool if attr == "dev_mode" else int @@ -535,6 +535,8 @@ class SysModuleTest(unittest.TestCase): self.assertTrue(repr(sys.flags)) self.assertEqual(len(sys.flags), len(attrs)) + self.assertIn(sys.flags.utf8_mode, {0, 1, 2}) + def assert_raise_on_new_sys_type(self, sys_attr): # Users are intentionally prevented from creating new instances of # sys.flags, sys.version_info, and sys.getwindowsversion. @@ -710,8 +712,8 @@ class SysModuleTest(unittest.TestCase): # have no any effect out = self.c_locale_get_error_handler(encoding=':') self.assertEqual(out, - 'stdin: surrogateescape\n' - 'stdout: surrogateescape\n' + 'stdin: strict\n' + 'stdout: strict\n' 'stderr: backslashreplace\n') out = self.c_locale_get_error_handler(encoding='') self.assertEqual(out, diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py new file mode 100644 index 0000000..275a6ea --- /dev/null +++ b/Lib/test/test_utf8_mode.py @@ -0,0 +1,206 @@ +""" +Test the implementation of the PEP 540: the UTF-8 Mode. 
+""" + +import locale +import os +import sys +import textwrap +import unittest +from test.support.script_helper import assert_python_ok, assert_python_failure + + +MS_WINDOWS = (sys.platform == 'win32') + + +class UTF8ModeTests(unittest.TestCase): + # Override PYTHONUTF8 and PYTHONLEGACYWINDOWSFSENCODING environment + # variables by default + DEFAULT_ENV = {'PYTHONUTF8': '', 'PYTHONLEGACYWINDOWSFSENCODING': ''} + + def posix_locale(self): + loc = locale.setlocale(locale.LC_CTYPE, None) + return (loc == 'C') + + def get_output(self, *args, failure=False, **kw): + kw = dict(self.DEFAULT_ENV, **kw) + if failure: + out = assert_python_failure(*args, **kw) + out = out[2] + else: + out = assert_python_ok(*args, **kw) + out = out[1] + return out.decode().rstrip("\n\r") + + @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale') + def test_posix_locale(self): + code = 'import sys; print(sys.flags.utf8_mode)' + + out = self.get_output('-c', code, LC_ALL='C') + self.assertEqual(out, '1') + + def test_xoption(self): + code = 'import sys; print(sys.flags.utf8_mode)' + + out = self.get_output('-X', 'utf8', '-c', code) + self.assertEqual(out, '1') + + # undocumented but accepted syntax: -X utf8=1 + out = self.get_output('-X', 'utf8=1', '-c', code) + self.assertEqual(out, '1') + + out = self.get_output('-X', 'utf8=0', '-c', code) + self.assertEqual(out, '0') + + if MS_WINDOWS: + # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 + # and has the priority over -X utf8 + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONLEGACYWINDOWSFSENCODING='1') + self.assertEqual(out, '0') + + def test_env_var(self): + code = 'import sys; print(sys.flags.utf8_mode)' + + out = self.get_output('-c', code, PYTHONUTF8='1') + self.assertEqual(out, '1') + + out = self.get_output('-c', code, PYTHONUTF8='0') + self.assertEqual(out, '0') + + # -X utf8 has the priority over PYTHONUTF8 + out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1') + self.assertEqual(out, '0') + + if 
MS_WINDOWS: + # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode + # and has the priority over PYTHONUTF8 + out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1', + PYTHONLEGACYWINDOWSFSENCODING='1') + self.assertEqual(out, '0') + + # Cannot test with the POSIX locale, since the POSIX locale enables + # the UTF-8 mode + if not self.posix_locale(): + # PYTHONUTF8 should be ignored if -E is used + out = self.get_output('-E', '-c', code, PYTHONUTF8='1') + self.assertEqual(out, '0') + + # invalid mode + out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True) + self.assertIn('invalid PYTHONUTF8 environment variable value', + out.rstrip()) + + def test_filesystemencoding(self): + code = textwrap.dedent(''' + import sys + print("{}/{}".format(sys.getfilesystemencoding(), + sys.getfilesystemencodeerrors())) + ''') + + if MS_WINDOWS: + expected = 'utf-8/surrogatepass' + else: + expected = 'utf-8/surrogateescape' + + out = self.get_output('-X', 'utf8', '-c', code) + self.assertEqual(out, expected) + + if MS_WINDOWS: + # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode + # and has the priority over -X utf8 and PYTHONUTF8 + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONUTF8='strict', + PYTHONLEGACYWINDOWSFSENCODING='1') + self.assertEqual(out, 'mbcs/replace') + + def test_stdio(self): + code = textwrap.dedent(''' + import sys + print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}") + print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}") + print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}") + ''') + + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONIOENCODING='') + self.assertEqual(out.splitlines(), + ['stdin: utf-8/surrogateescape', + 'stdout: utf-8/surrogateescape', + 'stderr: utf-8/backslashreplace']) + + # PYTHONIOENCODING has the priority over PYTHONUTF8 + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONIOENCODING="latin1") + self.assertEqual(out.splitlines(), + ['stdin: latin1/strict', + 'stdout: 
latin1/strict', + 'stderr: latin1/backslashreplace']) + + out = self.get_output('-X', 'utf8', '-c', code, + PYTHONIOENCODING=":namereplace") + self.assertEqual(out.splitlines(), + ['stdin: UTF-8/namereplace', + 'stdout: UTF-8/namereplace', + 'stderr: UTF-8/backslashreplace']) + + def test_io(self): + code = textwrap.dedent(''' + import sys + filename = sys.argv[1] + with open(filename) as fp: + print(f"{fp.encoding}/{fp.errors}") + ''') + filename = __file__ + + out = self.get_output('-c', code, filename, PYTHONUTF8='1') + self.assertEqual(out, 'UTF-8/strict') + + def _check_io_encoding(self, module, encoding=None, errors=None): + filename = __file__ + + # Encoding explicitly set + args = [] + if encoding: + args.append(f'encoding={encoding!r}') + if errors: + args.append(f'errors={errors!r}') + code = textwrap.dedent(''' + import sys + from %s import open + filename = sys.argv[1] + with open(filename, %s) as fp: + print(f"{fp.encoding}/{fp.errors}") + ''') % (module, ', '.join(args)) + out = self.get_output('-c', code, filename, + PYTHONUTF8='1') + + if not encoding: + encoding = 'UTF-8' + if not errors: + errors = 'strict' + self.assertEqual(out, f'{encoding}/{errors}') + + def check_io_encoding(self, module): + self._check_io_encoding(module, encoding="latin1") + self._check_io_encoding(module, errors="namereplace") + self._check_io_encoding(module, + encoding="latin1", errors="namereplace") + + def test_io_encoding(self): + self.check_io_encoding('io') + + def test_io_encoding(self): + self.check_io_encoding('_pyio') + + def test_locale_getpreferredencoding(self): + code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))' + out = self.get_output('-X', 'utf8', '-c', code) + self.assertEqual(out, 'UTF-8 UTF-8') + + out = self.get_output('-X', 'utf8', '-c', code, LC_ALL='C') + self.assertEqual(out, 'UTF-8 UTF-8') + + +if __name__ == "__main__": + unittest.main() |