bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)

* Add -X utf8 command line option, PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag. * If the LC_CTYPE locale is "C" at startup: automatically enable the UTF-8 mode. * Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page. * locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode. * Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 mode. * Update subprocess._args_from_interpreter_flags() to handle -X utf8. * Skip some tests relying on the current locale if the UTF-8 mode is enabled. * Add test_utf8_mode.py. * _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to also return the length (number of wide characters). * pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value.
author: Victor Stinner <victor.stinner@gmail.com> 2017-12-13 11:29:09 (GMT)
committer: GitHub <noreply@github.com> 2017-12-13 11:29:09 (GMT)
commit: 91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch)
tree: ff002e0532736a97f3ddd367c1491e7b04611816 /Lib
parent: c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff)
download: cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2
10 files changed, 233 insertions, 15 deletions
diff --git a/Lib/_bootlocale.py b/Lib/_bootlocale.py
index 0c61b0d..3273a3b 100644
--- a/Lib/_bootlocale.py
+++ b/Lib/_bootlocale.py
@@ -9,6 +9,8 @@ import _locale
 
 if sys.platform.startswith("win"):
     def getpreferredencoding(do_setlocale=True):
+        if sys.flags.utf8_mode:
+            return 'UTF-8'
         return _locale._getdefaultlocale()[1]
 else:
     try:
@@ -21,6 +23,8 @@ else:
                 return 'UTF-8'
         else:
             def getpreferredencoding(do_setlocale=True):
+                if sys.flags.utf8_mode:
+                    return 'UTF-8'
                 # This path for legacy systems needs the more complex
                 # getdefaultlocale() function, import the full locale module.
                 import locale
@@ -28,6 +32,8 @@ else:
     else:
         def getpreferredencoding(do_setlocale=True):
             assert not do_setlocale
+            if sys.flags.utf8_mode:
+                return 'UTF-8'
             result = _locale.nl_langinfo(_locale.CODESET)
             if not result and sys.platform == 'darwin':
                 # nl_langinfo can return an empty string
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index aa2fb7c..025b7a8 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -158,8 +158,9 @@ codecs.register(search_function)
 if sys.platform == 'win32':
     def _alias_mbcs(encoding):
         try:
-            import _bootlocale
-            if encoding == _bootlocale.getpreferredencoding(False):
+            import _winapi
+            ansi_code_page = "cp%s" % _winapi.GetACP()
+            if encoding == ansi_code_page:
                 import encodings.mbcs
                 return encodings.mbcs.getregentry()
         except ImportError:
diff --git a/Lib/locale.py b/Lib/locale.py
index f1d157d..18079e7 100644
--- a/Lib/locale.py
+++ b/Lib/locale.py
@@ -617,6 +617,8 @@ if sys.platform.startswith("win"):
     # On Win32, this will return the ANSI code page
     def getpreferredencoding(do_setlocale = True):
         """Return the charset that the user is likely using."""
+        if sys.flags.utf8_mode:
+            return 'UTF-8'
         import _bootlocale
         return _bootlocale.getpreferredencoding(False)
 else:
@@ -634,6 +636,8 @@ else:
             def getpreferredencoding(do_setlocale = True):
                 """Return the charset that the user is likely using,
                 by looking at environment variables."""
+                if sys.flags.utf8_mode:
+                    return 'UTF-8'
                 res = getdefaultlocale()[1]
                 if res is None:
                     # LANG not set, default conservatively to ASCII
@@ -643,6 +647,8 @@ else:
         def getpreferredencoding(do_setlocale = True):
             """Return the charset that the user is likely using,
             according to the system configuration."""
+            if sys.flags.utf8_mode:
+                return 'UTF-8'
             import _bootlocale
             if do_setlocale:
                 oldloc = setlocale(LC_CTYPE)
diff --git a/Lib/subprocess.py b/Lib/subprocess.py
index 301433c..65b4086 100644
--- a/Lib/subprocess.py
+++ b/Lib/subprocess.py
@@ -280,7 +280,7 @@ def _args_from_interpreter_flags():
     if dev_mode:
         args.extend(('-X', 'dev'))
     for opt in ('faulthandler', 'tracemalloc', 'importtime',
-                'showalloccount', 'showrefcount'):
+                'showalloccount', 'showrefcount', 'utf8'):
         if opt in xoptions:
             value = xoptions[opt]
             if value is True:
diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py
index 0a61c05..9329318 100644
--- a/Lib/test/test_builtin.py
+++ b/Lib/test/test_builtin.py
@@ -1022,6 +1022,7 @@ class BuiltinTest(unittest.TestCase):
         self.assertRaises(ValueError, open, 'a\x00b')
         self.assertRaises(ValueError, open, b'a\x00b')
 
+    @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
     def test_open_default_encoding(self):
         old_environ = dict(os.environ)
         try:
diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py
index 2a22739..c0845d7 100644
--- a/Lib/test/test_c_locale_coercion.py
+++ b/Lib/test/test_c_locale_coercion.py
@@ -130,7 +130,7 @@ class EncodingDetails(_EncodingDetails):
         that.
         """
         result, py_cmd = run_python_until_end(
-            "-c", cls.CHILD_PROCESS_SCRIPT,
+            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
             __isolated=True,
             **env_vars
         )
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index eb21a39..a59a5e2 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -5,6 +5,7 @@ import locale
 import sys
 import unittest
 import encodings
+from unittest import mock
 
 from test import support
 
@@ -3180,16 +3181,9 @@ class CodePageTest(unittest.TestCase):
     def test_mbcs_alias(self):
         # Check that looking up our 'default' codepage will return
         # mbcs when we don't have a more specific one available
-        import _bootlocale
-        def _get_fake_codepage(*a):
-            return 'cp123'
-        old_getpreferredencoding = _bootlocale.getpreferredencoding
-        _bootlocale.getpreferredencoding = _get_fake_codepage
-        try:
+        with mock.patch('_winapi.GetACP', return_value=123):
             codec = codecs.lookup('cp123')
             self.assertEqual(codec.name, 'mbcs')
-        finally:
-            _bootlocale.getpreferredencoding = old_getpreferredencoding
 
 
 class ASCIITest(unittest.TestCase):
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 6bb4127..6674831 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -2580,6 +2580,7 @@ class TextIOWrapperTest(unittest.TestCase):
         t.reconfigure(line_buffering=None)
         self.assertEqual(t.line_buffering, True)
 
+    @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
     def test_default_encoding(self):
         old_environ = dict(os.environ)
         try:
@@ -2599,6 +2600,7 @@ class TextIOWrapperTest(unittest.TestCase):
             os.environ.update(old_environ)
 
     @support.cpython_only
+    @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
     def test_device_encoding(self):
         # Issue 15989
         import _testcapi
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index 6346094..6933b41 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -527,7 +527,7 @@ class SysModuleTest(unittest.TestCase):
                  "inspect", "interactive", "optimize", "dont_write_bytecode",
                  "no_user_site", "no_site", "ignore_environment", "verbose",
                  "bytes_warning", "quiet", "hash_randomization", "isolated",
-                 "dev_mode")
+                 "dev_mode", "utf8_mode")
         for attr in attrs:
             self.assertTrue(hasattr(sys.flags, attr), attr)
             attr_type = bool if attr == "dev_mode" else int
@@ -535,6 +535,8 @@ class SysModuleTest(unittest.TestCase):
         self.assertTrue(repr(sys.flags))
         self.assertEqual(len(sys.flags), len(attrs))
 
+        self.assertIn(sys.flags.utf8_mode, {0, 1, 2})
+
     def assert_raise_on_new_sys_type(self, sys_attr):
         # Users are intentionally prevented from creating new instances of
         # sys.flags, sys.version_info, and sys.getwindowsversion.
@@ -710,8 +712,8 @@ class SysModuleTest(unittest.TestCase):
         # have no any effect
         out = self.c_locale_get_error_handler(encoding=':')
         self.assertEqual(out,
-                         'stdin: surrogateescape\n'
-                         'stdout: surrogateescape\n'
+                         'stdin: strict\n'
+                         'stdout: strict\n'
                          'stderr: backslashreplace\n')
         out = self.c_locale_get_error_handler(encoding='')
         self.assertEqual(out,
diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py
new file mode 100644
index 0000000..275a6ea
--- /dev/null
+++ b/Lib/test/test_utf8_mode.py
@@ -0,0 +1,206 @@
+"""
+Test the implementation of the PEP 540: the UTF-8 Mode.
+"""
+
+import locale
+import os
+import sys
+import textwrap
+import unittest
+from test.support.script_helper import assert_python_ok, assert_python_failure
+
+
+MS_WINDOWS = (sys.platform == 'win32')
+
+
+class UTF8ModeTests(unittest.TestCase):
+    # Override PYTHONUTF8 and PYTHONLEGACYWINDOWSFSENCODING environment
+    # variables by default
+    DEFAULT_ENV = {'PYTHONUTF8': '', 'PYTHONLEGACYWINDOWSFSENCODING': ''}
+
+    def posix_locale(self):
+        loc = locale.setlocale(locale.LC_CTYPE, None)
+        return (loc == 'C')
+
+    def get_output(self, *args, failure=False, **kw):
+        kw = dict(self.DEFAULT_ENV, **kw)
+        if failure:
+            out = assert_python_failure(*args, **kw)
+            out = out[2]
+        else:
+            out = assert_python_ok(*args, **kw)
+            out = out[1]
+        return out.decode().rstrip("\n\r")
+
+    @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
+    def test_posix_locale(self):
+        code = 'import sys; print(sys.flags.utf8_mode)'
+
+        out = self.get_output('-c', code, LC_ALL='C')
+        self.assertEqual(out, '1')
+
+    def test_xoption(self):
+        code = 'import sys; print(sys.flags.utf8_mode)'
+
+        out = self.get_output('-X', 'utf8', '-c', code)
+        self.assertEqual(out, '1')
+
+        # undocumented but accepted syntax: -X utf8=1
+        out = self.get_output('-X', 'utf8=1', '-c', code)
+        self.assertEqual(out, '1')
+
+        out = self.get_output('-X', 'utf8=0', '-c', code)
+        self.assertEqual(out, '0')
+
+        if MS_WINDOWS:
+            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
+            # and has the priority over -X utf8
+            out = self.get_output('-X', 'utf8', '-c', code,
+                                  PYTHONLEGACYWINDOWSFSENCODING='1')
+            self.assertEqual(out, '0')
+
+    def test_env_var(self):
+        code = 'import sys; print(sys.flags.utf8_mode)'
+
+        out = self.get_output('-c', code, PYTHONUTF8='1')
+        self.assertEqual(out, '1')
+
+        out = self.get_output('-c', code, PYTHONUTF8='0')
+        self.assertEqual(out, '0')
+
+        # -X utf8 has the priority over PYTHONUTF8
+        out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1')
+        self.assertEqual(out, '0')
+
+        if MS_WINDOWS:
+            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
+            # and has the priority over PYTHONUTF8
+            out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1',
+                                  PYTHONLEGACYWINDOWSFSENCODING='1')
+            self.assertEqual(out, '0')
+
+        # Cannot test with the POSIX locale, since the POSIX locale enables
+        # the UTF-8 mode
+        if not self.posix_locale():
+            # PYTHONUTF8 should be ignored if -E is used
+            out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
+            self.assertEqual(out, '0')
+
+        # invalid mode
+        out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
+        self.assertIn('invalid PYTHONUTF8 environment variable value',
+                      out.rstrip())
+
+    def test_filesystemencoding(self):
+        code = textwrap.dedent('''
+            import sys
+            print("{}/{}".format(sys.getfilesystemencoding(),
+                                 sys.getfilesystemencodeerrors()))
+        ''')
+
+        if MS_WINDOWS:
+            expected = 'utf-8/surrogatepass'
+        else:
+            expected = 'utf-8/surrogateescape'
+
+        out = self.get_output('-X', 'utf8', '-c', code)
+        self.assertEqual(out, expected)
+
+        if MS_WINDOWS:
+            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
+            # and has the priority over -X utf8 and PYTHONUTF8
+            out = self.get_output('-X', 'utf8', '-c', code,
+                                  PYTHONUTF8='strict',
+                                  PYTHONLEGACYWINDOWSFSENCODING='1')
+            self.assertEqual(out, 'mbcs/replace')
+
+    def test_stdio(self):
+        code = textwrap.dedent('''
+            import sys
+            print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
+            print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
+            print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
+        ''')
+
+        out = self.get_output('-X', 'utf8', '-c', code,
+                              PYTHONIOENCODING='')
+        self.assertEqual(out.splitlines(),
+                         ['stdin: utf-8/surrogateescape',
+                          'stdout: utf-8/surrogateescape',
+                          'stderr: utf-8/backslashreplace'])
+
+        # PYTHONIOENCODING has the priority over -X utf8
+        out = self.get_output('-X', 'utf8', '-c', code,
+                              PYTHONIOENCODING="latin1")
+        self.assertEqual(out.splitlines(),
+                         ['stdin: latin1/strict',
+                          'stdout: latin1/strict',
+                          'stderr: latin1/backslashreplace'])
+
+        out = self.get_output('-X', 'utf8', '-c', code,
+                              PYTHONIOENCODING=":namereplace")
+        self.assertEqual(out.splitlines(),
+                         ['stdin: UTF-8/namereplace',
+                          'stdout: UTF-8/namereplace',
+                          'stderr: UTF-8/backslashreplace'])
+
+    def test_io(self):
+        code = textwrap.dedent('''
+            import sys
+            filename = sys.argv[1]
+            with open(filename) as fp:
+                print(f"{fp.encoding}/{fp.errors}")
+        ''')
+        filename = __file__
+
+        out = self.get_output('-c', code, filename, PYTHONUTF8='1')
+        self.assertEqual(out, 'UTF-8/strict')
+
+    def _check_io_encoding(self, module, encoding=None, errors=None):
+        filename = __file__
+
+        # Encoding explicitly set
+        args = []
+        if encoding:
+            args.append(f'encoding={encoding!r}')
+        if errors:
+            args.append(f'errors={errors!r}')
+        code = textwrap.dedent('''
+            import sys
+            from %s import open
+            filename = sys.argv[1]
+            with open(filename, %s) as fp:
+                print(f"{fp.encoding}/{fp.errors}")
+        ''') % (module, ', '.join(args))
+        out = self.get_output('-c', code, filename,
+                              PYTHONUTF8='1')
+
+        if not encoding:
+            encoding = 'UTF-8'
+        if not errors:
+            errors = 'strict'
+        self.assertEqual(out, f'{encoding}/{errors}')
+
+    def check_io_encoding(self, module):
+        self._check_io_encoding(module, encoding="latin1")
+        self._check_io_encoding(module, errors="namereplace")
+        self._check_io_encoding(module,
+                                encoding="latin1", errors="namereplace")
+
+    def test_io_encoding(self):
+        self.check_io_encoding('io')  # the default io module
+
+    def test_pyio_encoding(self):  # distinct name: must not shadow test_io_encoding
+        self.check_io_encoding('_pyio')  # the pure-Python io implementation
+
+    def test_locale_getpreferredencoding(self):
+        code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
+        out = self.get_output('-X', 'utf8', '-c', code)
+        self.assertEqual(out, 'UTF-8 UTF-8')
+
+        out = self.get_output('-X', 'utf8', '-c', code, LC_ALL='C')
+        self.assertEqual(out, 'UTF-8 UTF-8')
+
+
+if __name__ == "__main__":
+    unittest.main()
author	Victor Stinner <victor.stinner@gmail.com>	2017-12-13 11:29:09 (GMT)
committer	GitHub <noreply@github.com>	2017-12-13 11:29:09 (GMT)
commit	91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch)
tree	ff002e0532736a97f3ddd367c1491e7b04611816 /Lib
parent	c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff)
download	cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2