summaryrefslogtreecommitdiffstats
path: root/Lib/test
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2017-12-13 11:29:09 (GMT)
committerGitHub <noreply@github.com>2017-12-13 11:29:09 (GMT)
commit91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch)
treeff002e0532736a97f3ddd367c1491e7b04611816 /Lib/test
parentc3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff)
downloadcpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz
cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2
bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)
* Add -X utf8 command line option, PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag. * If the LC_CTYPE locale is "C" at startup: automatically enable the UTF-8 mode. * Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page. * locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode. * Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 Mode. * Update subprocess._args_from_interpreter_flags() to handle -X utf8 * Skip some tests relying on the current locale if the UTF-8 mode is enabled. * Add test_utf8_mode.py. * _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to also return the length (number of wide characters). * pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value.
Diffstat (limited to 'Lib/test')
-rw-r--r--Lib/test/test_builtin.py1
-rw-r--r--Lib/test/test_c_locale_coercion.py2
-rw-r--r--Lib/test/test_codecs.py10
-rw-r--r--Lib/test/test_io.py2
-rw-r--r--Lib/test/test_sys.py8
-rw-r--r--Lib/test/test_utf8_mode.py206
6 files changed, 217 insertions, 12 deletions
diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py
index 0a61c05..9329318 100644
--- a/Lib/test/test_builtin.py
+++ b/Lib/test/test_builtin.py
@@ -1022,6 +1022,7 @@ class BuiltinTest(unittest.TestCase):
self.assertRaises(ValueError, open, 'a\x00b')
self.assertRaises(ValueError, open, b'a\x00b')
+ @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
def test_open_default_encoding(self):
old_environ = dict(os.environ)
try:
diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py
index 2a22739..c0845d7 100644
--- a/Lib/test/test_c_locale_coercion.py
+++ b/Lib/test/test_c_locale_coercion.py
@@ -130,7 +130,7 @@ class EncodingDetails(_EncodingDetails):
that.
"""
result, py_cmd = run_python_until_end(
- "-c", cls.CHILD_PROCESS_SCRIPT,
+ "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
__isolated=True,
**env_vars
)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index eb21a39..a59a5e2 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -5,6 +5,7 @@ import locale
import sys
import unittest
import encodings
+from unittest import mock
from test import support
@@ -3180,16 +3181,9 @@ class CodePageTest(unittest.TestCase):
def test_mbcs_alias(self):
# Check that looking up our 'default' codepage will return
# mbcs when we don't have a more specific one available
- import _bootlocale
- def _get_fake_codepage(*a):
- return 'cp123'
- old_getpreferredencoding = _bootlocale.getpreferredencoding
- _bootlocale.getpreferredencoding = _get_fake_codepage
- try:
+ with mock.patch('_winapi.GetACP', return_value=123):
codec = codecs.lookup('cp123')
self.assertEqual(codec.name, 'mbcs')
- finally:
- _bootlocale.getpreferredencoding = old_getpreferredencoding
class ASCIITest(unittest.TestCase):
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 6bb4127..6674831 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -2580,6 +2580,7 @@ class TextIOWrapperTest(unittest.TestCase):
t.reconfigure(line_buffering=None)
self.assertEqual(t.line_buffering, True)
+ @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
def test_default_encoding(self):
old_environ = dict(os.environ)
try:
@@ -2599,6 +2600,7 @@ class TextIOWrapperTest(unittest.TestCase):
os.environ.update(old_environ)
@support.cpython_only
+ @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
def test_device_encoding(self):
# Issue 15989
import _testcapi
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index 6346094..6933b41 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -527,7 +527,7 @@ class SysModuleTest(unittest.TestCase):
"inspect", "interactive", "optimize", "dont_write_bytecode",
"no_user_site", "no_site", "ignore_environment", "verbose",
"bytes_warning", "quiet", "hash_randomization", "isolated",
- "dev_mode")
+ "dev_mode", "utf8_mode")
for attr in attrs:
self.assertTrue(hasattr(sys.flags, attr), attr)
attr_type = bool if attr == "dev_mode" else int
@@ -535,6 +535,8 @@ class SysModuleTest(unittest.TestCase):
self.assertTrue(repr(sys.flags))
self.assertEqual(len(sys.flags), len(attrs))
+ self.assertIn(sys.flags.utf8_mode, {0, 1, 2})
+
def assert_raise_on_new_sys_type(self, sys_attr):
# Users are intentionally prevented from creating new instances of
# sys.flags, sys.version_info, and sys.getwindowsversion.
@@ -710,8 +712,8 @@ class SysModuleTest(unittest.TestCase):
# have no any effect
out = self.c_locale_get_error_handler(encoding=':')
self.assertEqual(out,
- 'stdin: surrogateescape\n'
- 'stdout: surrogateescape\n'
+ 'stdin: strict\n'
+ 'stdout: strict\n'
'stderr: backslashreplace\n')
out = self.c_locale_get_error_handler(encoding='')
self.assertEqual(out,
diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py
new file mode 100644
index 0000000..275a6ea
--- /dev/null
+++ b/Lib/test/test_utf8_mode.py
@@ -0,0 +1,206 @@
+"""
+Test the implementation of the PEP 540: the UTF-8 Mode.
+"""
+
+import locale
+import os
+import sys
+import textwrap
+import unittest
+from test.support.script_helper import assert_python_ok, assert_python_failure
+
+
+MS_WINDOWS = (sys.platform == 'win32')
+
+
+class UTF8ModeTests(unittest.TestCase):
+    """Tests for the UTF-8 Mode (PEP 540).
+
+    Each test runs a child interpreter through get_output() with a given
+    combination of command line options and environment variables, then
+    checks what the child printed.
+    """
+
+    # Override PYTHONUTF8 and PYTHONLEGACYWINDOWSFSENCODING environment
+    # variables by default
+    DEFAULT_ENV = {'PYTHONUTF8': '', 'PYTHONLEGACYWINDOWSFSENCODING': ''}
+
+    def posix_locale(self):
+        """Return True if the current LC_CTYPE locale is the POSIX locale."""
+        loc = locale.setlocale(locale.LC_CTYPE, None)
+        # The POSIX locale can be reported either as "C" or as "POSIX"
+        return loc in ('C', 'POSIX')
+
+    def get_output(self, *args, failure=False, **kw):
+        """Run a child Python and return its decoded output.
+
+        *args* are passed to the script helper as command line arguments.
+        Keyword arguments are merged over DEFAULT_ENV before being passed
+        to the helper.
+
+        If *failure* is true, the child is expected to fail and item 2 of
+        the helper result (stderr) is returned; otherwise the child is
+        expected to succeed and item 1 (stdout) is returned. Trailing
+        newline characters are stripped from the returned string.
+        """
+        kw = dict(self.DEFAULT_ENV, **kw)
+        if failure:
+            out = assert_python_failure(*args, **kw)
+            out = out[2]
+        else:
+            out = assert_python_ok(*args, **kw)
+            out = out[1]
+        return out.decode().rstrip("\n\r")
+
+    @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
+    def test_posix_locale(self):
+        # The POSIX locale enables the UTF-8 mode automatically
+        code = 'import sys; print(sys.flags.utf8_mode)'
+
+        out = self.get_output('-c', code, LC_ALL='C')
+        self.assertEqual(out, '1')
+
+    def test_xoption(self):
+        # -X utf8 command line option
+        code = 'import sys; print(sys.flags.utf8_mode)'
+
+        out = self.get_output('-X', 'utf8', '-c', code)
+        self.assertEqual(out, '1')
+
+        # undocumented but accepted syntax: -X utf8=1
+        out = self.get_output('-X', 'utf8=1', '-c', code)
+        self.assertEqual(out, '1')
+
+        out = self.get_output('-X', 'utf8=0', '-c', code)
+        self.assertEqual(out, '0')
+
+        if MS_WINDOWS:
+            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
+            # and has the priority over -X utf8
+            out = self.get_output('-X', 'utf8', '-c', code,
+                                  PYTHONLEGACYWINDOWSFSENCODING='1')
+            self.assertEqual(out, '0')
+
+    def test_env_var(self):
+        # PYTHONUTF8 environment variable
+        code = 'import sys; print(sys.flags.utf8_mode)'
+
+        out = self.get_output('-c', code, PYTHONUTF8='1')
+        self.assertEqual(out, '1')
+
+        out = self.get_output('-c', code, PYTHONUTF8='0')
+        self.assertEqual(out, '0')
+
+        # -X utf8 has the priority over PYTHONUTF8
+        out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1')
+        self.assertEqual(out, '0')
+
+        if MS_WINDOWS:
+            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
+            # and has the priority over PYTHONUTF8
+            out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1',
+                                  PYTHONLEGACYWINDOWSFSENCODING='1')
+            self.assertEqual(out, '0')
+
+        # Cannot test with the POSIX locale, since the POSIX locale enables
+        # the UTF-8 mode
+        if not self.posix_locale():
+            # PYTHONUTF8 should be ignored if -E is used
+            out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
+            self.assertEqual(out, '0')
+
+        # invalid mode
+        out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
+        self.assertIn('invalid PYTHONUTF8 environment variable value',
+                      out.rstrip())
+
+    def test_filesystemencoding(self):
+        # Filesystem encoding and error handler in the UTF-8 mode
+        code = textwrap.dedent('''
+            import sys
+            print("{}/{}".format(sys.getfilesystemencoding(),
+                                 sys.getfilesystemencodeerrors()))
+        ''')
+
+        if MS_WINDOWS:
+            expected = 'utf-8/surrogatepass'
+        else:
+            expected = 'utf-8/surrogateescape'
+
+        out = self.get_output('-X', 'utf8', '-c', code)
+        self.assertEqual(out, expected)
+
+        if MS_WINDOWS:
+            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
+            # and has the priority over -X utf8 and PYTHONUTF8
+            out = self.get_output('-X', 'utf8', '-c', code,
+                                  PYTHONUTF8='strict',
+                                  PYTHONLEGACYWINDOWSFSENCODING='1')
+            self.assertEqual(out, 'mbcs/replace')
+
+    def test_stdio(self):
+        # Encoding and error handler of the standard streams
+        code = textwrap.dedent('''
+            import sys
+            print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
+            print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
+            print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
+        ''')
+
+        out = self.get_output('-X', 'utf8', '-c', code,
+                              PYTHONIOENCODING='')
+        self.assertEqual(out.splitlines(),
+                         ['stdin: utf-8/surrogateescape',
+                          'stdout: utf-8/surrogateescape',
+                          'stderr: utf-8/backslashreplace'])
+
+        # PYTHONIOENCODING has the priority over PYTHONUTF8
+        out = self.get_output('-X', 'utf8', '-c', code,
+                              PYTHONIOENCODING="latin1")
+        self.assertEqual(out.splitlines(),
+                         ['stdin: latin1/strict',
+                          'stdout: latin1/strict',
+                          'stderr: latin1/backslashreplace'])
+
+        out = self.get_output('-X', 'utf8', '-c', code,
+                              PYTHONIOENCODING=":namereplace")
+        self.assertEqual(out.splitlines(),
+                         ['stdin: UTF-8/namereplace',
+                          'stdout: UTF-8/namereplace',
+                          'stderr: UTF-8/backslashreplace'])
+
+    def test_io(self):
+        # open() without explicit encoding nor errors in the UTF-8 mode
+        code = textwrap.dedent('''
+            import sys
+            filename = sys.argv[1]
+            with open(filename) as fp:
+                print(f"{fp.encoding}/{fp.errors}")
+        ''')
+        filename = __file__
+
+        out = self.get_output('-c', code, filename, PYTHONUTF8='1')
+        self.assertEqual(out, 'UTF-8/strict')
+
+    def _check_io_encoding(self, module, encoding=None, errors=None):
+        """Check open() of *module* in a child run with PYTHONUTF8=1.
+
+        *encoding* and *errors*, when set, are passed explicitly to open()
+        and must be reported unchanged by the file object. A value which is
+        not set is expected to be reported as 'UTF-8' (encoding) or
+        'strict' (errors).
+        """
+        filename = __file__
+
+        # Encoding explicitly set
+        args = []
+        if encoding:
+            args.append(f'encoding={encoding!r}')
+        if errors:
+            args.append(f'errors={errors!r}')
+        code = textwrap.dedent('''
+            import sys
+            from %s import open
+            filename = sys.argv[1]
+            with open(filename, %s) as fp:
+                print(f"{fp.encoding}/{fp.errors}")
+        ''') % (module, ', '.join(args))
+        out = self.get_output('-c', code, filename,
+                              PYTHONUTF8='1')
+
+        if not encoding:
+            encoding = 'UTF-8'
+        if not errors:
+            errors = 'strict'
+        self.assertEqual(out, f'{encoding}/{errors}')
+
+    def check_io_encoding(self, module):
+        """Run _check_io_encoding() on *module* with explicit arguments."""
+        self._check_io_encoding(module, encoding="latin1")
+        self._check_io_encoding(module, errors="namereplace")
+        self._check_io_encoding(module,
+                                encoding="latin1", errors="namereplace")
+
+    def test_io_encoding(self):
+        self.check_io_encoding('io')
+
+    # This test needs its own name: a second method called
+    # test_io_encoding would replace the 'io' test above in the class
+    # namespace, and the 'io' module would never be checked
+    def test_pyio_encoding(self):
+        self.check_io_encoding('_pyio')
+
+    def test_locale_getpreferredencoding(self):
+        # locale.getpreferredencoding() returns 'UTF-8' in the UTF-8 mode
+        code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
+        out = self.get_output('-X', 'utf8', '-c', code)
+        self.assertEqual(out, 'UTF-8 UTF-8')
+
+        out = self.get_output('-X', 'utf8', '-c', code, LC_ALL='C')
+        self.assertEqual(out, 'UTF-8 UTF-8')
+
+
+if __name__ == "__main__":
+ unittest.main()