summaryrefslogtreecommitdiffstats
path: root/Lib/test
diff options
context:
space:
mode:
authorMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>2021-03-29 19:36:47 (GMT)
committerGitHub <noreply@github.com>2021-03-29 19:36:47 (GMT)
commit3b6e61ee0812359029cac176042d9c835c60f185 (patch)
tree1822604c7a133ca320ae917e4105a4015e0f74e7 /Lib/test
parente9092b221d4951609827e437178a557fd07353af (diff)
downloadcpython-3b6e61ee0812359029cac176042d9c835c60f185.zip
cpython-3b6e61ee0812359029cac176042d9c835c60f185.tar.gz
cpython-3b6e61ee0812359029cac176042d9c835c60f185.tar.bz2
bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843) (GH-24906)
Python no longer fails at startup with a fatal error if a command line argument contains an invalid Unicode character. The Py_DecodeLocale() function now escapes byte sequences which would be decoded as Unicode characters outside the [U+0000; U+10ffff] range. Use MAX_UNICODE constant in unicodeobject.c. (cherry picked from commit 9976834f807ea63ca51bc4f89be457d734148682) Co-authored-by: Victor Stinner <vstinner@python.org> Co-authored-by: Victor Stinner <vstinner@python.org>
Diffstat (limited to 'Lib/test')
-rw-r--r--Lib/test/test_cmd_line.py86
1 files changed, 60 insertions, 26 deletions
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index a5ece9b..871a9c7 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -190,38 +190,72 @@ class CmdLineTest(unittest.TestCase):
if not stdout.startswith(pattern):
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
+ @unittest.skipIf(sys.platform == 'win32',
+ 'Windows has a native unicode API')
+ def test_invalid_utf8_arg(self):
+ # bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba'
+ # byte sequence with surrogateescape rather than decoding it as the
+ # U+7fffbeba character which is outside the [U+0000; U+10ffff] range of
+ # Python Unicode characters.
+ #
+ # Test with default config, in the C locale, in the Python UTF-8 Mode.
+ code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))'
+ base_cmd = [sys.executable, '-c', code]
+
+ def run_default(arg):
+ cmd = [sys.executable, '-c', code, arg]
+ return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
+ def run_c_locale(arg):
+ cmd = [sys.executable, '-c', code, arg]
+ env = dict(os.environ)
+ env['LC_ALL'] = 'C'
+ return subprocess.run(cmd, stdout=subprocess.PIPE,
+ text=True, env=env)
+
+ def run_utf8_mode(arg):
+ cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
+ return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
+ valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
+ # invalid UTF-8 byte sequences with a valid UTF-8 sequence
+ # in the middle.
+ invalid_utf8 = (
+ b'\xff' # invalid byte
+ b'\xc3\xff' # invalid byte sequence
+ b'\xc3\xa9' # valid utf-8: U+00E9 character
+ b'\xed\xa0\x80' # lone surrogate character (invalid)
+ b'\xfd\xbf\xbf\xbb\xba\xba' # character outside [U+0000; U+10ffff]
+ )
+ test_args = [valid_utf8, invalid_utf8]
+
+ for run_cmd in (run_default, run_c_locale, run_utf8_mode):
+ with self.subTest(run_cmd=run_cmd):
+ for arg in test_args:
+ proc = run_cmd(arg)
+ self.assertEqual(proc.stdout.rstrip(), ascii(arg))
+
@unittest.skipUnless((sys.platform == 'darwin' or
support.is_android), 'test specific to Mac OS X and Android')
def test_osx_android_utf8(self):
- def check_output(text):
- decoded = text.decode('utf-8', 'surrogateescape')
- expected = ascii(decoded).encode('ascii') + b'\n'
+ text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
+ code = "import sys; print(ascii(sys.argv[1]))"
- env = os.environ.copy()
- # C locale gives ASCII locale encoding, but Python uses UTF-8
- # to parse the command line arguments on Mac OS X and Android.
- env['LC_ALL'] = 'C'
+ decoded = text.decode('utf-8', 'surrogateescape')
+ expected = ascii(decoded).encode('ascii') + b'\n'
- p = subprocess.Popen(
- (sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
- stdout=subprocess.PIPE,
- env=env)
- stdout, stderr = p.communicate()
- self.assertEqual(stdout, expected)
- self.assertEqual(p.returncode, 0)
+ env = os.environ.copy()
+ # C locale gives ASCII locale encoding, but Python uses UTF-8
+ # to parse the command line arguments on Mac OS X and Android.
+ env['LC_ALL'] = 'C'
- # test valid utf-8
- text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
- check_output(text)
-
- # test invalid utf-8
- text = (
- b'\xff' # invalid byte
- b'\xc3\xa9' # valid utf-8 character
- b'\xc3\xff' # invalid byte sequence
- b'\xed\xa0\x80' # lone surrogate character (invalid)
- )
- check_output(text)
+ p = subprocess.Popen(
+ (sys.executable, "-c", code, text),
+ stdout=subprocess.PIPE,
+ env=env)
+ stdout, stderr = p.communicate()
+ self.assertEqual(stdout, expected)
+ self.assertEqual(p.returncode, 0)
def test_unbuffered_output(self):
# Test expected operation of the '-u' switch