Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead of the locale encoding.

If none of the LANG, LC_ALL and LC_CTYPE environment variables is set, the locale encoding is ISO-8859-1, whereas most programs (including Python) expect UTF-8. Python already uses UTF-8 for the filesystem encoding and to encode command line arguments on this OS.
author: Victor Stinner <victor.stinner@haypocalc.com> 2010-10-20 22:58:25 (GMT)
committer: Victor Stinner <victor.stinner@haypocalc.com> 2010-10-20 22:58:25 (GMT)
commit: f933e1ab6fdea76973384e38ea95520de422c340 (patch)
tree: 88a9a55449b4eb3a2167630127f2b9640f678e3e /Lib
parent: 073f759d65118674c4c7b82f778dde44ae22c6c9 (diff)
download: cpython-f933e1ab6fdea76973384e38ea95520de422c340.zip
cpython-f933e1ab6fdea76973384e38ea95520de422c340.tar.gz
cpython-f933e1ab6fdea76973384e38ea95520de422c340.tar.bz2
1 files changed, 32 insertions, 0 deletions
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index 73acb9f..c864cdd 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -148,6 +148,38 @@ class CmdLineTest(unittest.TestCase):
         if not stdout.startswith(pattern):
             raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
 
+    @unittest.skipUnless(sys.platform == 'darwin', 'test specific to Mac OS X')
+    def test_osx_utf8(self):
+        def check_output(text):
+            decoded = text.decode('utf8', 'surrogateescape')
+            expected = ascii(decoded).encode('ascii') + b'\n'
+
+            env = os.environ.copy()
+            # C locale gives ASCII locale encoding, but Python uses UTF-8
+            # to parse the command line arguments on Mac OS X
+            env['LC_ALL'] = 'C'
+
+            p = subprocess.Popen(
+                (sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
+                stdout=subprocess.PIPE,
+                env=env)
+            stdout, stderr = p.communicate()
+            self.assertEqual(stdout, expected)
+            self.assertEqual(p.returncode, 0)
+
+        # test valid utf-8
+        text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
+        check_output(text)
+
+        # test invalid utf-8
+        text = (
+            b'\xff'         # invalid byte
+            b'\xc3\xa9'     # valid utf-8 character
+            b'\xc3\xff'     # invalid byte sequence
+            b'\xed\xa0\x80' # lone surrogate character (invalid)
+        )
+        check_output(text)
+
     def test_unbuffered_output(self):
         # Test expected operation of the '-u' switch
         for stream in ('stdout', 'stderr'):
author	Victor Stinner <victor.stinner@haypocalc.com>	2010-10-20 22:58:25 (GMT)
committer	Victor Stinner <victor.stinner@haypocalc.com>	2010-10-20 22:58:25 (GMT)
commit	f933e1ab6fdea76973384e38ea95520de422c340 (patch)
tree	88a9a55449b4eb3a2167630127f2b9640f678e3e /Lib
parent	073f759d65118674c4c7b82f778dde44ae22c6c9 (diff)
download	cpython-f933e1ab6fdea76973384e38ea95520de422c340.zip cpython-f933e1ab6fdea76973384e38ea95520de422c340.tar.gz cpython-f933e1ab6fdea76973384e38ea95520de422c340.tar.bz2