summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Lib/test/test_cmd_line.py32
-rw-r--r--Misc/NEWS6
-rw-r--r--Modules/python.c8
-rw-r--r--Objects/unicodeobject.c114
4 files changed, 160 insertions, 0 deletions
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index 73acb9f..c864cdd 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -148,6 +148,38 @@ class CmdLineTest(unittest.TestCase):
if not stdout.startswith(pattern):
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
+    @unittest.skipUnless(sys.platform == 'darwin', 'test specific to Mac OS X')
+    def test_osx_utf8(self):
+        # Run a child interpreter with a raw bytes argument and verify that
+        # the child sees it decoded as UTF-8 with the surrogateescape error
+        # handler, even though the locale is forced to "C".
+        def check_output(text):
+            # What the child is expected to print: the ascii() form of the
+            # argument decoded from UTF-8/surrogateescape, plus a newline.
+            expected = ascii(text.decode('utf8', 'surrogateescape'))
+            expected = expected.encode('ascii') + b'\n'
+
+            # C locale gives ASCII locale encoding, but Python uses UTF-8
+            # to parse the command line arguments on Mac OS X
+            child_env = dict(os.environ, LC_ALL='C')
+
+            command = [
+                sys.executable,
+                "-c", "import sys; print(ascii(sys.argv[1]))",
+                text,
+            ]
+            child = subprocess.Popen(command,
+                                     stdout=subprocess.PIPE,
+                                     env=child_env)
+            # stderr is not piped, so only the first item is meaningful.
+            output = child.communicate()[0]
+            self.assertEqual(output, expected)
+            self.assertEqual(child.returncode, 0)
+
+        # test valid utf-8
+        check_output('e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8'))
+
+        # test invalid utf-8
+        invalid = b''.join((
+            b'\xff',          # invalid byte
+            b'\xc3\xa9',      # valid utf-8 character
+            b'\xc3\xff',      # invalid byte sequence
+            b'\xed\xa0\x80',  # lone surrogate character (invalid)
+        ))
+        check_output(invalid)
+
def test_unbuffered_output(self):
# Test expected operation of the '-u' switch
for stream in ('stdout', 'stderr'):
diff --git a/Misc/NEWS b/Misc/NEWS
index 60d546f..265b881 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,12 @@ What's New in Python 3.2 Beta 1?
Core and Builtins
-----------------
+- Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead
+ of the locale encoding. If none of the LANG, LC_ALL and LC_CTYPE environment
+ variables is set, the locale encoding is ISO-8859-1, whereas most programs
+ (including Python) expect UTF-8. Python already uses UTF-8 for the filesystem
+ encoding and to encode command line arguments on this OS.
+
- Issue #9713, #10114: Parser functions (eg. PyParser_ASTFromFile) expects
filenames encoded to the filesystem encoding with surrogateescape error
handler (to support undecodable bytes), instead of UTF-8 in strict mode.
diff --git a/Modules/python.c b/Modules/python.c
index 47685a4..18f9b3d 100644
--- a/Modules/python.c
+++ b/Modules/python.c
@@ -15,6 +15,10 @@ wmain(int argc, wchar_t **argv)
}
#else
+#ifdef __APPLE__
+extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
+#endif
+
int
main(int argc, char **argv)
{
@@ -41,7 +45,11 @@ main(int argc, char **argv)
oldloc = strdup(setlocale(LC_ALL, NULL));
setlocale(LC_ALL, "");
for (i = 0; i < argc; i++) {
+#ifdef __APPLE__
+ argv_copy[i] = _Py_DecodeUTF8_surrogateescape(argv[i], strlen(argv[i]));
+#else
argv_copy[i] = _Py_char2wchar(argv[i], NULL);
+#endif
if (!argv_copy[i])
return 1;
argv_copy2[i] = argv_copy[i];
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7564b67..f5c09dd 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2716,6 +2716,120 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
#undef ASCII_CHAR_MASK
+#ifdef __APPLE__
+
+/* Simplified UTF-8 decoder using the surrogateescape error handler,
+   used to decode the command line arguments on Mac OS X.
+
+   Decode the `size` bytes starting at `s` into a newly allocated,
+   NUL-terminated wchar_t string.  The buffer comes from PyMem_Malloc()
+   and is handed to the caller, who is responsible for releasing it.
+
+   Bytes that do not belong to a valid UTF-8 sequence (bad lead byte,
+   bad continuation byte, truncated sequence, overlong form, encoded
+   surrogate, value above U+10FFFF) are not errors: each such byte is
+   stored as the lone surrogate 0xDC00 + byte and decoding resumes with
+   the next byte.
+
+   Return NULL if the buffer size would overflow (PyErr_NoMemory() is
+   called) or if the allocation fails (this function sets no exception
+   in that case).
+
+   NOTE(review): the two failure paths are inconsistent about setting an
+   exception; confirm that every caller may legitimately reach
+   PyErr_NoMemory(), in particular callers running during startup. */
+
+wchar_t*
+_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
+{
+    int n;
+    const char *e;
+    wchar_t *unicode, *p;
+
+    /* Note: the output never needs more than `size` wchar_t units (plus
+       the terminator): every input sequence, valid or escaped, produces
+       at most as many units as it has bytes. */
+    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
+    if (!unicode)
+        return NULL;
+
+    /* Unpack UTF-8 encoded data */
+    p = unicode;
+    e = s + size;
+    while (s < e) {
+        Py_UCS4 ch = (unsigned char)*s;
+
+        /* Fast path: ASCII bytes map to themselves. */
+        if (ch < 0x80) {
+            *p++ = (wchar_t)ch;
+            s++;
+            continue;
+        }
+
+        /* utf8_code_length is defined elsewhere in this file; it is
+           indexed by the lead byte and gives the sequence length. */
+        n = utf8_code_length[ch];
+        /* Truncated sequence.  Compare lengths instead of computing
+           s + n, which could point past the end of the input. */
+        if (e - s < n) {
+            goto surrogateescape;
+        }
+
+        switch (n) {
+        case 0:
+        case 1:
+            /* A length of 0 or 1 for a non-ASCII byte is treated as an
+               invalid lead byte. */
+            goto surrogateescape;
+
+        case 2:
+            if ((s[1] & 0xc0) != 0x80)
+                goto surrogateescape;
+            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+            assert ((ch > 0x007F) && (ch <= 0x07FF));
+            *p++ = (wchar_t)ch;
+            break;
+
+        case 3:
+            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
+               will result in surrogates in range d800-dfff. Surrogates are
+               not valid UTF-8 so they are rejected.
+               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
+               The 0xE0 test rejects overlong forms, which would decode
+               to a value below 0x0800. */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xE0 &&
+                 (unsigned char)s[1] < 0xA0) ||
+                ((unsigned char)s[0] == 0xED &&
+                 (unsigned char)s[1] > 0x9F)) {
+
+                goto surrogateescape;
+            }
+            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
+            /* The buffer holds wchar_t, so cast to wchar_t like every
+               other store in this function. */
+            *p++ = (wchar_t)ch;
+            break;
+
+        case 4:
+            /* The 0xF0 test rejects overlong forms (below 0x10000) and
+               the 0xF4 test rejects values above 0x10FFFF. */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[3] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xF0 &&
+                 (unsigned char)s[1] < 0x90) ||
+                ((unsigned char)s[0] == 0xF4 &&
+                 (unsigned char)s[1] > 0x8F)) {
+                goto surrogateescape;
+            }
+            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
+
+#if SIZEOF_WCHAR_T == 4
+            *p++ = (wchar_t)ch;
+#else
+            /* compute and append the two surrogates: */
+
+            /* translate from 10000..10FFFF to 0..FFFF */
+            ch -= 0x10000;
+
+            /* high surrogate = top 10 bits added to D800 */
+            *p++ = (wchar_t)(0xD800 + (ch >> 10));
+
+            /* low surrogate = bottom 10 bits added to DC00 */
+            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
+#endif
+            break;
+        }
+        s += n;
+        continue;
+
+    surrogateescape:
+        /* ch still holds the offending lead byte here: every jump to
+           this label happens before ch is reassigned.  Escape that single
+           byte and retry from the following one. */
+        *p++ = (wchar_t)(0xDC00 + ch);
+        s++;
+    }
+    *p = L'\0';
+    return unicode;
+}
+
+#endif /* __APPLE__ */
/* Allocation strategy: if the string is short, convert into a stack buffer
and allocate exactly as much space needed at the end. Else allocate the