diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2020-11-15 16:16:59 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-11-15 16:16:59 (GMT) |
commit | a26215db11cfcf7b5f55cab9e91396761a0e0bcf (patch) | |
tree | 9cb26e5a66cdbeadf8de94948b80c95d4ddc32b0 | |
parent | 7a27c7ed4b2b45bb9ea27d3f5c4f423495d6e939 (diff) | |
download | cpython-a26215db11cfcf7b5f55cab9e91396761a0e0bcf.zip cpython-a26215db11cfcf7b5f55cab9e91396761a0e0bcf.tar.gz cpython-a26215db11cfcf7b5f55cab9e91396761a0e0bcf.tar.bz2 |
bpo-42318: Fix support of non-BMP characters in Tkinter on macOS (GH-23281)
-rw-r--r-- | Lib/test/test_tcl.py | 46 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst | 1 | ||||
-rw-r--r-- | Modules/_tkinter.c | 54 |
3 files changed, 94 insertions, 7 deletions
diff --git a/Lib/test/test_tcl.py b/Lib/test/test_tcl.py index cd2a30e..d104eb8 100644 --- a/Lib/test/test_tcl.py +++ b/Lib/test/test_tcl.py @@ -1,4 +1,5 @@ import unittest +import locale import re import subprocess import sys @@ -61,6 +62,10 @@ class TclTest(unittest.TestCase): tcl = self.interp self.assertEqual(tcl.eval('set a "a\\0b"'), 'a\x00b') + def test_eval_surrogates_in_result(self): + tcl = self.interp + self.assertIn(tcl.eval(r'set a "<\ud83d\udcbb>"'), '<\U0001f4bb>') + def testEvalException(self): tcl = self.interp self.assertRaises(TclError,tcl.eval,'set a') @@ -193,29 +198,48 @@ class TclTest(unittest.TestCase): def testEvalFile(self): tcl = self.interp - with open(os_helper.TESTFN, 'w') as f: - self.addCleanup(os_helper.unlink, os_helper.TESTFN) + filename = os_helper.TESTFN_ASCII + self.addCleanup(os_helper.unlink, filename) + with open(filename, 'w') as f: f.write("""set a 1 set b 2 set c [ expr $a + $b ] """) - tcl.evalfile(os_helper.TESTFN) + tcl.evalfile(filename) self.assertEqual(tcl.eval('set a'),'1') self.assertEqual(tcl.eval('set b'),'2') self.assertEqual(tcl.eval('set c'),'3') def test_evalfile_null_in_result(self): tcl = self.interp - with open(os_helper.TESTFN, 'w') as f: - self.addCleanup(os_helper.unlink, os_helper.TESTFN) + filename = os_helper.TESTFN_ASCII + self.addCleanup(os_helper.unlink, filename) + with open(filename, 'w') as f: f.write(""" set a "a\0b" set b "a\\0b" """) - tcl.evalfile(os_helper.TESTFN) + tcl.evalfile(filename) self.assertEqual(tcl.eval('set a'), 'a\x00b') self.assertEqual(tcl.eval('set b'), 'a\x00b') + def test_evalfile_surrogates_in_result(self): + tcl = self.interp + encoding = tcl.call('encoding', 'system') + self.addCleanup(tcl.call, 'encoding', 'system', encoding) + tcl.call('encoding', 'system', 'utf-8') + + filename = os_helper.TESTFN_ASCII + self.addCleanup(os_helper.unlink, filename) + with open(filename, 'wb') as f: + f.write(b""" + set a "<\xed\xa0\xbd\xed\xb2\xbb>" + set b "<\\ud83d\\udcbb>" + """) + tcl.evalfile(filename) + self.assertEqual(tcl.eval('set a'), '<\U0001f4bb>') + self.assertEqual(tcl.eval('set b'), '<\U0001f4bb>') + def testEvalFileException(self): tcl = self.interp filename = "doesnotexists" @@ -438,6 +462,11 @@ class TclTest(unittest.TestCase): self.assertEqual(passValue('str\x00ing\u20ac'), 'str\x00ing\u20ac') self.assertEqual(passValue('str\x00ing\U0001f4bb'), 'str\x00ing\U0001f4bb') + if sys.platform != 'win32': + self.assertEqual(passValue('<\udce2\udc82\udcac>'), + '<\u20ac>') + self.assertEqual(passValue('<\udced\udca0\udcbd\udced\udcb2\udcbb>'), + '<\U0001f4bb>') self.assertEqual(passValue(b'str\x00ing'), b'str\x00ing' if self.wantobjects else 'str\x00ing') self.assertEqual(passValue(b'str\xc0\x80ing'), @@ -497,6 +526,9 @@ class TclTest(unittest.TestCase): check('string\xbd') check('string\u20ac') check('string\U0001f4bb') + if sys.platform != 'win32': + check('<\udce2\udc82\udcac>', '<\u20ac>') + check('<\udced\udca0\udcbd\udced\udcb2\udcbb>', '<\U0001f4bb>') check('') check(b'string', 'string') check(b'string\xe2\x82\xac', 'string\xe2\x82\xac') @@ -540,6 +572,8 @@ class TclTest(unittest.TestCase): ('a \u20ac', ('a', '\u20ac')), ('a \U0001f4bb', ('a', '\U0001f4bb')), (b'a \xe2\x82\xac', ('a', '\u20ac')), + (b'a \xf0\x9f\x92\xbb', ('a', '\U0001f4bb')), + (b'a \xed\xa0\xbd\xed\xb2\xbb', ('a', '\U0001f4bb')), (b'a\xc0\x80b c\xc0\x80d', ('a\x00b', 'c\x00d')), ('a {b c}', ('a', 'b c')), (r'a b\ c', ('a', 'b c')), diff --git a/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst b/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst new file mode 100644 index 0000000..e72daeb --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst @@ -0,0 +1 @@ +Fixed support of non-BMP characters in :mod:`tkinter` on macOS. diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c index 793c5e7..b30141d 100644 --- a/Modules/_tkinter.c +++ b/Modules/_tkinter.c @@ -395,7 +395,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size) char *buf = NULL; PyErr_Clear(); - /* Tcl encodes null character as \xc0\x80 */ + /* Tcl encodes null character as \xc0\x80. + https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */ if (memchr(s, '\xc0', size)) { char *q; const char *e = s + size; @@ -419,6 +420,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size) if (buf != NULL) { PyMem_Free(buf); } + if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) { + return r; + } + + /* In CESU-8 non-BMP characters are represented as a surrogate pair, + like in UTF-16, and then each surrogate code point is encoded in UTF-8. + https://en.wikipedia.org/wiki/CESU-8 */ + Py_ssize_t len = PyUnicode_GET_LENGTH(r); + Py_ssize_t i, j; + /* All encoded surrogate characters start with \xED. */ + i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1); + if (i == -2) { + Py_DECREF(r); + return NULL; + } + if (i == -1) { + return r; + } + Py_UCS4 *u = PyUnicode_AsUCS4Copy(r); + Py_DECREF(r); + if (u == NULL) { + return NULL; + } + Py_UCS4 ch; + for (j = i; i < len; i++, u[j++] = ch) { + Py_UCS4 ch1, ch2, ch3, high, low; + /* Low surrogates U+D800 - U+DBFF are encoded as + \xED\xA0\x80 - \xED\xAF\xBF. */ + ch1 = ch = u[i]; + if (ch1 != 0xdcED) continue; + ch2 = u[i + 1]; + if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue; + ch3 = u[i + 2]; + if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue; + high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + /* High surrogates U+DC00 - U+DFFF are encoded as + \xED\xB0\x80 - \xED\xBF\xBF. */ + ch1 = u[i + 3]; + if (ch1 != 0xdcED) continue; + ch2 = u[i + 4]; + if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue; + ch3 = u[i + 5]; + if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue; + low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + ch = Py_UNICODE_JOIN_SURROGATES(high, low); + i += 5; + } + r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j); + PyMem_Free(u); return r; } |