summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2020-11-15 16:16:59 (GMT)
committer: GitHub <noreply@github.com> 2020-11-15 16:16:59 (GMT)
commit: a26215db11cfcf7b5f55cab9e91396761a0e0bcf (patch)
tree: 9cb26e5a66cdbeadf8de94948b80c95d4ddc32b0
parent: 7a27c7ed4b2b45bb9ea27d3f5c4f423495d6e939 (diff)
download: cpython-a26215db11cfcf7b5f55cab9e91396761a0e0bcf.zip
cpython-a26215db11cfcf7b5f55cab9e91396761a0e0bcf.tar.gz
cpython-a26215db11cfcf7b5f55cab9e91396761a0e0bcf.tar.bz2
bpo-42318: Fix support of non-BMP characters in Tkinter on macOS (GH-23281)
-rw-r--r-- Lib/test/test_tcl.py 46
-rw-r--r-- Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst 1
-rw-r--r-- Modules/_tkinter.c 54
3 files changed, 94 insertions, 7 deletions
diff --git a/Lib/test/test_tcl.py b/Lib/test/test_tcl.py
index cd2a30e..d104eb8 100644
--- a/Lib/test/test_tcl.py
+++ b/Lib/test/test_tcl.py
@@ -1,4 +1,5 @@
import unittest
+import locale
import re
import subprocess
import sys
@@ -61,6 +62,10 @@ class TclTest(unittest.TestCase):
tcl = self.interp
self.assertEqual(tcl.eval('set a "a\\0b"'), 'a\x00b')
+ def test_eval_surrogates_in_result(self):
+ tcl = self.interp
+ self.assertIn(tcl.eval(r'set a "<\ud83d\udcbb>"'), '<\U0001f4bb>')
+
def testEvalException(self):
tcl = self.interp
self.assertRaises(TclError,tcl.eval,'set a')
@@ -193,29 +198,48 @@ class TclTest(unittest.TestCase):
def testEvalFile(self):
tcl = self.interp
- with open(os_helper.TESTFN, 'w') as f:
- self.addCleanup(os_helper.unlink, os_helper.TESTFN)
+ filename = os_helper.TESTFN_ASCII
+ self.addCleanup(os_helper.unlink, filename)
+ with open(filename, 'w') as f:
f.write("""set a 1
set b 2
set c [ expr $a + $b ]
""")
- tcl.evalfile(os_helper.TESTFN)
+ tcl.evalfile(filename)
self.assertEqual(tcl.eval('set a'),'1')
self.assertEqual(tcl.eval('set b'),'2')
self.assertEqual(tcl.eval('set c'),'3')
def test_evalfile_null_in_result(self):
tcl = self.interp
- with open(os_helper.TESTFN, 'w') as f:
- self.addCleanup(os_helper.unlink, os_helper.TESTFN)
+ filename = os_helper.TESTFN_ASCII
+ self.addCleanup(os_helper.unlink, filename)
+ with open(filename, 'w') as f:
f.write("""
set a "a\0b"
set b "a\\0b"
""")
- tcl.evalfile(os_helper.TESTFN)
+ tcl.evalfile(filename)
self.assertEqual(tcl.eval('set a'), 'a\x00b')
self.assertEqual(tcl.eval('set b'), 'a\x00b')
+ def test_evalfile_surrogates_in_result(self):
+ tcl = self.interp
+ encoding = tcl.call('encoding', 'system')
+ self.addCleanup(tcl.call, 'encoding', 'system', encoding)
+ tcl.call('encoding', 'system', 'utf-8')
+
+ filename = os_helper.TESTFN_ASCII
+ self.addCleanup(os_helper.unlink, filename)
+ with open(filename, 'wb') as f:
+ f.write(b"""
+ set a "<\xed\xa0\xbd\xed\xb2\xbb>"
+ set b "<\\ud83d\\udcbb>"
+ """)
+ tcl.evalfile(filename)
+ self.assertEqual(tcl.eval('set a'), '<\U0001f4bb>')
+ self.assertEqual(tcl.eval('set b'), '<\U0001f4bb>')
+
def testEvalFileException(self):
tcl = self.interp
filename = "doesnotexists"
@@ -438,6 +462,11 @@ class TclTest(unittest.TestCase):
self.assertEqual(passValue('str\x00ing\u20ac'), 'str\x00ing\u20ac')
self.assertEqual(passValue('str\x00ing\U0001f4bb'),
'str\x00ing\U0001f4bb')
+ if sys.platform != 'win32':
+ self.assertEqual(passValue('<\udce2\udc82\udcac>'),
+ '<\u20ac>')
+ self.assertEqual(passValue('<\udced\udca0\udcbd\udced\udcb2\udcbb>'),
+ '<\U0001f4bb>')
self.assertEqual(passValue(b'str\x00ing'),
b'str\x00ing' if self.wantobjects else 'str\x00ing')
self.assertEqual(passValue(b'str\xc0\x80ing'),
@@ -497,6 +526,9 @@ class TclTest(unittest.TestCase):
check('string\xbd')
check('string\u20ac')
check('string\U0001f4bb')
+ if sys.platform != 'win32':
+ check('<\udce2\udc82\udcac>', '<\u20ac>')
+ check('<\udced\udca0\udcbd\udced\udcb2\udcbb>', '<\U0001f4bb>')
check('')
check(b'string', 'string')
check(b'string\xe2\x82\xac', 'string\xe2\x82\xac')
@@ -540,6 +572,8 @@ class TclTest(unittest.TestCase):
('a \u20ac', ('a', '\u20ac')),
('a \U0001f4bb', ('a', '\U0001f4bb')),
(b'a \xe2\x82\xac', ('a', '\u20ac')),
+ (b'a \xf0\x9f\x92\xbb', ('a', '\U0001f4bb')),
+ (b'a \xed\xa0\xbd\xed\xb2\xbb', ('a', '\U0001f4bb')),
(b'a\xc0\x80b c\xc0\x80d', ('a\x00b', 'c\x00d')),
('a {b c}', ('a', 'b c')),
(r'a b\ c', ('a', 'b c')),
diff --git a/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst b/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst
new file mode 100644
index 0000000..e72daeb
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2020-11-14-13-46-27.bpo-42318.wYAcBD.rst
@@ -0,0 +1 @@
+Fixed support of non-BMP characters in :mod:`tkinter` on macOS.
diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c
index 793c5e7..b30141d 100644
--- a/Modules/_tkinter.c
+++ b/Modules/_tkinter.c
@@ -395,7 +395,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
char *buf = NULL;
PyErr_Clear();
- /* Tcl encodes null character as \xc0\x80 */
+ /* Tcl encodes null character as \xc0\x80.
+ https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */
if (memchr(s, '\xc0', size)) {
char *q;
const char *e = s + size;
@@ -419,6 +420,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
if (buf != NULL) {
PyMem_Free(buf);
}
+ if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) {
+ return r;
+ }
+
+ /* In CESU-8 non-BMP characters are represented as a surrogate pair,
+ like in UTF-16, and then each surrogate code point is encoded in UTF-8.
+ https://en.wikipedia.org/wiki/CESU-8 */
+ Py_ssize_t len = PyUnicode_GET_LENGTH(r);
+ Py_ssize_t i, j;
+ /* All encoded surrogate characters start with \xED. */
+ i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1);
+ if (i == -2) {
+ Py_DECREF(r);
+ return NULL;
+ }
+ if (i == -1) {
+ return r;
+ }
+ Py_UCS4 *u = PyUnicode_AsUCS4Copy(r);
+ Py_DECREF(r);
+ if (u == NULL) {
+ return NULL;
+ }
+ Py_UCS4 ch;
+ for (j = i; i < len; i++, u[j++] = ch) {
+ Py_UCS4 ch1, ch2, ch3, high, low;
+ /* High surrogates U+D800 - U+DBFF are encoded as
+ \xED\xA0\x80 - \xED\xAF\xBF. */
+ ch1 = ch = u[i];
+ if (ch1 != 0xdcED) continue;
+ ch2 = u[i + 1];
+ if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue;
+ ch3 = u[i + 2];
+ if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
+ high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
+ assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+ /* Low surrogates U+DC00 - U+DFFF are encoded as
+ \xED\xB0\x80 - \xED\xBF\xBF. */
+ ch1 = u[i + 3];
+ if (ch1 != 0xdcED) continue;
+ ch2 = u[i + 4];
+ if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue;
+ ch3 = u[i + 5];
+ if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
+ low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
+ assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+ ch = Py_UNICODE_JOIN_SURROGATES(high, low);
+ i += 5;
+ }
+ r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j);
+ PyMem_Free(u);
return r;
}