summaryrefslogtreecommitdiffstats
path: root/Modules/_tkinter.c
diff options
context:
space:
mode:
author: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> 2020-12-25 22:35:46 (GMT)
committer: GitHub <noreply@github.com> 2020-12-25 22:35:46 (GMT)
commit: 4d840e428ab1a2712f219c5e4008658cbe15892e (patch)
tree: ea6cb19f2f15bcf1d9273ae9535c5cbc4dfacb37 /Modules/_tkinter.c
parent: 0178a6b67ca3e782443f311e953509ca3eb4aacf (diff)
download: cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.zip
cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.tar.gz
cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.tar.bz2
[3.8] bpo-42318: Fix support of non-BMP characters in Tkinter on macOS (GH-23281). (GH-23784) (GH-23787)
(cherry picked from commit a26215db11cfcf7b5f55cab9e91396761a0e0bcf) (cherry picked from commit 28bf6ab61f77c69b732a211c398ac882bf3f65f4)
Diffstat (limited to 'Modules/_tkinter.c')
-rw-r--r-- Modules/_tkinter.c 54
1 files changed, 53 insertions, 1 deletions
diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c
index a1071e5..c1eb6e1 100644
--- a/Modules/_tkinter.c
+++ b/Modules/_tkinter.c
@@ -397,7 +397,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
char *buf = NULL;
PyErr_Clear();
- /* Tcl encodes null character as \xc0\x80 */
+ /* Tcl encodes null character as \xc0\x80.
+ https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */
if (memchr(s, '\xc0', size)) {
char *q;
const char *e = s + size;
@@ -421,6 +422,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
if (buf != NULL) {
PyMem_Free(buf);
}
+ if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) {
+ return r;
+ }
+
+ /* In CESU-8 non-BMP characters are represented as a surrogate pair,
+ like in UTF-16, and then each surrogate code point is encoded in UTF-8.
+ https://en.wikipedia.org/wiki/CESU-8 */
+ Py_ssize_t len = PyUnicode_GET_LENGTH(r);
+ Py_ssize_t i, j;
+ /* All encoded surrogate characters start with \xED. */
+ i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1);
+ if (i == -2) {
+ Py_DECREF(r);
+ return NULL;
+ }
+ if (i == -1) {
+ return r;
+ }
+ Py_UCS4 *u = PyUnicode_AsUCS4Copy(r);
+ Py_DECREF(r);
+ if (u == NULL) {
+ return NULL;
+ }
+ Py_UCS4 ch;
+ for (j = i; i < len; i++, u[j++] = ch) {
+ Py_UCS4 ch1, ch2, ch3, high, low;
+ /* High surrogates U+D800 - U+DBFF are encoded as
+ \xED\xA0\x80 - \xED\xAF\xBF. */
+ ch1 = ch = u[i];
+ if (ch1 != 0xdcED) continue;
+ ch2 = u[i + 1];
+ if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue;
+ ch3 = u[i + 2];
+ if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
+ high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
+ assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+ /* Low surrogates U+DC00 - U+DFFF are encoded as
+ \xED\xB0\x80 - \xED\xBF\xBF. */
+ ch1 = u[i + 3];
+ if (ch1 != 0xdcED) continue;
+ ch2 = u[i + 4];
+ if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue;
+ ch3 = u[i + 5];
+ if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
+ low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
+ assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+ ch = Py_UNICODE_JOIN_SURROGATES(high, low);
+ i += 5;
+ }
+ r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j);
+ PyMem_Free(u);
return r;
}