[3.8] bpo-42318: Fix support of non-BMP characters in Tkinter on macOS (GH-23281). (GH-23784) (GH-23787)

(cherry picked from commit a26215db11cfcf7b5f55cab9e91396761a0e0bcf) (cherry picked from commit 28bf6ab61f77c69b732a211c398ac882bf3f65f4)
author: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> 2020-12-25 22:35:46 (GMT)
committer: GitHub <noreply@github.com> 2020-12-25 22:35:46 (GMT)
commit: 4d840e428ab1a2712f219c5e4008658cbe15892e (patch)
tree: ea6cb19f2f15bcf1d9273ae9535c5cbc4dfacb37 /Modules/_tkinter.c
parent: 0178a6b67ca3e782443f311e953509ca3eb4aacf (diff)
download: cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.zip
cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.tar.gz
cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.tar.bz2
1 files changed, 53 insertions, 1 deletions
diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c
index a1071e5..c1eb6e1 100644
--- a/Modules/_tkinter.c
+++ b/Modules/_tkinter.c
@@ -397,7 +397,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
 
     char *buf = NULL;
     PyErr_Clear();
-    /* Tcl encodes null character as \xc0\x80 */
+    /* Tcl encodes null character as \xc0\x80.
+       https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */
     if (memchr(s, '\xc0', size)) {
         char *q;
         const char *e = s + size;
@@ -421,6 +422,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
     if (buf != NULL) {
         PyMem_Free(buf);
     }
+    if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) {
+        return r;
+    }
+
+    /* In CESU-8 non-BMP characters are represented as a surrogate pair,
+       like in UTF-16, and then each surrogate code point is encoded in UTF-8.
+       https://en.wikipedia.org/wiki/CESU-8 */
+    Py_ssize_t len = PyUnicode_GET_LENGTH(r);
+    Py_ssize_t i, j;
+    /* All encoded surrogate characters start with \xED. */
+    i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1);
+    if (i == -2) {
+        Py_DECREF(r);
+        return NULL;
+    }
+    if (i == -1) {
+        return r;
+    }
+    Py_UCS4 *u = PyUnicode_AsUCS4Copy(r);
+    Py_DECREF(r);
+    if (u == NULL) {
+        return NULL;
+    }
+    Py_UCS4 ch;
+    for (j = i; i < len; i++, u[j++] = ch) {
+        Py_UCS4 ch1, ch2, ch3, high, low;
+        /* High surrogates U+D800 - U+DBFF are encoded as
+           \xED\xA0\x80 - \xED\xAF\xBF. */
+        ch1 = ch = u[i];
+        if (ch1 != 0xdcED) continue;
+        ch2 = u[i + 1];
+        if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue;
+        ch3 = u[i + 2];
+        if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
+        high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
+        assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+        /* Low surrogates U+DC00 - U+DFFF are encoded as
+           \xED\xB0\x80 - \xED\xBF\xBF. */
+        ch1 = u[i + 3];
+        if (ch1 != 0xdcED) continue;
+        ch2 = u[i + 4];
+        if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue;
+        ch3 = u[i + 5];
+        if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
+        low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
+        assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+        ch = Py_UNICODE_JOIN_SURROGATES(high, low);
+        i += 5;
+    }
+    r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j);
+    PyMem_Free(u);
     return r;
 }
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>	2020-12-25 22:35:46 (GMT)
committer	GitHub <noreply@github.com>	2020-12-25 22:35:46 (GMT)
commit	4d840e428ab1a2712f219c5e4008658cbe15892e (patch)
tree	ea6cb19f2f15bcf1d9273ae9535c5cbc4dfacb37 /Modules/_tkinter.c
parent	0178a6b67ca3e782443f311e953509ca3eb4aacf (diff)
download	cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.zip cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.tar.gz cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.tar.bz2