diff options
author | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2020-12-25 22:35:46 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-12-25 22:35:46 (GMT) |
commit | 4d840e428ab1a2712f219c5e4008658cbe15892e (patch) | |
tree | ea6cb19f2f15bcf1d9273ae9535c5cbc4dfacb37 /Modules/_tkinter.c | |
parent | 0178a6b67ca3e782443f311e953509ca3eb4aacf (diff) | |
download | cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.zip cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.tar.gz cpython-4d840e428ab1a2712f219c5e4008658cbe15892e.tar.bz2 |
[3.8] bpo-42318: Fix support of non-BMP characters in Tkinter on macOS (GH-23281). (GH-23784) (GH-23787)
(cherry picked from commit a26215db11cfcf7b5f55cab9e91396761a0e0bcf)
(cherry picked from commit 28bf6ab61f77c69b732a211c398ac882bf3f65f4)
Diffstat (limited to 'Modules/_tkinter.c')
-rw-r--r-- | Modules/_tkinter.c | 54 |
1 files changed, 53 insertions, 1 deletions
diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c index a1071e5..c1eb6e1 100644 --- a/Modules/_tkinter.c +++ b/Modules/_tkinter.c @@ -397,7 +397,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size) char *buf = NULL; PyErr_Clear(); - /* Tcl encodes null character as \xc0\x80 */ + /* Tcl encodes null character as \xc0\x80. + https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */ if (memchr(s, '\xc0', size)) { char *q; const char *e = s + size; @@ -421,6 +422,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size) if (buf != NULL) { PyMem_Free(buf); } + if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) { + return r; + } + + /* In CESU-8 non-BMP characters are represented as a surrogate pair, + like in UTF-16, and then each surrogate code point is encoded in UTF-8. + https://en.wikipedia.org/wiki/CESU-8 */ + Py_ssize_t len = PyUnicode_GET_LENGTH(r); + Py_ssize_t i, j; + /* All encoded surrogate characters start with \xED. */ + i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1); + if (i == -2) { + Py_DECREF(r); + return NULL; + } + if (i == -1) { + return r; + } + Py_UCS4 *u = PyUnicode_AsUCS4Copy(r); + Py_DECREF(r); + if (u == NULL) { + return NULL; + } + Py_UCS4 ch; + for (j = i; i < len; i++, u[j++] = ch) { + Py_UCS4 ch1, ch2, ch3, high, low; + /* Low surrogates U+D800 - U+DBFF are encoded as + \xED\xA0\x80 - \xED\xAF\xBF. */ + ch1 = ch = u[i]; + if (ch1 != 0xdcED) continue; + ch2 = u[i + 1]; + if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue; + ch3 = u[i + 2]; + if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue; + high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + /* High surrogates U+DC00 - U+DFFF are encoded as + \xED\xB0\x80 - \xED\xBF\xBF. */ + ch1 = u[i + 3]; + if (ch1 != 0xdcED) continue; + ch2 = u[i + 4]; + if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue; + ch3 = u[i + 5]; + if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue; + low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + ch = Py_UNICODE_JOIN_SURROGATES(high, low); + i += 5; + } + r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j); + PyMem_Free(u); return r; } |