summaryrefslogtreecommitdiffstats
path: root/Modules/_tkinter.c
diff options
context:
space:
mode:
Diffstat (limited to 'Modules/_tkinter.c')
-rw-r--r--Modules/_tkinter.c54
1 files changed, 53 insertions, 1 deletions
diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c
index 793c5e7..b30141d 100644
--- a/Modules/_tkinter.c
+++ b/Modules/_tkinter.c
@@ -395,7 +395,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
char *buf = NULL;
PyErr_Clear();
- /* Tcl encodes null character as \xc0\x80 */
+ /* Tcl encodes null character as \xc0\x80.
+ https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */
if (memchr(s, '\xc0', size)) {
char *q;
const char *e = s + size;
@@ -419,6 +420,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
if (buf != NULL) {
PyMem_Free(buf);
}
+ if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) {
+ return r;
+ }
+
+ /* In CESU-8 non-BMP characters are represented as a surrogate pair,
+ like in UTF-16, and then each surrogate code point is encoded in UTF-8.
+ https://en.wikipedia.org/wiki/CESU-8 */
+ Py_ssize_t len = PyUnicode_GET_LENGTH(r);
+ Py_ssize_t i, j;
+ /* All encoded surrogate characters start with \xED. */
+ i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1);
+ if (i == -2) {
+ Py_DECREF(r);
+ return NULL;
+ }
+ if (i == -1) {
+ return r;
+ }
+ Py_UCS4 *u = PyUnicode_AsUCS4Copy(r);
+ Py_DECREF(r);
+ if (u == NULL) {
+ return NULL;
+ }
+ Py_UCS4 ch;
+ for (j = i; i < len; i++, u[j++] = ch) {
+ Py_UCS4 ch1, ch2, ch3, high, low;
+ /* Low surrogates U+D800 - U+DBFF are encoded as
+ \xED\xA0\x80 - \xED\xAF\xBF. */
+ ch1 = ch = u[i];
+ if (ch1 != 0xdcED) continue;
+ ch2 = u[i + 1];
+ if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue;
+ ch3 = u[i + 2];
+ if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
+ high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
+ assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+ /* High surrogates U+DC00 - U+DFFF are encoded as
+ \xED\xB0\x80 - \xED\xBF\xBF. */
+ ch1 = u[i + 3];
+ if (ch1 != 0xdcED) continue;
+ ch2 = u[i + 4];
+ if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue;
+ ch3 = u[i + 5];
+ if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
+ low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
+ assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+ ch = Py_UNICODE_JOIN_SURROGATES(high, low);
+ i += 5;
+ }
+ r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j);
+ PyMem_Free(u);
return r;
}