summaryrefslogtreecommitdiffstats
path: root/Modules
diff options
context:
space:
mode:
author Guido van Rossum <guido@python.org> 2000-05-04 15:07:16 (GMT)
committer Guido van Rossum <guido@python.org> 2000-05-04 15:07:16 (GMT)
commit 990f5c6c987359694dbf951cdd3ffdf2e10e64e4 (patch)
tree 61de5982921a636584378cc6a4a6c86fe1f9aea1 /Modules
parent cc229ea76fde7c2762ca6e23800decdc6fad4b01 (diff)
download cpython-990f5c6c987359694dbf951cdd3ffdf2e10e64e4.zip
cpython-990f5c6c987359694dbf951cdd3ffdf2e10e64e4.tar.gz
cpython-990f5c6c987359694dbf951cdd3ffdf2e10e64e4.tar.bz2
Two changes to improve (I hope) Unicode support.
1. In Tcl 8.2 and later, use Tcl_NewUnicodeObj() when passing a Python Unicode object rather than going through UTF-8. (This function doesn't exist in Tcl 8.1, so there the original UTF-8 code is still used; in Tcl 8.0 there is no support for Unicode.) This assumes that Tcl_UniChar is the same thing as Py_UNICODE; a run-time error is issued if this is not the case.

2. In Tcl 8.1 and later (i.e., whenever Tcl supports Unicode), when a string returned from Tcl contains bytes with the top bit set, we assume it is encoded in UTF-8, and decode it into a Unicode string object.

Notes:

- Passing Unicode strings to Tcl 8.0 does not do the right thing; this isn't worth fixing.

- When passing an 8-bit string to Tcl 8.1 or later that has bytes with the top bit set, Tcl tries to interpret it as UTF-8; it seems to fall back on Latin-1 for non-UTF-8 bytes. I'm not sure what to do about this besides telling the user to disambiguate such strings by converting them to Unicode (forcing the user to be explicit about the encoding).

- Obviously it won't be possible to get binary data out of Tk this way. Do we need that ability? How to do it?
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/_tkinter.c 33
1 files changed, 31 insertions, 2 deletions
diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c
index 15cc7e7..882715f 100644
--- a/Modules/_tkinter.c
+++ b/Modules/_tkinter.c
@@ -550,6 +550,8 @@ AsObj(value)
return result;
}
else if (PyUnicode_Check(value)) {
+#if TKMAJORMINOR <= 8001
+ /* In Tcl 8.1 we must use UTF-8 */
PyObject* utf8 = PyUnicode_AsUTF8String (value);
if (!utf8)
return 0;
@@ -557,6 +559,17 @@ AsObj(value)
PyString_GET_SIZE (utf8));
Py_DECREF(utf8);
return result;
+#else /* TKMAJORMINOR > 8001 */
+ /* In Tcl 8.2 and later, use Tcl_NewUnicodeObj() */
+ if (sizeof(Py_UNICODE) != sizeof(Tcl_UniChar)) {
+ /* XXX Should really test this at compile time */
+ PyErr_SetString(PyExc_SystemError,
+ "Py_UNICODE and Tcl_UniChar differ in size");
+ return 0;
+ }
+ return Tcl_NewUnicodeObj(PyUnicode_AS_UNICODE(value),
+ PyUnicode_GET_SIZE(value));
+#endif /* TKMAJORMINOR > 8001 */
}
else {
PyObject *v = PyObject_Str(value);
@@ -624,10 +637,26 @@ Tkapp_Call(self, args)
ENTER_OVERLAP
if (i == TCL_ERROR)
Tkinter_Error(self);
- else
+ else {
/* We could request the object result here, but doing
so would confuse applications that expect a string. */
- res = PyString_FromString(Tcl_GetStringResult(interp));
+ char *s = Tcl_GetStringResult(interp);
+ char *p = s;
+ /* If the result contains any bytes with the top bit set,
+ it's UTF-8 and we should decode it to Unicode */
+ while (*p != '\0') {
+ if (*p & 0x80)
+ break;
+ p++;
+ }
+ if (*p == '\0')
+ res = PyString_FromStringAndSize(s, (int)(p-s));
+ else {
+ /* Convert UTF-8 to Unicode string */
+ p = strchr(p, '\0');
+ res = PyUnicode_DecodeUTF8(s, (int)(p-s), "ignore");
+ }
+ }
LEAVE_OVERLAP_TCL