From 48cd8747a366cb72f8622bb65e6579499c6bf5a3 Mon Sep 17 00:00:00 2001 From: culler Date: Mon, 18 Nov 2019 00:31:48 +0000 Subject: Implement non-BMP unicode for macOS when TCL_UTF_MAX = 3 by encoding surrogates as 3-byte UTF-8-ish sequences. --- macosx/tkMacOSXClipboard.c | 35 ++++------ macosx/tkMacOSXFont.c | 156 +++++++++++++++++++++++++++++++++++++-------- macosx/tkMacOSXKeyEvent.c | 52 ++++++--------- macosx/tkMacOSXPrivate.h | 4 ++ 4 files changed, 166 insertions(+), 81 deletions(-) diff --git a/macosx/tkMacOSXClipboard.c b/macosx/tkMacOSXClipboard.c index 696b70e..a5f0ba1 100644 --- a/macosx/tkMacOSXClipboard.c +++ b/macosx/tkMacOSXClipboard.c @@ -35,10 +35,7 @@ static Tk_Window clipboardOwner = NULL; targetPtr->type == dispPtr->utf8Atom) { for (TkClipboardBuffer *cbPtr = targetPtr->firstBufferPtr; cbPtr; cbPtr = cbPtr->nextPtr) { - NSString *s = [[NSString alloc] initWithBytesNoCopy: - cbPtr->buffer length:cbPtr->length - encoding:NSUTF8StringEncoding freeWhenDone:NO]; - + NSString *s = TclUniToNSString(cbPtr->buffer, cbPtr->length); [string appendString:s]; [s release]; } @@ -126,11 +123,11 @@ TkSelGetSelection( int haveExternalClip = ([[NSPasteboard generalPasteboard] changeCount] != changeCount); + printf("TkSelGetSelection\n"); if (dispPtr && (haveExternalClip || dispPtr->clipboardActive) && selection == dispPtr->clipboardAtom && (target == XA_STRING || target == dispPtr->utf8Atom)) { NSString *string = nil; - NSString *clean; NSPasteboard *pb = [NSPasteboard generalPasteboard]; NSString *type = [pb availableTypeFromArray:[NSArray arrayWithObject: NSStringPboardType]]; @@ -139,25 +136,19 @@ TkSelGetSelection( string = [pb stringForType:type]; } if (string) { + int utfSize; + char *tclUni = NSStringToTclUni(string, &utfSize); + /* - * Replace all non-BMP characters by the replacement character 0xfffd. - * This is a workaround until Tcl supports TCL_UTF_MAX > 3. + * Re-encode the string using the encoding which is used in Tcl + * when TCL_UTF_MAX = 3. 
This replaces each UTF-16 surrogate with + * a 3-byte sequence generated using the UTF-8 algorithm. (Even + * though UTF-8 does not allow encoding surrogates, the algorithm + * does produce a 3-byte sequence.) */ - int i, j, len = [string length]; - CFRange all = CFRangeMake(0, len); - UniChar *buffer = ckalloc(len*sizeof(UniChar)); - CFStringGetCharacters((CFStringRef) string, all, buffer); - for (i = 0, j = 0 ; j < len ; i++, j++) { - if (CFStringIsSurrogateHighCharacter(buffer[j])) { - buffer[i] = 0xfffd; - j++; - } else { - buffer[i] = buffer[j]; - } - } - clean = (NSString *)CFStringCreateWithCharacters(NULL, buffer, i); - ckfree(buffer); - result = proc(clientData, interp, [clean UTF8String]); + + result = proc(clientData, interp, tclUni); + ckfree(tclUni); } } else { Tcl_SetObjResult(interp, Tcl_ObjPrintf( diff --git a/macosx/tkMacOSXFont.c b/macosx/tkMacOSXFont.c index df7f770..8350908 100644 --- a/macosx/tkMacOSXFont.c +++ b/macosx/tkMacOSXFont.c @@ -101,6 +101,132 @@ static void DrawCharsInContext(Display *display, Drawable drawable, #pragma mark - #pragma mark Font Helpers: +/* + *--------------------------------------------------------------------------- + * + * NSStringFromTclUTF -- + * + * When Tcl is compiled with TCL_UTF_MAX = 3 (the default for 8.6) it cannot + * deal directly with UTF-8 encoded non-BMP characters, since their UTF-8 + * encoding requires 4 bytes. + * + * As a workaround, these versions of Tcl encode non-BMP characters as a string + * of length 6 in which the high and low UTF-16 surrogates have been encoded + * using the UTF-8 algorithm. The UTF-8 encoding does not allow encoding + * surrogates, so these 6-byte strings are not valid UTF-8, and hence Apple's + * NString class will refuse to instantiate an NSString from the 6-byte + * encoding. This function allows creating an NSString from a C-string which + * has been encoded using this scheme. + * + * Results: + * An NSString, which may be nil. + * + * Side effects: + * None. 
+ *--------------------------------------------------------------------------- + */ + +MODULE_SCOPE NSString* +TclUniToNSString( + const char *source, + int numBytes) +{ + NSString *string = [[NSString alloc] initWithBytesNoCopy:(void *)source + length:numBytes + encoding:NSUTF8StringEncoding + freeWhenDone:NO]; + if (!string) { + const unichar *characters = ckalloc(numBytes*sizeof(unichar)); + const char *in = source; + unichar *out = (unichar *) characters; + while (in < source + numBytes) { + in += Tcl_UtfToUniChar(in, out++); + } + string = [[NSString alloc] initWithCharacters:characters + length:(out - characters)]; + ckfree(characters); + } + return string; +} + +/* + *--------------------------------------------------------------------------- + * + * TclUniAtIndex -- + * + * Write a sequence of bytes up to length 6 which is an encoding of a UTF-16 + * character in an NSString. Also record the unicode code point of the character. + * This may be a non-BMP character constructed by reading two surrogates from + * the NSString. + * + * Results: + * Returns the number of bytes written. + * + * Side effects: + * Bytes are written to the address uni and the unicode code point is written + * to the integer at address code.
+ * + */ + +MODULE_SCOPE int +TclUniAtIndex( + NSString *string, + int index, + char *uni, + unsigned int *code) +{ + char *ptr = uni; + UniChar uniChar = [string characterAtIndex: index]; + if (CFStringIsSurrogateHighCharacter(uniChar)) { + UniChar lowChar = [string characterAtIndex: ++index]; + *code = CFStringGetLongCharacterForSurrogatePair( + uniChar, lowChar); + ptr += Tcl_UniCharToUtf(uniChar, ptr); + ptr += Tcl_UniCharToUtf(lowChar, ptr); + return ptr - uni; + } else { + *code = (int) uniChar; + [[string substringWithRange: NSMakeRange(index, 1)] + getCString: uni + maxLength: XMaxTransChars + encoding: NSUTF8StringEncoding]; + return strlen(uni); + } +} + +/* + *--------------------------------------------------------------------------- + * + * NSStringToTclUni -- + * + * Encodes the unicode string represented by an NSString object using the + * special internal Tcl encoding used when TCL_UTF_MAX = 3. This encoding + * is similar to UTF-8 except that non-BMP characters are encoded as two + * successive 3-byte sequences which are constructed from UTF-16 surrogates + * by applying the UTF-8 algorithm. Even though the UTF-8 encoding does not + * allow encoding surrogates, the algorithm does produce a well-defined + * 3-byte sequence. + * + */ + +MODULE_SCOPE char* +NSStringToTclUni( + NSString *string, + int *numBytes) +{ + unsigned int code; + int i, length = [string length]; + char *ptr, *result = ckalloc(6*length + 1); + for (i = 0, ptr = result; i < length; i++) { + ptr += TclUniAtIndex(string, i, ptr, &code); + if (code > 0xffff){ + i++; + } + } + *ptr = '\0'; + return result; +} + #define GetNSFontTraitsFromTkFontAttributes(faPtr) \ ((faPtr)->weight == TK_FW_BOLD ? NSBoldFontMask : NSUnboldFontMask) | \ ((faPtr)->slant == TK_FS_ITALIC ? 
NSItalicFontMask : NSUnitalicFontMask) @@ -844,8 +970,7 @@ TkpMeasureCharsInContext( if (maxLength > 32767) { maxLength = 32767; } - string = [[NSString alloc] initWithBytesNoCopy:(void*)source - length:numBytes encoding:NSUTF8StringEncoding freeWhenDone:NO]; + string = TclUniToNSString((const char *)source, numBytes); if (!string) { length = 0; fit = rangeLength; @@ -1124,33 +1249,10 @@ DrawCharsInContext( !TkMacOSXSetupDrawingContext(drawable, gc, 1, &drawingContext)) { return; } - string = [[NSString alloc] initWithBytesNoCopy:(void*)source - length:numBytes encoding:NSUTF8StringEncoding freeWhenDone:NO]; + string = TclUniToNSString((const char *)source, numBytes); if (!string) { - - /* - * The decoding might have failed because we got a fake UTF-8 byte - * array in which UTF-16 surrogates had been encoded using the UTF-8 - * algorithm, even though UTF-8 does not allow encoding surrogates. - * (When Tcl is compiled with TCL_UTF_MAX = 3 Tk uses this encoding - * internally.) We can attempt to decode the source using this - * encoding and see if Apple accepts the result as UTF-16. 
- */ - - const unichar *characters = ckalloc(numBytes*sizeof(unichar)); - const char *in = source; - unichar *out = (unichar *) characters; - while (in < source + numBytes) { - in += Tcl_UtfToUniChar(in, out++); - } - string = [[NSString alloc] initWithCharacters:characters - length:(out - characters)]; - ckfree(characters); - if (!string) { - return; - } + return; } - context = drawingContext.context; fg = TkMacOSXCreateCGColor(gc, gc->foreground); attributes = [fontPtr->nsAttributes mutableCopy]; diff --git a/macosx/tkMacOSXKeyEvent.c b/macosx/tkMacOSXKeyEvent.c index 677f77e..025cccb 100644 --- a/macosx/tkMacOSXKeyEvent.c +++ b/macosx/tkMacOSXKeyEvent.c @@ -14,7 +14,7 @@ */ #include "tkMacOSXPrivate.h" -#include "tkMacOSXEvent.h" +#include "tkMacOSXInt.h" #include "tkMacOSXConstants.h" /* @@ -331,42 +331,30 @@ static unsigned isFunctionKey(unsigned int code); } /* - * NSString represents a non-BMP character as a string of length 2 where - * the first character is the high surrogate and the second character is - * the low surrogate. We could record this in the XEvent by setting the - * keycode to the unicode code point and setting the trans_chars to the - * 4-byte UTF-8 string. However, that will not help as long as TCL_UTF_MAX - * is set to 3. Until that changes, we just replace non-BMP characters by - * the "replacement character" U+FFFD. + * Next we generate an XEvent for each unicode character in our string. + * + * NSString uses UTF-16 internally, which means that a non-BMP character is + * represented by a sequence of two 16-bit "surrogates". In principle we + * could record this in the XEvent by setting the keycode to the 32-bit + * unicode code point and setting the trans_chars string to the 4-byte + * UTF-8 string for the non-BMP character. However, that will not work + * when TCL_UTF_MAX is set to 3, as is the case for Tcl 8.6. 
A workaround + * used internally by Tcl 8.6 is to encode each surrogate as a 3-byte + * sequence using the UTF-8 algorithm (ignoring the fact that the UTF-8 + * encoding specification does not allow encoding UTF-16 surrogates). + * This gives a 6-byte encoding of the non-BMP character which we write into + * the trans_chars field of the XEvent. */ for (i = 0; i < len; i++) { - UniChar nextChar = [str characterAtIndex: i]; - if (CFStringIsSurrogateHighCharacter(nextChar)) { -#if 0 - UniChar lowChar = [str characterAtIndex: ++i]; - xEvent.xkey.keycode = CFStringGetLongCharacterForSurrogatePair( - nextChar, lowChar); - xEvent.xkey.nbytes = TkUniCharToUtf(xEvent.xkey.keycode, - &xEvent.xkey.trans_chars); -#else + xEvent.xkey.nbytes = TclUniAtIndex(str, i, xEvent.xkey.trans_chars, + &xEvent.xkey.keycode); + if (xEvent.xkey.keycode > 0xffff){ i++; - xEvent.xkey.keycode = 0xfffd; - strcpy(xEvent.xkey.trans_chars, "\xef\xbf\xbd"); - xEvent.xkey.nbytes = strlen(xEvent.xkey.trans_chars); -#endif - } else { - xEvent.xkey.keycode = (int) nextChar; - [[str substringWithRange: NSMakeRange(i,1)] - getCString: xEvent.xkey.trans_chars - maxLength: XMaxTransChars encoding: NSUTF8StringEncoding]; - xEvent.xkey.nbytes = strlen(xEvent.xkey.trans_chars); } - xEvent.xany.type = KeyPress; - releaseCode = (UInt16) nextChar; - Tk_QueueWindowEvent(&xEvent, TCL_QUEUE_TAIL); + xEvent.xany.type = KeyPress; + Tk_QueueWindowEvent(&xEvent, TCL_QUEUE_TAIL); } - releaseCode = (UInt16) [str characterAtIndex: 0]; } @@ -642,7 +630,7 @@ XGrabKeyboard( Time time) { keyboardGrabWinPtr = Tk_IdToWindow(display, grab_window); - TkWindow *captureWinPtr = (TkWindow *) TkMacOSXGetCapture(); + TkWindow *captureWinPtr = (TkWindow *) TkpGetCapture(); if (keyboardGrabWinPtr && captureWinPtr) { NSWindow *w = TkMacOSXDrawableWindow(grab_window); diff --git a/macosx/tkMacOSXPrivate.h b/macosx/tkMacOSXPrivate.h index 68bad41..a285bba 100644 --- a/macosx/tkMacOSXPrivate.h +++ b/macosx/tkMacOSXPrivate.h @@ -239,6 +239,10 
@@ MODULE_SCOPE int TkMacOSXServices_Init(Tcl_Interp *interp); MODULE_SCOPE int TkMacOSXRegisterServiceWidgetObjCmd(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); +MODULE_SCOPE int TclUniAtIndex(NSString *string, int index, char *uni, + unsigned int *code); +MODULE_SCOPE NSString* TclUniToNSString(const char *source, int numBytes); +MODULE_SCOPE char* NSStringToTclUni(NSString *string, int *numBytes); #pragma mark Private Objective-C Classes -- cgit v0.12 From 7035421b1ebd6767e98f5052a626955bee5d761a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 18 Nov 2019 08:14:47 +0000 Subject: remove printf("TkSelGetSelection\n"); --- macosx/tkMacOSXClipboard.c | 3 +-- macosx/tkMacOSXFont.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/macosx/tkMacOSXClipboard.c b/macosx/tkMacOSXClipboard.c index a5f0ba1..bdcccf9 100644 --- a/macosx/tkMacOSXClipboard.c +++ b/macosx/tkMacOSXClipboard.c @@ -123,7 +123,6 @@ TkSelGetSelection( int haveExternalClip = ([[NSPasteboard generalPasteboard] changeCount] != changeCount); - printf("TkSelGetSelection\n"); if (dispPtr && (haveExternalClip || dispPtr->clipboardActive) && selection == dispPtr->clipboardAtom && (target == XA_STRING || target == dispPtr->utf8Atom)) { @@ -138,7 +137,7 @@ TkSelGetSelection( if (string) { int utfSize; char *tclUni = NSStringToTclUni(string, &utfSize); - + /* * Re-encode the string using the encoding which is used in Tcl * when TCL_UTF_MAX = 3. This replaces each UTF-16 surrogate with diff --git a/macosx/tkMacOSXFont.c b/macosx/tkMacOSXFont.c index 8350908..aeb90f8 100644 --- a/macosx/tkMacOSXFont.c +++ b/macosx/tkMacOSXFont.c @@ -109,7 +109,7 @@ static void DrawCharsInContext(Display *display, Drawable drawable, * When Tcl is compiled with TCL_UTF_MAX = 3 (the default for 8.6) it cannot * deal directly with UTF-8 encoded non-BMP characters, since their UTF-8 * encoding requires 4 bytes. 
- * + * * As a workaround, these versions of Tcl encode non-BMP characters as a string * of length 6 in which the high and low UTF-16 surrogates have been encoded * using the UTF-8 algorithm. The UTF-8 encoding does not allow encoding @@ -117,7 +117,7 @@ static void DrawCharsInContext(Display *display, Drawable drawable, * NString class will refuse to instantiate an NSString from the 6-byte * encoding. This function allows creating an NSString from a C-string which * has been encoded using this scheme. - * + * * Results: * An NSString, which may be nil. * @@ -302,7 +302,7 @@ FindNSFont( size = [defaultFont pointSize]; } nsFont = [fm fontWithFamily:family traits:traits weight:weight size:size]; - + /* * A second bug in NSFontManager that Apple created for the Catalina OS * causes requests as above to sometimes return fonts with additional -- cgit v0.12 From 6e5b062650d937a3a1c7af3bf112fe61d54d6a78 Mon Sep 17 00:00:00 2001 From: culler Date: Mon, 18 Nov 2019 15:05:55 +0000 Subject: Edit comments, add tests for NULL pointers. --- macosx/tkMacOSXClipboard.c | 11 ++++++----- macosx/tkMacOSXFont.c | 42 +++++++++++++++++++++++++++++------------- macosx/tkMacOSXPrivate.h | 2 +- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/macosx/tkMacOSXClipboard.c b/macosx/tkMacOSXClipboard.c index bdcccf9..db45428 100644 --- a/macosx/tkMacOSXClipboard.c +++ b/macosx/tkMacOSXClipboard.c @@ -135,19 +135,20 @@ TkSelGetSelection( string = [pb stringForType:type]; } if (string) { - int utfSize; - char *tclUni = NSStringToTclUni(string, &utfSize); /* - * Re-encode the string using the encoding which is used in Tcl + * Encode the string using the encoding which is used in Tcl * when TCL_UTF_MAX = 3. This replaces each UTF-16 surrogate with * a 3-byte sequence generated using the UTF-8 algorithm. (Even * though UTF-8 does not allow encoding surrogates, the algorithm * does produce a 3-byte sequence.) 
*/ - result = proc(clientData, interp, tclUni); - ckfree(tclUni); + char *bytes = NSStringToTclUni(string, NULL); + result = proc(clientData, interp, bytes); + if (bytes) { + ckfree(bytes); + } } } else { Tcl_SetObjResult(interp, Tcl_ObjPrintf( diff --git a/macosx/tkMacOSXFont.c b/macosx/tkMacOSXFont.c index aeb90f8..9b2d11c 100644 --- a/macosx/tkMacOSXFont.c +++ b/macosx/tkMacOSXFont.c @@ -104,7 +104,7 @@ static void DrawCharsInContext(Display *display, Drawable drawable, /* *--------------------------------------------------------------------------- * - * NSStringFromTclUTF -- + * TclUniToNSString -- * * When Tcl is compiled with TCL_UTF_MAX = 3 (the default for 8.6) it cannot * deal directly with UTF-8 encoded non-BMP characters, since their UTF-8 @@ -163,8 +163,9 @@ TclUniToNSString( * Returns the number of bytes written. * * Side effects: - * Bytes are written to the address uni and the unicode code point is written - * to the integer at address code. + * Bytes are written to the char array referenced by the pointer uni and + * the unicode code point is written to the integer referenced by the + * pointer code. * */ @@ -199,14 +200,22 @@ TclUniAtIndex( * * NSStringToTclUni -- * - * Encodes the unicode string represented by an NSString object using the - * special internal Tcl encoding used when TCL_UTF_MAX = 3. This encoding + * Encodes the unicode string represented by an NSString object with the + * internal encoding that Tcl uses when TCL_UTF_MAX = 3. This encoding * is similar to UTF-8 except that non-BMP characters are encoded as two * successive 3-byte sequences which are constructed from UTF-16 surrogates * by applying the UTF-8 algorithm. Even though the UTF-8 encoding does not * allow encoding surrogates, the algorithm does produce a well-defined * 3-byte sequence. * + * Results: + * Returns a pointer to a null-terminated byte array which encodes the + * NSString. 
+ * + * Side effects: + * Memory is allocated to hold the byte array, which must be freed with + * ckfree. If the pointer numBytes is not NULL the number of non-null + * bytes written to the array is stored in the integer it references. */ MODULE_SCOPE char* @@ -215,16 +224,23 @@ NSStringToTclUni( int *numBytes) { unsigned int code; - int i, length = [string length]; - char *ptr, *result = ckalloc(6*length + 1); - for (i = 0, ptr = result; i < length; i++) { - ptr += TclUniAtIndex(string, i, ptr, &code); - if (code > 0xffff){ - i++; + int i; + char *ptr, *bytes = ckalloc(6*[string length] + 1); + + ptr = bytes; + if (ptr) { + for (i = 0; i < [string length]; i++) { + ptr += TclUniAtIndex(string, i, ptr, &code); + if (code > 0xffff){ + i++; + } } + *ptr = '\0'; + } + if (numBytes) { + *numBytes = ptr - bytes; } - *ptr = '\0'; - return result; + return bytes; } #define GetNSFontTraitsFromTkFontAttributes(faPtr) \ diff --git a/macosx/tkMacOSXPrivate.h b/macosx/tkMacOSXPrivate.h index a285bba..9417b62 100644 --- a/macosx/tkMacOSXPrivate.h +++ b/macosx/tkMacOSXPrivate.h @@ -239,9 +239,9 @@ MODULE_SCOPE int TkMacOSXServices_Init(Tcl_Interp *interp); MODULE_SCOPE int TkMacOSXRegisterServiceWidgetObjCmd(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); +MODULE_SCOPE NSString* TclUniToNSString(const char *source, int numBytes); MODULE_SCOPE int TclUniAtIndex(NSString *string, int index, char *uni, unsigned int *code); -MODULE_SCOPE NSString* TclUniToNSString(const char *source, int numBytes); MODULE_SCOPE char* NSStringToTclUni(NSString *string, int *numBytes); #pragma mark Private Objective-C Classes -- cgit v0.12