From 7759441602f14b3772186ea0d87f61ffdafc8404 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 4 May 2020 12:31:51 +0000 Subject: New internal function TclGetUCS4() only available when TCL_UTF_MAX=4. This fixes all "knownBug" testcases related to tip389. --- generic/tclCmdMZ.c | 2 +- generic/tclExecute.c | 12 ++------ generic/tclInt.h | 7 ++++- generic/tclStringObj.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++-- generic/tclUtf.c | 6 +++- tests/utf.test | 14 ++++----- 6 files changed, 98 insertions(+), 21 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 011164b..7516208 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1401,7 +1401,7 @@ StringIndexCmd( } if ((index >= 0) && (index < length)) { - Tcl_UniChar ch = Tcl_GetUniChar(objv[1], index); + int ch = TclGetUCS4(objv[1], index); /* * If we have a ByteArray object, we're careful to generate a new diff --git a/generic/tclExecute.c b/generic/tclExecute.c index f38e752..eeb69de 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5571,16 +5571,10 @@ TEBCresume( objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { - char buf[4] = ""; - Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index); + char buf[8] = ""; + int ch = TclGetUCS4(valuePtr, index); - /* - * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1) - * but creating the object as a string seems to be faster in - * practical use. - */ - - length = Tcl_UniCharToUtf(ch, buf); + length = TclUCS4ToUtf(ch, buf); objResultPtr = Tcl_NewStringObj(buf, length); } diff --git a/generic/tclInt.h b/generic/tclInt.h index 6f024a6..8983659 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3183,8 +3183,13 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes, MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); -MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); +MODULE_SCOPE int TclUtfToUCS4(const char *, int *); MODULE_SCOPE int TclUCS4ToUtf(int, char *); +#if TCL_UTF_MAX == 4 + MODULE_SCOPE int TclGetUCS4(Tcl_Obj *, int); +#else + #define TclGetUCS4 Tcl_GetUniChar +#endif /* * Bytes F0-F4 are start-bytes for 4-byte sequences. diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 3ce8281..656d6ce 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -519,10 +519,10 @@ TclCheckEmptyString ( /* *---------------------------------------------------------------------- * - * Tcl_GetUniChar -- + * Tcl_GetUniChar/TclGetUCS4 -- * * Get the index'th Unicode character from the String object. If index - * is out of range, the result = 0xFFFD; + * is out of range, the result = 0xFFFD (Tcl_GetUniChar) resp. -1 (TclGetUCS4) * * Results: * Returns the index'th Unicode character in the Object. @@ -587,6 +587,80 @@ Tcl_GetUniChar( } return stringPtr->unicode[index]; } + +#if TCL_UTF_MAX == 4 +int +TclGetUCS4( + Tcl_Obj *objPtr, /* The object to get the Unicode charater + * from. */ + int index) /* Get the index'th Unicode character. */ +{ + String *stringPtr; + int ch, length; + + if (index < 0) { + return -1; + } + + /* + * Optimize the case where we're really dealing with a bytearray object + * we don't need to convert to a string to perform the indexing operation. + */ + + if (TclIsPureByteArray(objPtr)) { + unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length); + if (index >= length) { + return -1; + } + + return (int) bytes[index]; + } + + /* + * OK, need to work with the object as a string. + */ + + SetStringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + + if (stringPtr->hasUnicode == 0) { + /* + * If numChars is unknown, compute it. + */ + + if (stringPtr->numChars == -1) { + TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + } + if (stringPtr->numChars == objPtr->length) { + return (Tcl_UniChar) objPtr->bytes[index]; + } + FillUnicodeRep(objPtr); + stringPtr = GET_STRING(objPtr); + } + + if (index >= stringPtr->numChars) { + return -1; + } + ch = stringPtr->unicode[index]; +#if TCL_UTF_MAX <= 4 + /* See: bug [11ae2be95dac9417] */ + if ((ch & 0xF800) == 0xD800) { + if (ch & 0x400) { + if ((index > 0) + && ((stringPtr->unicode[index-1] & 0xFC00) == 0xD800)) { + ch = -1; /* low surrogate preceded by high surrogate */ + } + } else if ((++index < stringPtr->numChars) + && ((stringPtr->unicode[index] & 0xFC00) == 0xDC00)) { + /* high surrogate followed by low surrogate */ + ch = (((ch & 0x3FF) << 10) | + (stringPtr->unicode[index] & 0x3FF)) + 0x10000; + } + } +#endif + return ch; +} +#endif /* *---------------------------------------------------------------------- diff --git a/generic/tclUtf.c b/generic/tclUtf.c index ab3c577..9714204 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2383,7 +2383,8 @@ TclUtfToUCS4( * * Results: * The return values is the number of bytes in the buffer that were - * consumed. + * consumed. If ch == -1, this function outputs 0 bytes (empty string), + * since TclGetUCS4 returns -1 for out-of-range indices. * * Side effects: * None. @@ -2414,6 +2415,9 @@ TclUCS4ToUtf( buf[0] = (char) ((ch >> 12) | 0xE0); return 3; } + if (ch == -1) { + return 0; + } return Tcl_UniCharToUtf(ch, buf); } diff --git a/tests/utf.test b/tests/utf.test index a3c049d..fd8231d 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -983,14 +983,14 @@ test utf-8.3 {Tcl_UniCharAtIndex: index > 0} { } c test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { string index \u4E4E\u25A\xFF\u543 2 -} \uFF +} \xFF test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 { string index \uD842 0 } \uD842 -test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} {ucs4 knownBug} { +test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} {ucs4} { string index \uD842 0 } \uD842 -test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {tip389 knownBug} { +test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} {tip389} { string index \uD842 0 } \uD842 test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { @@ -1002,7 +1002,7 @@ test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 0 } \U1F600 -test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {tip389 knownBug} { +test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} {tip389} { string index \uD83D\uDE00G 0 } \U1F600 test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { @@ -1011,7 +1011,7 @@ test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { string index \uD83D\uDE00G 1 } G -test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} {tip389 knownBug} { +test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} {tip389} { string index \uD83D\uDE00G 1 } {} test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { @@ -1029,7 +1029,7 @@ test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { string index \U1F600G 0 } \U1F600 -test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389 knownBug} { +test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { string index \U1F600G 0 } \U1F600 test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { @@ -1038,7 +1038,7 @@ test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs4} { string index \U1F600G 1 } G -test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389 knownBug} { +test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} {Uesc tip389} { string index \U1F600G 1 } {} test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} {Uesc ucs2} { -- cgit v0.12