diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-04 12:31:51 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-04 12:31:51 (GMT) |
commit | 7759441602f14b3772186ea0d87f61ffdafc8404 (patch) | |
tree | c8aac29d738caa3756201b63f7a4b9a81f27ad47 /generic | |
parent | 66197cab8b6c43b474a6dceae32fc95f4eed37b9 (diff) | |
download | tcl-7759441602f14b3772186ea0d87f61ffdafc8404.zip tcl-7759441602f14b3772186ea0d87f61ffdafc8404.tar.gz tcl-7759441602f14b3772186ea0d87f61ffdafc8404.tar.bz2 |
New internal function TclGetUCS4() only available when TCL_UTF_MAX=4. This fixes all "knownBug" testcases related to tip389.
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tclCmdMZ.c | 2 | ||||
-rw-r--r-- | generic/tclExecute.c | 12 | ||||
-rw-r--r-- | generic/tclInt.h | 7 | ||||
-rw-r--r-- | generic/tclStringObj.c | 78 | ||||
-rw-r--r-- | generic/tclUtf.c | 6 |
5 files changed, 91 insertions, 14 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 011164b..7516208 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1401,7 +1401,7 @@ StringIndexCmd( } if ((index >= 0) && (index < length)) { - Tcl_UniChar ch = Tcl_GetUniChar(objv[1], index); + int ch = TclGetUCS4(objv[1], index); /* * If we have a ByteArray object, we're careful to generate a new diff --git a/generic/tclExecute.c b/generic/tclExecute.c index f38e752..eeb69de 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5571,16 +5571,10 @@ TEBCresume( objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { - char buf[4] = ""; - Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index); + char buf[8] = ""; + int ch = TclGetUCS4(valuePtr, index); - /* - * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1) - * but creating the object as a string seems to be faster in - * practical use. - */ - - length = Tcl_UniCharToUtf(ch, buf); + length = TclUCS4ToUtf(ch, buf); objResultPtr = Tcl_NewStringObj(buf, length); } diff --git a/generic/tclInt.h b/generic/tclInt.h index 6f024a6..8983659 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3183,8 +3183,13 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes, MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); -MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); +MODULE_SCOPE int TclUtfToUCS4(const char *, int *); MODULE_SCOPE int TclUCS4ToUtf(int, char *); +#if TCL_UTF_MAX == 4 + MODULE_SCOPE int TclGetUCS4(Tcl_Obj *, int); +#else + #define TclGetUCS4 Tcl_GetUniChar +#endif /* * Bytes F0-F4 are start-bytes for 4-byte sequences. diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 3ce8281..656d6ce 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -519,10 +519,10 @@ TclCheckEmptyString ( /* *---------------------------------------------------------------------- * - * Tcl_GetUniChar -- + * Tcl_GetUniChar/TclGetUCS4 -- * * Get the index'th Unicode character from the String object. If index - * is out of range, the result = 0xFFFD; + * is out of range, the result = 0xFFFD (Tcl_GetUniChar) resp. -1 (TclGetUCS4) * * Results: * Returns the index'th Unicode character in the Object. @@ -587,6 +587,80 @@ Tcl_GetUniChar( } return stringPtr->unicode[index]; } + +#if TCL_UTF_MAX == 4 +int +TclGetUCS4( + Tcl_Obj *objPtr, /* The object to get the Unicode charater + * from. */ + int index) /* Get the index'th Unicode character. */ +{ + String *stringPtr; + int ch, length; + + if (index < 0) { + return -1; + } + + /* + * Optimize the case where we're really dealing with a bytearray object + * we don't need to convert to a string to perform the indexing operation. + */ + + if (TclIsPureByteArray(objPtr)) { + unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length); + if (index >= length) { + return -1; + } + + return (int) bytes[index]; + } + + /* + * OK, need to work with the object as a string. + */ + + SetStringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + + if (stringPtr->hasUnicode == 0) { + /* + * If numChars is unknown, compute it. + */ + + if (stringPtr->numChars == -1) { + TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + } + if (stringPtr->numChars == objPtr->length) { + return (Tcl_UniChar) objPtr->bytes[index]; + } + FillUnicodeRep(objPtr); + stringPtr = GET_STRING(objPtr); + } + + if (index >= stringPtr->numChars) { + return -1; + } + ch = stringPtr->unicode[index]; +#if TCL_UTF_MAX <= 4 + /* See: bug [11ae2be95dac9417] */ + if ((ch & 0xF800) == 0xD800) { + if (ch & 0x400) { + if ((index > 0) + && ((stringPtr->unicode[index-1] & 0xFC00) == 0xD800)) { + ch = -1; /* low surrogate preceded by high surrogate */ + } + } else if ((++index < stringPtr->numChars) + && ((stringPtr->unicode[index] & 0xFC00) == 0xDC00)) { + /* high surrogate followed by low surrogate */ + ch = (((ch & 0x3FF) << 10) | + (stringPtr->unicode[index] & 0x3FF)) + 0x10000; + } + } +#endif + return ch; +} +#endif /* *---------------------------------------------------------------------- diff --git a/generic/tclUtf.c b/generic/tclUtf.c index ab3c577..9714204 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2383,7 +2383,8 @@ TclUtfToUCS4( * * Results: * The return values is the number of bytes in the buffer that were - * consumed. + * consumed. If ch == -1, this function outputs 0 bytes (empty string), + * since TclGetUCS4 returns -1 for out-of-range indices. * * Side effects: * None. @@ -2414,6 +2415,9 @@ TclUCS4ToUtf( buf[0] = (char) ((ch >> 12) | 0xE0); return 3; } + if (ch == -1) { + return 0; + } return Tcl_UniCharToUtf(ch, buf); } |