diff options
-rw-r--r-- | generic/tclCmdMZ.c | 34 | ||||
-rw-r--r-- | generic/tclInt.h | 1 | ||||
-rw-r--r-- | generic/tclUtf.c | 55 |
3 files changed, 61 insertions, 29 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d344678..23370a8 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1081,23 +1081,10 @@ Tcl_SplitObjCmd( Tcl_InitHashTable(&charReuseTable, TCL_ONE_WORD_KEYS); for ( ; stringPtr < end; stringPtr += len) { - int fullchar; - len = TclUtfToUniChar(stringPtr, &ch); - fullchar = ch; - -#if TCL_UTF_MAX == 4 - if ((ch >= 0xD800) && (len < 3)) { - len += TclUtfToUniChar(stringPtr + len, &ch); - fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000; - } -#endif + int ucs4; - /* - * Assume Tcl_UniChar is an integral type... - */ - - hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(fullchar), - &isNew); + len = TclUtfToUCS4(stringPtr, &ucs4); + hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(ucs4), &isNew); if (isNew) { TclNewStringObj(objPtr, stringPtr, len); @@ -1466,7 +1453,6 @@ StringIsCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { const char *string1, *end, *stop; - Tcl_UniChar ch = 0; int (*chcomp)(int) = NULL; /* The UniChar comparison function. 
*/ int i, failat = 0, result = 1, strict = 0, index, length1, length2; Tcl_Obj *objPtr, *failVarObj = NULL; @@ -1797,16 +1783,10 @@ StringIsCmd( } end = string1 + length1; for (; string1 < end; string1 += length2, failat++) { - int fullchar; - length2 = TclUtfToUniChar(string1, &ch); - fullchar = ch; -#if TCL_UTF_MAX == 4 - if ((ch >= 0xD800) && (length2 < 3)) { - length2 += TclUtfToUniChar(string1 + length2, &ch); - fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000; - } -#endif - if (!chcomp(fullchar)) { + int ucs4; + + length2 = TclUtfToUCS4(string1, &ucs4); + if (!chcomp(ucs4)) { result = 0; break; } diff --git a/generic/tclInt.h b/generic/tclInt.h index c30a257..74b2cc9 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3186,6 +3186,7 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes, MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); +MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); MODULE_SCOPE int TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr, diff --git a/generic/tclUtf.c b/generic/tclUtf.c index c58f5a9..0db06bd 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -278,8 +278,8 @@ Tcl_UniCharToUtfDString( * If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done: * For any UTF-8 string containing a character outside of the BMP, the * first call to this function will fill *chPtr with the high surrogate - * and generate a return value of 0. Calling Tcl_UtfToUniChar again - * will produce the low surrogate and a return value of 4. Because *chPtr + * and generate a return value of 1. Calling Tcl_UtfToUniChar again + * will produce the low surrogate and a return value of 3. 
Because *chPtr
  * is used to remember whether the high surrogate is already produced, it
  * is recommended to initialize the variable it points to as 0 before
  * the first call to Tcl_UtfToUniChar is done.
@@ -2156,6 +2156,57 @@ TclUniCharMatch(
 }
 
 /*
+ *---------------------------------------------------------------------------
+ *
+ * TclUtfToUCS4 --
+ *
+ *	Extract the 4-byte codepoint from the leading bytes of the
+ *	Modified UTF-8 string "src". This is a utility routine to
+ *	contain the surrogate gymnastics in one place.
+ *
+ *	The caller must ensure that the source buffer is long enough that this
+ *	routine does not run off the end and dereference non-existent memory
+ *	looking for trail bytes. If the source buffer is known to be '\0'
+ *	terminated, this cannot happen. Otherwise, the caller should call
+ *	Tcl_UtfCharComplete() before calling this routine to ensure that
+ *	enough bytes remain in the string.
+ *
+ * Results:
+ *	*ucs4Ptr is filled with the UCS4 code point, and the return value is
+ *	the number of bytes from the UTF-8 string that were consumed.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUtfToUCS4(
+    const char *src,	/* The UTF-8 string. */
+    int *ucs4Ptr)	/* Filled with the UCS4 codepoint represented
+			 * by the UTF-8 string. */
+{
+    int len, fullchar;
+    Tcl_UniChar ch = 0;
+
+    len = TclUtfToUniChar(src, &ch);
+    fullchar = ch;
+
+#if TCL_UTF_MAX == 4
+    /* 4-byte UTF-8 is supported; combine the surrogate pair */
+
+    if ((ch >= 0xD800) && (len < 3)) {
+	len += Tcl_UtfToUniChar(src + len, &ch);
+	fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
+    }
+#endif
+
+    *ucs4Ptr = fullchar;
+    return len;
+}
+
+/*
  * Local Variables:
  * mode: c
  * c-basic-offset: 4