From 03c66864aa2ffa9871ce216b00cd661eaf1be688 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 29 Nov 2017 09:49:31 +0000 Subject: Fix [8e1e31eac0fd6b6c4452bc108a98ab08c6b64588|8e1e31eac0]: lsort treats NUL chars strangely --- generic/tclCmdIL.c | 4 +-- generic/tclCmdMZ.c | 2 +- generic/tclInt.h | 1 + generic/tclUtf.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 80 insertions(+), 6 deletions(-) diff --git a/generic/tclCmdIL.c b/generic/tclCmdIL.c index 47076ec..b41d312 100644 --- a/generic/tclCmdIL.c +++ b/generic/tclCmdIL.c @@ -2945,7 +2945,7 @@ Tcl_LsearchObjCmd( double patDouble, objDouble; SortInfo sortInfo; Tcl_Obj *patObj, **listv, *listPtr, *startPtr, *itemPtr; - SortStrCmpFn_t strCmpFn = strcmp; + SortStrCmpFn_t strCmpFn = TclUtfCmp; Tcl_RegExp regexp = NULL; static const char *const options[] = { "-all", "-ascii", "-bisect", "-decreasing", "-dictionary", @@ -4263,7 +4263,7 @@ SortCompare( int order = 0; if (infoPtr->sortMode == SORTMODE_ASCII) { - order = strcmp(elemPtr1->collationKey.strValuePtr, + order = TclUtfCmp(elemPtr1->collationKey.strValuePtr, elemPtr2->collationKey.strValuePtr); } else if (infoPtr->sortMode == SORTMODE_ASCII_NC) { order = TclUtfCasecmp(elemPtr1->collationKey.strValuePtr, diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index ad1dd5f..a206cc5 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -3547,7 +3547,7 @@ TclNRSwitchObjCmd( OPT_LAST }; typedef int (*strCmpFn_t)(const char *, const char *); - strCmpFn_t strCmpFn = strcmp; + strCmpFn_t strCmpFn = TclUtfCmp; mode = OPT_EXACT; foundmode = 0; diff --git a/generic/tclInt.h b/generic/tclInt.h index d77889e..ad1d9c6 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3219,6 +3219,7 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); +MODULE_SCOPE int TclUtfCmp(const char *cs, const char 
*ct); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCount(int ch); MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index aed332f..aff10c1 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1108,6 +1108,15 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif if (ch1 != ch2) { return (ch1 - ch2); } @@ -1140,6 +1149,7 @@ Tcl_UtfNcasecmp( unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1 = 0, ch2 = 0; + while (numChars-- > 0) { /* * n must be interpreted as chars, not bytes. @@ -1148,6 +1158,15 @@ Tcl_UtfNcasecmp( */ cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); @@ -1158,11 +1177,56 @@ Tcl_UtfNcasecmp( } return 0; } + +/* + *---------------------------------------------------------------------- + * + * TclUtfCmp -- + * + * Compare UTF chars of string cs to string ct case sensitively. + * Replacement for strcmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. 
*/ +{ + Tcl_UniChar ch1 = 0, ch2 = 0; + + while (*cs && *ct) { + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif + if (ch1 != ch2) { + return ch1 - ch2; + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + /* *---------------------------------------------------------------------- * - * Tcl_UtfNcasecmp -- + * TclUtfCasecmp -- * * Compare UTF chars of string cs to string ct case insensitively. * Replacement for strcasecmp in Tcl core, in places where UTF-8 should @@ -1182,11 +1246,20 @@ TclUtfCasecmp( const char *cs, /* UTF string to compare to ct. */ const char *ct) /* UTF string cs is compared to. */ { - while (*cs && *ct) { - Tcl_UniChar ch1 = 0, ch2 = 0; + Tcl_UniChar ch1 = 0, ch2 = 0; + while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); +#if TCL_UTF_MAX == 4 + /* map high surrogate characters to values > 0xffff */ + if ((ch1 & 0xFC00) == 0xD800) { + ch1 += 0x4000; + } + if ((ch2 & 0xFC00) == 0xD800) { + ch2 += 0x4000; + } +#endif if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); -- cgit v0.12