From c47ba70f4545f7c961160a6beb0a466e3bfd5532 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Wed, 29 Nov 2017 18:35:51 +0000 Subject: Add TCL_CREATE_NS_IF_UNKNOWN back into Tcl_CreateEnsemble(). --- generic/tclEnsemble.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/generic/tclEnsemble.c b/generic/tclEnsemble.c index c5ccd22..a981851 100644 --- a/generic/tclEnsemble.c +++ b/generic/tclEnsemble.c @@ -732,7 +732,6 @@ Tcl_CreateEnsemble( Tcl_Namespace *namespacePtr, int flags) { - Tcl_Obj *nameObj = NULL; Namespace *nsPtr = (Namespace *)namespacePtr, *foundNsPtr, *altNsPtr, *actualNsPtr; const char * simpleName; @@ -741,11 +740,8 @@ Tcl_CreateEnsemble( nsPtr = (Namespace *) TclGetCurrentNamespace(interp); } - TclGetNamespaceForQualName(interp, name, nsPtr, 0, + TclGetNamespaceForQualName(interp, name, nsPtr, TCL_CREATE_NS_IF_UNKNOWN, &foundNsPtr, &altNsPtr, &actualNsPtr, &simpleName); - if (nameObj != NULL) { - TclDecrRefCount(nameObj); - } return TclCreateEnsembleInNs(interp, simpleName, (Tcl_Namespace *) foundNsPtr, (Tcl_Namespace *) nsPtr, flags); } -- cgit v0.12 From 72998869a3be3534fec99499faabe2d1557d6bcb Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 30 Nov 2017 13:14:43 +0000 Subject: Fix [8e1e31eac0fd6b6c4452bc108a98ab08c6b64588|8e1e31eac0]: lsort treats NUL chars strangely. Also fix various initializations, which only make a difference when TCL_UTF_MAX == 4. Add new test-cases which demonstrate the fix. For TCL_UTF_MAX == 4, surrogates will now be handled as expected as well when sorting. --- doc/UniCharIsAlpha.3 | 2 +- generic/tclCmdIL.c | 4 +- generic/tclCmdMZ.c | 2 +- generic/tclInt.h | 3 +- generic/tclScan.c | 2 +- generic/tclStringObj.c | 10 ++--- generic/tclUtf.c | 103 +++++++++++++++++++++++++++++++++++-------------- generic/tclUtil.c | 10 ++--- tests/cmdIL.test | 13 +++++++ win/tclWinSerial.c | 2 +- 10 files changed, 105 insertions(+), 46 deletions(-) diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3 index 5ba3fc9..61490ed 100644 --- a/doc/UniCharIsAlpha.3 +++ b/doc/UniCharIsAlpha.3 @@ -48,7 +48,7 @@ int .SH ARGUMENTS .AS int ch .AP int ch in -The Tcl_UniChar to be examined. +The Unicode character to be examined. .BE .SH DESCRIPTION diff --git a/generic/tclCmdIL.c b/generic/tclCmdIL.c index 47076ec..b41d312 100644 --- a/generic/tclCmdIL.c +++ b/generic/tclCmdIL.c @@ -2945,7 +2945,7 @@ Tcl_LsearchObjCmd( double patDouble, objDouble; SortInfo sortInfo; Tcl_Obj *patObj, **listv, *listPtr, *startPtr, *itemPtr; - SortStrCmpFn_t strCmpFn = strcmp; + SortStrCmpFn_t strCmpFn = TclUtfCmp; Tcl_RegExp regexp = NULL; static const char *const options[] = { "-all", "-ascii", "-bisect", "-decreasing", "-dictionary", @@ -4263,7 +4263,7 @@ SortCompare( int order = 0; if (infoPtr->sortMode == SORTMODE_ASCII) { - order = strcmp(elemPtr1->collationKey.strValuePtr, + order = TclUtfCmp(elemPtr1->collationKey.strValuePtr, elemPtr2->collationKey.strValuePtr); } else if (infoPtr->sortMode == SORTMODE_ASCII_NC) { order = TclUtfCasecmp(elemPtr1->collationKey.strValuePtr, diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index ad1dd5f..a206cc5 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -3547,7 +3547,7 @@ TclNRSwitchObjCmd( OPT_LAST }; typedef int (*strCmpFn_t)(const char *, const char *); - strCmpFn_t strCmpFn = strcmp; + strCmpFn_t strCmpFn = TclUtfCmp; mode = OPT_EXACT; foundmode = 0; diff --git a/generic/tclInt.h b/generic/tclInt.h index ef88bf5..ad1d9c6 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3219,6 +3219,7 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); +MODULE_SCOPE int TclUtfCmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCount(int ch); MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); @@ -4446,7 +4447,7 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file, */ #define TclUtfToUniChar(str, chPtr) \ - ((((unsigned char) *(str)) < 0xC0) ? \ + ((((unsigned char) *(str)) < 0x80) ? \ ((*(chPtr) = (unsigned char) *(str)), 1) \ : Tcl_UtfToUniChar(str, chPtr)) diff --git a/generic/tclScan.c b/generic/tclScan.c index 7f71262..e0798df 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -889,7 +889,7 @@ Tcl_ScanObjCmd( i = (int)sch; #if TCL_UTF_MAX == 4 if (!offset) { - offset = Tcl_UtfToUniChar(string, &sch); + offset = TclUtfToUniChar(string, &sch); i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF); } #endif diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 547f7c6..b943095 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1869,20 +1869,20 @@ Tcl_AppendFormatToObj( } else if (ch == 'I') { if ((format[1] == '6') && (format[2] == '4')) { format += (step + 2); - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG useWide = 1; #endif } else if ((format[1] == '3') && (format[2] == '2')) { format += (step + 2); - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } else { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); } } else if ((ch == 't') || (ch == 'z')) { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG if (sizeof(size_t) > sizeof(int)) { useWide = 1; @@ -1890,7 +1890,7 @@ Tcl_AppendFormatToObj( #endif } else if ((ch == 'q') ||(ch == 'j')) { format += step; - step = Tcl_UtfToUniChar(format, &ch); + step = TclUtfToUniChar(format, &ch); #ifndef TCL_WIDE_INT_IS_LONG useWide = 1; #endif diff --git a/generic/tclUtf.c b/generic/tclUtf.c index a72394d..43636b4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -720,8 +720,7 @@ Tcl_UniCharAtIndex( { Tcl_UniChar ch = 0; - while (index >= 0) { - index--; + while (index-- >= 0) { src += TclUtfToUniChar(src, &ch); } return ch; @@ -751,8 +750,7 @@ Tcl_UtfAtIndex( { Tcl_UniChar ch = 0; - while (index > 0) { - index--; + while (index-- > 0) { src += TclUtfToUniChar(src, &ch); } return src; @@ -1066,16 +1064,17 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { #if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } #endif - if (ch1 != ch2) { return (ch1 - ch2); } } @@ -1116,16 +1115,17 @@ Tcl_UtfNcasecmp( */ cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { #if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } #endif - if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1135,6 +1135,52 @@ Tcl_UtfNcasecmp( } return 0; } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfCmp -- + * + * Compare UTF chars of string cs to string ct case sensitively. + * Replacement for strcmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ +{ + Tcl_UniChar ch1 = 0, ch2 = 0; + + while (*cs && *ct) { + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif + return ch1 - ch2; + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + /* *---------------------------------------------------------------------- @@ -1164,16 +1210,17 @@ TclUtfCasecmp( while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { #if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } #endif - if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 8ebace5..51af016 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1665,6 +1665,7 @@ TclTrimRight( { const char *p = bytes + numBytes; int pInc; + Tcl_UniChar ch1 = 0, ch2 = 0; if ((bytes[numBytes] != '\0') || (trim[numTrim] != '\0')) { Tcl_Panic("TclTrimRight works only on null-terminated strings"); @@ -1683,7 +1684,6 @@ TclTrimRight( */ do { - Tcl_UniChar ch1; const char *q = trim; int bytesLeft = numTrim; @@ -1695,7 +1695,6 @@ TclTrimRight( */ do { - Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); if (ch1 == ch2) { @@ -1745,6 +1744,7 @@ TclTrimLeft( int numTrim) /* ...and its length in bytes */ { const char *p = bytes; + Tcl_UniChar ch1 = 0, ch2 = 0; if ((bytes[numBytes] != '\0') || (trim[numTrim] != '\0')) { Tcl_Panic("TclTrimLeft works only on null-terminated strings"); @@ -1763,7 +1763,6 @@ TclTrimLeft( */ do { - Tcl_UniChar ch1; int pInc = TclUtfToUniChar(p, &ch1); const char *q = trim; int bytesLeft = numTrim; @@ -1773,7 +1772,6 @@ TclTrimLeft( */ do { - Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); if (ch1 == ch2) { @@ -2107,7 +2105,7 @@ Tcl_StringCaseMatch( { int p, charLen; const char *pstart = pattern; - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; while (1) { p = *pattern; @@ -2217,7 +2215,7 @@ Tcl_StringCaseMatch( */ if (p == '[') { - Tcl_UniChar startChar, endChar; + Tcl_UniChar startChar = 0, endChar = 0; pattern++; if (UCHAR(*str) < 0x80) { diff --git a/tests/cmdIL.test b/tests/cmdIL.test index 70ac6bb..df59e6e 100644 --- a/tests/cmdIL.test +++ b/tests/cmdIL.test @@ -19,6 +19,7 @@ catch [list package require -exact Tcltest [info patchlevel]] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] testConstraint testobj [llength [info commands testobj]] +testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}] test cmdIL-1.1 {Tcl_LsortObjCmd procedure} -returnCodes error -body { lsort @@ -147,6 +148,18 @@ test cmdIL-1.36 {lsort -stride and -index: Bug 2918962} { {{b i g} 12345} {{d e m o} 34512} } } {{{b i g} 12345} {{d e m o} 34512} {{c o d e} 54321} {{b l a h} 94729}} +test cmdIL-1.37 {Tcl_LsortObjCmd procedure, Bug 8e1e31eac0fd6b6c} { + lsort -ascii [list \0 \x7f \x80 \uffff] +} [list \0 \x7f \x80 \uffff] +test cmdIL-1.38 {Tcl_LsortObjCmd procedure, Bug 8e1e31eac0fd6b6c} { + lsort -ascii -nocase [list \0 \x7f \x80 \uffff] +} [list \0 \x7f \x80 \uffff] +test cmdIL-1.39 {Tcl_LsortObjCmd procedure, Bug 8e1e31eac0fd6b6c} fullutf { + lsort -ascii [list \0 \x7f \x80 \U01ffff \uffff] +} [list \0 \x7f \x80 \uffff \U01ffff] +test cmdIL-1.40 {Tcl_LsortObjCmd procedure, Bug 8e1e31eac0fd6b6c} fullutf { + lsort -ascii -nocase [list \0 \x7f \x80 \U01ffff \uffff] +} [list \0 \x7f \x80 \uffff \U01ffff] # Can't think of any good tests for the MergeSort and MergeLists procedures, # except a bunch of random lists to sort. diff --git a/win/tclWinSerial.c b/win/tclWinSerial.c index ed1a8e5..acfeecb 100644 --- a/win/tclWinSerial.c +++ b/win/tclWinSerial.c @@ -1738,7 +1738,7 @@ SerialSetOptionProc( dcb.XonChar = argv[0][0]; dcb.XoffChar = argv[1][0]; if (argv[0][0] & 0x80 || argv[1][0] & 0x80) { - Tcl_UniChar character; + Tcl_UniChar character = 0; int charLen; charLen = Tcl_UtfToUniChar(argv[0], &character); -- cgit v0.12