From f193acf08ce4f3fe6db1cb79ab3589d037e5853c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 21 May 2013 09:27:15 +0000 Subject: Proposed solution for 3613609: lsort -nocase does not sort non-ASCII correctly --- generic/tclCmdIL.c | 10 +++++----- generic/tclCmdMZ.c | 2 +- generic/tclInt.h | 1 + generic/tclUtf.c | 27 +++++++++++++++++++++++++++ tests/cmdIL.test | 3 +++ 5 files changed, 37 insertions(+), 6 deletions(-) diff --git a/generic/tclCmdIL.c b/generic/tclCmdIL.c index 152e61d..98ec8b4 100644 --- a/generic/tclCmdIL.c +++ b/generic/tclCmdIL.c @@ -2807,7 +2807,7 @@ Tcl_LsearchObjCmd( dataType = INTEGER; break; case LSEARCH_NOCASE: /* -nocase */ - strCmpFn = strcasecmp; + strCmpFn = TclUtfCasecmp; noCase = 1; break; case LSEARCH_NOT: /* -not */ @@ -3209,7 +3209,7 @@ Tcl_LsearchObjCmd( */ if (noCase) { - match = (strcasecmp(bytes, patternBytes) == 0); + match = (TclUtfCasecmp(bytes, patternBytes) == 0); } else { match = (memcmp(bytes, patternBytes, (size_t) length) == 0); @@ -3712,7 +3712,7 @@ Tcl_LsortObjCmd( goto done1; } elementArray[i].index.intValue = a; - } else if (sortInfo.sortMode == SORTMODE_REAL) { + } else if (sortMode == SORTMODE_REAL) { double a; if (Tcl_GetDoubleFromObj(sortInfo.interp, indexPtr, &a) != TCL_OK) { sortInfo.resultCode = TCL_ERROR; @@ -3790,7 +3790,7 @@ Tcl_LsortObjCmd( ckfree((char *)elementArray); done: - if (sortInfo.sortMode == SORTMODE_COMMAND) { + if (sortMode == SORTMODE_COMMAND) { TclDecrRefCount(sortInfo.compareCmdPtr); TclDecrRefCount(listObj); sortInfo.compareCmdPtr = NULL; @@ -3932,7 +3932,7 @@ SortCompare( order = strcmp(elemPtr1->index.strValuePtr, elemPtr2->index.strValuePtr); } else if (infoPtr->sortMode == SORTMODE_ASCII_NC) { - order = strcasecmp(elemPtr1->index.strValuePtr, + order = TclUtfCasecmp(elemPtr1->index.strValuePtr, elemPtr2->index.strValuePtr); } else if (infoPtr->sortMode == SORTMODE_DICTIONARY) { order = DictionaryCompare(elemPtr1->index.strValuePtr, diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 0ad77aa..6fd468c 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -3436,7 +3436,7 @@ Tcl_SwitchObjCmd( i++; goto finishedOptions; case OPT_NOCASE: - strCmpFn = strcasecmp; + strCmpFn = TclUtfCasecmp; noCase = 1; break; diff --git a/generic/tclInt.h b/generic/tclInt.h index 92251fe..dc28b97 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2762,6 +2762,7 @@ MODULE_SCOPE Tcl_WideInt TclpGetWideClicks(void); MODULE_SCOPE double TclpWideClicksToNanoseconds(Tcl_WideInt clicks); #endif MODULE_SCOPE Tcl_Obj * TclDisassembleByteCodeObj(Tcl_Obj *objPtr); +MODULE_SCOPE int TclUtfCasecmp(CONST char *cs, CONST char *ct); /* *---------------------------------------------------------------- diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 83900e9..9dacb53 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1101,6 +1101,33 @@ Tcl_UtfNcasecmp( } return 0; } + + +/* Replacement for strcasecmp in Tcl core, in places where UTF-8 should be handled. */ +int +TclUtfCasecmp( + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct) /* UTF string cs is compared to. */ +{ + Tcl_UniChar ch1, ch2; + char c; + + do { + + /* If c == '\0', loop should end. */ + c = *cs; + + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { + ch1 = Tcl_UniCharToLower(ch1); + ch2 = Tcl_UniCharToLower(ch2); + if (ch1 != ch2) break; + } + } while (c); + return (ch1 - ch2); +} + /* *---------------------------------------------------------------------- diff --git a/tests/cmdIL.test b/tests/cmdIL.test index b387e71..c9a10b6 100644 --- a/tests/cmdIL.test +++ b/tests/cmdIL.test @@ -394,6 +394,9 @@ test cmdIL-4.34 {SortCompare procedure, -ascii option with -nocase option} { test cmdIL-4.35 {SortCompare procedure, -ascii option with -nocase option} { lsort -ascii -nocase {d E c B a D35 d300 100 20} } {100 20 a B c d d300 D35 E} +test cmdIL-4.36 {SortCompare procedure, UTF-8 with -nocase option} { + lsort -ascii -nocase [list \u101 \u100] +} [list \u101 \u100] test cmdIL-5.1 {lsort with list style index} { lsort -ascii -decreasing -index {0 1} { -- cgit v0.12 From 52ae090e68314dd4aad0ba73e0869527bb321db4 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 21 May 2013 09:38:05 +0000 Subject: Slight improvement: if cs = "\xC0\x80" and ct = "\x00", loop would continue after NUL-byte, this should not happen. --- generic/tclUtf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 9dacb53..a7a2091 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1110,12 +1110,12 @@ TclUtfCasecmp( CONST char *ct) /* UTF string cs is compared to. */ { Tcl_UniChar ch1, ch2; - char c; + int goOn; do { - /* If c == '\0', loop should end. */ - c = *cs; + /* If *cs == '\0' or *ct == '\0', loop should end. */ + goOn = *cs && *ct; cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); @@ -1124,7 +1124,7 @@ TclUtfCasecmp( ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) break; } - } while (c); + } while (goOn); return (ch1 - ch2); } -- cgit v0.12 From 5689a75644f9b109581a5dc3ae15294467c657ab Mon Sep 17 00:00:00 2001 From: dkf Date: Wed, 22 May 2013 12:32:28 +0000 Subject: Improved tests. --- tests/cmdIL.test | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/cmdIL.test b/tests/cmdIL.test index c9a10b6..192c10c 100644 --- a/tests/cmdIL.test +++ b/tests/cmdIL.test @@ -395,8 +395,11 @@ test cmdIL-4.35 {SortCompare procedure, -ascii option with -nocase option} { lsort -ascii -nocase {d E c B a D35 d300 100 20} } {100 20 a B c d d300 D35 E} test cmdIL-4.36 {SortCompare procedure, UTF-8 with -nocase option} { - lsort -ascii -nocase [list \u101 \u100] -} [list \u101 \u100] + scan [lsort -ascii -nocase [list \u101 \u100]] %c%c%c +} {257 32 256} +test cmdIL-4.37 {SortCompare procedure, UTF-8 with -nocase option} { + scan [lsort -ascii -nocase [list a\u0000a a]] %c%c%c%c%c +} {97 32 97 0 97} test cmdIL-5.1 {lsort with list style index} { lsort -ascii -decreasing -index {0 1} { -- cgit v0.12 From 75b8011dbad373e664a676ed8e3bcfec70313838 Mon Sep 17 00:00:00 2001 From: dkf Date: Wed, 22 May 2013 12:55:50 +0000 Subject: Fixed the weird edge case. --- generic/tclUtf.c | 37 +++++++++++++++++++++++++------------ tests/cmdIL.test | 3 +++ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index a7a2091..f3d1758 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1101,31 +1101,44 @@ Tcl_UtfNcasecmp( } return 0; } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfNcasecmp -- + * + * Compare UTF chars of string cs to string ct case insensitively. + * Replacement for strcasecmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ - -/* Replacement for strcasecmp in Tcl core, in places where UTF-8 should be handled. */ int TclUtfCasecmp( CONST char *cs, /* UTF string to compare to ct. */ CONST char *ct) /* UTF string cs is compared to. */ { - Tcl_UniChar ch1, ch2; - int goOn; - - do { - - /* If *cs == '\0' or *ct == '\0', loop should end. */ - goOn = *cs && *ct; + while (*cs && *ct) { + Tcl_UniChar ch1, ch2; cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); - if (ch1 != ch2) break; + if (ch1 != ch2) { + return ch1 - ch2; + } } - } while (goOn); - return (ch1 - ch2); + } + return UCHAR(*cs) - UCHAR(*ct); } diff --git a/tests/cmdIL.test b/tests/cmdIL.test index 192c10c..6fab269 100644 --- a/tests/cmdIL.test +++ b/tests/cmdIL.test @@ -400,6 +400,9 @@ test cmdIL-4.36 {SortCompare procedure, UTF-8 with -nocase option} { test cmdIL-4.37 {SortCompare procedure, UTF-8 with -nocase option} { scan [lsort -ascii -nocase [list a\u0000a a]] %c%c%c%c%c } {97 32 97 0 97} +test cmdIL-4.38 {SortCompare procedure, UTF-8 with -nocase option} { + scan [lsort -ascii -nocase [list a a\u0000a]] %c%c%c%c%c +} {97 32 97 0 97} test cmdIL-5.1 {lsort with list style index} { lsort -ascii -decreasing -index {0 1} { -- cgit v0.12