diff options
| author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2017-11-30 13:14:43 (GMT) |
|---|---|---|
| committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2017-11-30 13:14:43 (GMT) |
| commit | 72998869a3be3534fec99499faabe2d1557d6bcb (patch) | |
| tree | 85593ed297518fcde58853b7fa1d2cb98b1a0e6d /generic/tclUtf.c | |
| parent | c47ba70f4545f7c961160a6beb0a466e3bfd5532 (diff) | |
| download | tcl-72998869a3be3534fec99499faabe2d1557d6bcb.zip tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.gz tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.bz2 | |
Fix [8e1e31eac0fd6b6c4452bc108a98ab08c6b64588|8e1e31eac0]: lsort treats NUL chars strangely. Also fix various initializations, which only make a difference when TCL_UTF_MAX == 4.
Add new test-cases which demonstrate the fix. For TCL_UTF_MAX == 4, surrogates will now be handled as expected as well when sorting.
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 103 |
1 files changed, 75 insertions, 28 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index a72394d..43636b4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -720,8 +720,7 @@ Tcl_UniCharAtIndex( { Tcl_UniChar ch = 0; - while (index >= 0) { - index--; + while (index-- >= 0) { src += TclUtfToUniChar(src, &ch); } return ch; @@ -751,8 +750,7 @@ Tcl_UtfAtIndex( { Tcl_UniChar ch = 0; - while (index > 0) { - index--; + while (index-- > 0) { src += TclUtfToUniChar(src, &ch); } return src; @@ -1066,16 +1064,17 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { #if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } #endif - if (ch1 != ch2) { return (ch1 - ch2); } } @@ -1116,16 +1115,17 @@ Tcl_UtfNcasecmp( */ cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { #if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } #endif - if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1135,6 +1135,52 @@ Tcl_UtfNcasecmp( } return 0; } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfCmp -- + * + * Compare UTF chars of string cs to string ct case sensitively. + * Replacement for strcmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ +{ + Tcl_UniChar ch1 = 0, ch2 = 0; + + while (*cs && *ct) { + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif + return ch1 - ch2; + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + /* *---------------------------------------------------------------------- @@ -1164,16 +1210,17 @@ TclUtfCasecmp( while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { #if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } #endif - if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { |
