summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2017-11-30 13:14:43 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2017-11-30 13:14:43 (GMT)
commit72998869a3be3534fec99499faabe2d1557d6bcb (patch)
tree85593ed297518fcde58853b7fa1d2cb98b1a0e6d
parentc47ba70f4545f7c961160a6beb0a466e3bfd5532 (diff)
downloadtcl-72998869a3be3534fec99499faabe2d1557d6bcb.zip
tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.gz
tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.bz2
Fix [8e1e31eac0fd6b6c4452bc108a98ab08c6b64588|8e1e31eac0]: lsort treats NUL chars strangely. Also fix various initializations, which only make a difference when TCL_UTF_MAX == 4.
Add new test-cases which demonstrate the fix. For TCL_UTF_MAX == 4, surrogates will now be handled as expected as well when sorting.
-rw-r--r--doc/UniCharIsAlpha.32
-rw-r--r--generic/tclCmdIL.c4
-rw-r--r--generic/tclCmdMZ.c2
-rw-r--r--generic/tclInt.h3
-rw-r--r--generic/tclScan.c2
-rw-r--r--generic/tclStringObj.c10
-rw-r--r--generic/tclUtf.c103
-rw-r--r--generic/tclUtil.c10
-rw-r--r--tests/cmdIL.test13
-rw-r--r--win/tclWinSerial.c2
10 files changed, 105 insertions, 46 deletions
diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3
index 5ba3fc9..61490ed 100644
--- a/doc/UniCharIsAlpha.3
+++ b/doc/UniCharIsAlpha.3
@@ -48,7 +48,7 @@ int
.SH ARGUMENTS
.AS int ch
.AP int ch in
-The Tcl_UniChar to be examined.
+The Unicode character to be examined.
.BE
.SH DESCRIPTION
diff --git a/generic/tclCmdIL.c b/generic/tclCmdIL.c
index 47076ec..b41d312 100644
--- a/generic/tclCmdIL.c
+++ b/generic/tclCmdIL.c
@@ -2945,7 +2945,7 @@ Tcl_LsearchObjCmd(
double patDouble, objDouble;
SortInfo sortInfo;
Tcl_Obj *patObj, **listv, *listPtr, *startPtr, *itemPtr;
- SortStrCmpFn_t strCmpFn = strcmp;
+ SortStrCmpFn_t strCmpFn = TclUtfCmp;
Tcl_RegExp regexp = NULL;
static const char *const options[] = {
"-all", "-ascii", "-bisect", "-decreasing", "-dictionary",
@@ -4263,7 +4263,7 @@ SortCompare(
int order = 0;
if (infoPtr->sortMode == SORTMODE_ASCII) {
- order = strcmp(elemPtr1->collationKey.strValuePtr,
+ order = TclUtfCmp(elemPtr1->collationKey.strValuePtr,
elemPtr2->collationKey.strValuePtr);
} else if (infoPtr->sortMode == SORTMODE_ASCII_NC) {
order = TclUtfCasecmp(elemPtr1->collationKey.strValuePtr,
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index ad1dd5f..a206cc5 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -3547,7 +3547,7 @@ TclNRSwitchObjCmd(
OPT_LAST
};
typedef int (*strCmpFn_t)(const char *, const char *);
- strCmpFn_t strCmpFn = strcmp;
+ strCmpFn_t strCmpFn = TclUtfCmp;
mode = OPT_EXACT;
foundmode = 0;
diff --git a/generic/tclInt.h b/generic/tclInt.h
index ef88bf5..ad1d9c6 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3219,6 +3219,7 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes,
const char *trim, int numTrim);
MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes,
const char *trim, int numTrim);
+MODULE_SCOPE int TclUtfCmp(const char *cs, const char *ct);
MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct);
MODULE_SCOPE int TclUtfCount(int ch);
MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData);
@@ -4446,7 +4447,7 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file,
*/
#define TclUtfToUniChar(str, chPtr) \
- ((((unsigned char) *(str)) < 0xC0) ? \
+ ((((unsigned char) *(str)) < 0x80) ? \
((*(chPtr) = (unsigned char) *(str)), 1) \
: Tcl_UtfToUniChar(str, chPtr))
diff --git a/generic/tclScan.c b/generic/tclScan.c
index 7f71262..e0798df 100644
--- a/generic/tclScan.c
+++ b/generic/tclScan.c
@@ -889,7 +889,7 @@ Tcl_ScanObjCmd(
i = (int)sch;
#if TCL_UTF_MAX == 4
if (!offset) {
- offset = Tcl_UtfToUniChar(string, &sch);
+ offset = TclUtfToUniChar(string, &sch);
i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF);
}
#endif
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 547f7c6..b943095 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -1869,20 +1869,20 @@ Tcl_AppendFormatToObj(
} else if (ch == 'I') {
if ((format[1] == '6') && (format[2] == '4')) {
format += (step + 2);
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
#ifndef TCL_WIDE_INT_IS_LONG
useWide = 1;
#endif
} else if ((format[1] == '3') && (format[2] == '2')) {
format += (step + 2);
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
} else {
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
}
} else if ((ch == 't') || (ch == 'z')) {
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
#ifndef TCL_WIDE_INT_IS_LONG
if (sizeof(size_t) > sizeof(int)) {
useWide = 1;
@@ -1890,7 +1890,7 @@ Tcl_AppendFormatToObj(
#endif
} else if ((ch == 'q') ||(ch == 'j')) {
format += step;
- step = Tcl_UtfToUniChar(format, &ch);
+ step = TclUtfToUniChar(format, &ch);
#ifndef TCL_WIDE_INT_IS_LONG
useWide = 1;
#endif
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index a72394d..43636b4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -720,8 +720,7 @@ Tcl_UniCharAtIndex(
{
Tcl_UniChar ch = 0;
- while (index >= 0) {
- index--;
+ while (index-- >= 0) {
src += TclUtfToUniChar(src, &ch);
}
return ch;
@@ -751,8 +750,7 @@ Tcl_UtfAtIndex(
{
Tcl_UniChar ch = 0;
- while (index > 0) {
- index--;
+ while (index-- > 0) {
src += TclUtfToUniChar(src, &ch);
}
return src;
@@ -1066,16 +1064,17 @@ Tcl_UtfNcmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
- /* map high surrogate characters to values > 0xffff */
- if ((ch1 & 0xFC00) == 0xD800) {
- ch1 += 0x4000;
- }
- if ((ch2 & 0xFC00) == 0xD800) {
- ch2 += 0x4000;
- }
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
#endif
- if (ch1 != ch2) {
return (ch1 - ch2);
}
}
@@ -1116,16 +1115,17 @@ Tcl_UtfNcasecmp(
*/
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
- /* map high surrogate characters to values > 0xffff */
- if ((ch1 & 0xFC00) == 0xD800) {
- ch1 += 0x4000;
- }
- if ((ch2 & 0xFC00) == 0xD800) {
- ch2 += 0x4000;
- }
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
#endif
- if (ch1 != ch2) {
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1135,6 +1135,52 @@ Tcl_UtfNcasecmp(
}
return 0;
}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfCmp --
+ *
+ * Compare UTF chars of string cs to string ct case sensitively.
+ * Replacement for strcmp in Tcl core, in places where UTF-8 should
+ * be handled.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
+{
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+
+ while (*cs && *ct) {
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
+ return ch1 - ch2;
+ }
+ }
+ return UCHAR(*cs) - UCHAR(*ct);
+}
+
/*
*----------------------------------------------------------------------
@@ -1164,16 +1210,17 @@ TclUtfCasecmp(
while (*cs && *ct) {
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
- /* map high surrogate characters to values > 0xffff */
- if ((ch1 & 0xFC00) == 0xD800) {
- ch1 += 0x4000;
- }
- if ((ch2 & 0xFC00) == 0xD800) {
- ch2 += 0x4000;
- }
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
#endif
- if (ch1 != ch2) {
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index 8ebace5..51af016 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -1665,6 +1665,7 @@ TclTrimRight(
{
const char *p = bytes + numBytes;
int pInc;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
if ((bytes[numBytes] != '\0') || (trim[numTrim] != '\0')) {
Tcl_Panic("TclTrimRight works only on null-terminated strings");
@@ -1683,7 +1684,6 @@ TclTrimRight(
*/
do {
- Tcl_UniChar ch1;
const char *q = trim;
int bytesLeft = numTrim;
@@ -1695,7 +1695,6 @@ TclTrimRight(
*/
do {
- Tcl_UniChar ch2;
int qInc = TclUtfToUniChar(q, &ch2);
if (ch1 == ch2) {
@@ -1745,6 +1744,7 @@ TclTrimLeft(
int numTrim) /* ...and its length in bytes */
{
const char *p = bytes;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
if ((bytes[numBytes] != '\0') || (trim[numTrim] != '\0')) {
Tcl_Panic("TclTrimLeft works only on null-terminated strings");
@@ -1763,7 +1763,6 @@ TclTrimLeft(
*/
do {
- Tcl_UniChar ch1;
int pInc = TclUtfToUniChar(p, &ch1);
const char *q = trim;
int bytesLeft = numTrim;
@@ -1773,7 +1772,6 @@ TclTrimLeft(
*/
do {
- Tcl_UniChar ch2;
int qInc = TclUtfToUniChar(q, &ch2);
if (ch1 == ch2) {
@@ -2107,7 +2105,7 @@ Tcl_StringCaseMatch(
{
int p, charLen;
const char *pstart = pattern;
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
while (1) {
p = *pattern;
@@ -2217,7 +2215,7 @@ Tcl_StringCaseMatch(
*/
if (p == '[') {
- Tcl_UniChar startChar, endChar;
+ Tcl_UniChar startChar = 0, endChar = 0;
pattern++;
if (UCHAR(*str) < 0x80) {
diff --git a/tests/cmdIL.test b/tests/cmdIL.test
index 70ac6bb..df59e6e 100644
--- a/tests/cmdIL.test
+++ b/tests/cmdIL.test
@@ -19,6 +19,7 @@ catch [list package require -exact Tcltest [info patchlevel]]
# Used for constraining memory leak tests
testConstraint memory [llength [info commands memory]]
testConstraint testobj [llength [info commands testobj]]
+testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}]
test cmdIL-1.1 {Tcl_LsortObjCmd procedure} -returnCodes error -body {
lsort
@@ -147,6 +148,18 @@ test cmdIL-1.36 {lsort -stride and -index: Bug 2918962} {
{{b i g} 12345} {{d e m o} 34512}
}
} {{{b i g} 12345} {{d e m o} 34512} {{c o d e} 54321} {{b l a h} 94729}}
+test cmdIL-1.37 {Tcl_LsortObjCmd procedure, Bug 8e1e31eac0fd6b6c} {
+ lsort -ascii [list \0 \x7f \x80 \uffff]
+} [list \0 \x7f \x80 \uffff]
+test cmdIL-1.38 {Tcl_LsortObjCmd procedure, Bug 8e1e31eac0fd6b6c} {
+ lsort -ascii -nocase [list \0 \x7f \x80 \uffff]
+} [list \0 \x7f \x80 \uffff]
+test cmdIL-1.39 {Tcl_LsortObjCmd procedure, Bug 8e1e31eac0fd6b6c} fullutf {
+ lsort -ascii [list \0 \x7f \x80 \U01ffff \uffff]
+} [list \0 \x7f \x80 \uffff \U01ffff]
+test cmdIL-1.40 {Tcl_LsortObjCmd procedure, Bug 8e1e31eac0fd6b6c} fullutf {
+ lsort -ascii -nocase [list \0 \x7f \x80 \U01ffff \uffff]
+} [list \0 \x7f \x80 \uffff \U01ffff]
# Can't think of any good tests for the MergeSort and MergeLists procedures,
# except a bunch of random lists to sort.
diff --git a/win/tclWinSerial.c b/win/tclWinSerial.c
index ed1a8e5..acfeecb 100644
--- a/win/tclWinSerial.c
+++ b/win/tclWinSerial.c
@@ -1738,7 +1738,7 @@ SerialSetOptionProc(
dcb.XonChar = argv[0][0];
dcb.XoffChar = argv[1][0];
if (argv[0][0] & 0x80 || argv[1][0] & 0x80) {
- Tcl_UniChar character;
+ Tcl_UniChar character = 0;
int charLen;
charLen = Tcl_UtfToUniChar(argv[0], &character);