Fix [8e1e31eac0fd6b6c4452bc108a98ab08c6b64588|8e1e31eac0]: lsort treats NUL chars strangely. Also fix various initializations, which only make a difference when TCL_UTF_MAX == 4.

Add new test-cases which demonstrate the fix. For TCL_UTF_MAX == 4, surrogates will now be handled as expected as well when sorting.
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2017-11-30 13:14:43 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2017-11-30 13:14:43 (GMT)
commit: 72998869a3be3534fec99499faabe2d1557d6bcb (patch)
tree: 85593ed297518fcde58853b7fa1d2cb98b1a0e6d /generic/tclUtf.c
parent: c47ba70f4545f7c961160a6beb0a466e3bfd5532 (diff)
download: tcl-72998869a3be3534fec99499faabe2d1557d6bcb.zip
tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.gz
tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.bz2
1 files changed, 75 insertions, 28 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index a72394d..43636b4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -720,8 +720,7 @@ Tcl_UniCharAtIndex(
 {
     Tcl_UniChar ch = 0;
 
-    while (index >= 0) {
-	index--;
+    while (index-- >= 0) {
 	src += TclUtfToUniChar(src, &ch);
     }
     return ch;
@@ -751,8 +750,7 @@ Tcl_UtfAtIndex(
 {
     Tcl_UniChar ch = 0;
 
-    while (index > 0) {
-	index--;
+    while (index-- > 0) {
 	src += TclUtfToUniChar(src, &ch);
     }
     return src;
@@ -1066,16 +1064,17 @@ Tcl_UtfNcmp(
 
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
+	if (ch1 != ch2) {
 #if TCL_UTF_MAX == 4
-    /* map high surrogate characters to values > 0xffff */
-    if ((ch1 & 0xFC00) == 0xD800) {
-	ch1 += 0x4000;
-    }
-    if ((ch2 & 0xFC00) == 0xD800) {
-	ch2 += 0x4000;
-    }
+	    /* Surrogates always report higher than non-surrogates */
+	    if (((ch1 & 0xFC00) == 0xD800)) {
+	    if ((ch2 & 0xFC00) != 0xD800) {
+		return ch1;
+	    }
+	    } else if ((ch2 & 0xFC00) == 0xD800) {
+		return -ch2;
+	    }
 #endif
-	if (ch1 != ch2) {
 	    return (ch1 - ch2);
 	}
     }
@@ -1116,16 +1115,17 @@ Tcl_UtfNcasecmp(
 	 */
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
+	if (ch1 != ch2) {
 #if TCL_UTF_MAX == 4
-    /* map high surrogate characters to values > 0xffff */
-    if ((ch1 & 0xFC00) == 0xD800) {
-	ch1 += 0x4000;
-    }
-    if ((ch2 & 0xFC00) == 0xD800) {
-	ch2 += 0x4000;
-    }
+	    /* Surrogates always report higher than non-surrogates */
+	    if (((ch1 & 0xFC00) == 0xD800)) {
+	    if ((ch2 & 0xFC00) != 0xD800) {
+		return ch1;
+	    }
+	    } else if ((ch2 & 0xFC00) == 0xD800) {
+		return -ch2;
+	    }
 #endif
-	if (ch1 != ch2) {
 	    ch1 = Tcl_UniCharToLower(ch1);
 	    ch2 = Tcl_UniCharToLower(ch2);
 	    if (ch1 != ch2) {
@@ -1135,6 +1135,52 @@ Tcl_UtfNcasecmp(
     }
     return 0;
 }
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfCmp --
+ *
+ *	Compare UTF chars of string cs to string ct case sensitively.
+ *	Replacement for strcmp in Tcl core, in places where UTF-8 should
+ *	be handled.
+ *
+ * Results:
+ *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCmp(
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct)		/* UTF string cs is compared to. */
+{
+    Tcl_UniChar ch1 = 0, ch2 = 0;
+
+    while (*cs && *ct) {
+	cs += TclUtfToUniChar(cs, &ch1);
+	ct += TclUtfToUniChar(ct, &ch2);
+	if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+	    /* Surrogates always report higher than non-surrogates */
+	    if (((ch1 & 0xFC00) == 0xD800)) {
+	    if ((ch2 & 0xFC00) != 0xD800) {
+		return ch1;
+	    }
+	    } else if ((ch2 & 0xFC00) == 0xD800) {
+		return -ch2;
+	    }
+#endif
+	    return ch1 - ch2;
+	}
+    }
+    return UCHAR(*cs) - UCHAR(*ct);
+}
+
 
 /*
  *----------------------------------------------------------------------
@@ -1164,16 +1210,17 @@ TclUtfCasecmp(
     while (*cs && *ct) {
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
+	if (ch1 != ch2) {
 #if TCL_UTF_MAX == 4
-    /* map high surrogate characters to values > 0xffff */
-    if ((ch1 & 0xFC00) == 0xD800) {
-	ch1 += 0x4000;
-    }
-    if ((ch2 & 0xFC00) == 0xD800) {
-	ch2 += 0x4000;
-    }
+	    /* Surrogates always report higher than non-surrogates */
+	    if (((ch1 & 0xFC00) == 0xD800)) {
+	    if ((ch2 & 0xFC00) != 0xD800) {
+		return ch1;
+	    }
+	    } else if ((ch2 & 0xFC00) == 0xD800) {
+		return -ch2;
+	    }
 #endif
-	if (ch1 != ch2) {
 	    ch1 = Tcl_UniCharToLower(ch1);
 	    ch2 = Tcl_UniCharToLower(ch2);
 	    if (ch1 != ch2) {
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2017-11-30 13:14:43 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2017-11-30 13:14:43 (GMT)
commit	72998869a3be3534fec99499faabe2d1557d6bcb (patch)
tree	85593ed297518fcde58853b7fa1d2cb98b1a0e6d /generic/tclUtf.c
parent	c47ba70f4545f7c961160a6beb0a466e3bfd5532 (diff)
download	tcl-72998869a3be3534fec99499faabe2d1557d6bcb.zip tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.gz tcl-72998869a3be3534fec99499faabe2d1557d6bcb.tar.bz2