From 03c66864aa2ffa9871ce216b00cd661eaf1be688 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 29 Nov 2017 09:49:31 +0000
Subject: Fix [8e1e31eac0fd6b6c4452bc108a98ab08c6b64588|8e1e31eac0]: lsort
 treats NUL chars strangely

---
 generic/tclCmdIL.c |  4 +--
 generic/tclCmdMZ.c |  2 +-
 generic/tclInt.h   |  1 +
 generic/tclUtf.c   | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/generic/tclCmdIL.c b/generic/tclCmdIL.c
index 47076ec..b41d312 100644
--- a/generic/tclCmdIL.c
+++ b/generic/tclCmdIL.c
@@ -2945,7 +2945,7 @@ Tcl_LsearchObjCmd(
     double patDouble, objDouble;
     SortInfo sortInfo;
     Tcl_Obj *patObj, **listv, *listPtr, *startPtr, *itemPtr;
-    SortStrCmpFn_t strCmpFn = strcmp;
+    SortStrCmpFn_t strCmpFn = TclUtfCmp;
     Tcl_RegExp regexp = NULL;
     static const char *const options[] = {
 	"-all",	    "-ascii",   "-bisect", "-decreasing", "-dictionary",
@@ -4263,7 +4263,7 @@ SortCompare(
     int order = 0;
 
     if (infoPtr->sortMode == SORTMODE_ASCII) {
-	order = strcmp(elemPtr1->collationKey.strValuePtr,
+	order = TclUtfCmp(elemPtr1->collationKey.strValuePtr,
 		elemPtr2->collationKey.strValuePtr);
     } else if (infoPtr->sortMode == SORTMODE_ASCII_NC) {
 	order = TclUtfCasecmp(elemPtr1->collationKey.strValuePtr,
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index ad1dd5f..a206cc5 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -3547,7 +3547,7 @@ TclNRSwitchObjCmd(
 	OPT_LAST
     };
     typedef int (*strCmpFn_t)(const char *, const char *);
-    strCmpFn_t strCmpFn = strcmp;
+    strCmpFn_t strCmpFn = TclUtfCmp;
 
     mode = OPT_EXACT;
     foundmode = 0;
diff --git a/generic/tclInt.h b/generic/tclInt.h
index d77889e..ad1d9c6 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3219,6 +3219,7 @@ MODULE_SCOPE int	TclTrimLeft(const char *bytes, int numBytes,
 			    const char *trim, int numTrim);
 MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
 			    const char *trim, int numTrim);
+MODULE_SCOPE int	TclUtfCmp(const char *cs, const char *ct);
 MODULE_SCOPE int	TclUtfCasecmp(const char *cs, const char *ct);
 MODULE_SCOPE int	TclUtfCount(int ch);
 MODULE_SCOPE Tcl_Obj *	TclpNativeToNormalized(ClientData clientData);
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index aed332f..aff10c1 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -1108,6 +1108,15 @@ Tcl_UtfNcmp(
 
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+    /* map high surrogate characters to values > 0xffff */
+    if ((ch1 & 0xFC00) == 0xD800) {
+	ch1 += 0x4000;
+    }
+    if ((ch2 & 0xFC00) == 0xD800) {
+	ch2 += 0x4000;
+    }
+#endif
 	if (ch1 != ch2) {
 	    return (ch1 - ch2);
 	}
@@ -1140,6 +1149,7 @@ Tcl_UtfNcasecmp(
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1 = 0, ch2 = 0;
+
     while (numChars-- > 0) {
 	/*
 	 * n must be interpreted as chars, not bytes.
@@ -1148,6 +1158,15 @@ Tcl_UtfNcasecmp(
 	 */
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+    /* map high surrogate characters to values > 0xffff */
+    if ((ch1 & 0xFC00) == 0xD800) {
+	ch1 += 0x4000;
+    }
+    if ((ch2 & 0xFC00) == 0xD800) {
+	ch2 += 0x4000;
+    }
+#endif
 	if (ch1 != ch2) {
 	    ch1 = Tcl_UniCharToLower(ch1);
 	    ch2 = Tcl_UniCharToLower(ch2);
@@ -1158,11 +1177,56 @@ Tcl_UtfNcasecmp(
     }
     return 0;
 }
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfCmp --
+ *
+ *	Compare UTF chars of string cs to string ct case sensitively.
+ *	Replacement for strcmp in Tcl core, in places where UTF-8 should
+ *	be handled.
+ *
+ * Results:
+ *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCmp(
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct)		/* UTF string cs is compared to. */
+{
+    Tcl_UniChar ch1 = 0, ch2 = 0;
+
+    while (*cs && *ct) {
+	cs += TclUtfToUniChar(cs, &ch1);
+	ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+    /* map high surrogate characters to values > 0xffff */
+    if ((ch1 & 0xFC00) == 0xD800) {
+	ch1 += 0x4000;
+    }
+    if ((ch2 & 0xFC00) == 0xD800) {
+	ch2 += 0x4000;
+    }
+#endif
+	if (ch1 != ch2) {
+	    return ch1 - ch2;
+	}
+    }
+    return UCHAR(*cs) - UCHAR(*ct);
+}
+
 
 /*
  *----------------------------------------------------------------------
  *
- * Tcl_UtfNcasecmp --
+ * TclUtfCasecmp --
  *
  *	Compare UTF chars of string cs to string ct case insensitively.
  *	Replacement for strcasecmp in Tcl core, in places where UTF-8 should
@@ -1182,11 +1246,20 @@ TclUtfCasecmp(
     const char *cs,		/* UTF string to compare to ct. */
     const char *ct)		/* UTF string cs is compared to. */
 {
-    while (*cs && *ct) {
-	Tcl_UniChar ch1 = 0, ch2 = 0;
+    Tcl_UniChar ch1 = 0, ch2 = 0;
 
+    while (*cs && *ct) {
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+    /* map high surrogate characters to values > 0xffff */
+    if ((ch1 & 0xFC00) == 0xD800) {
+	ch1 += 0x4000;
+    }
+    if ((ch2 & 0xFC00) == 0xD800) {
+	ch2 += 0x4000;
+    }
+#endif
 	if (ch1 != ch2) {
 	    ch1 = Tcl_UniCharToLower(ch1);
 	    ch2 = Tcl_UniCharToLower(ch2);
-- 
cgit v0.12