From 4fc9a4c991e75f3060fad431f5951cda27377a5a Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 29 Nov 2017 11:04:20 +0000
Subject: Update some functions in tclUtf.c to handle surrogate pairs when
 TCL_UTF_MAX == 4. Also update documentation to distinguish better between
 "Tcl_UniChar" and "Unicode character": Those are not necessarily the same when
 TCL_UTF_MAX == 4. No change when TCL_UTF_MAX == 3 or TCL_UTF_MAX == 6.

---
 doc/ToUpper.3        |  2 +-
 doc/UniCharIsAlpha.3 |  5 +---
 doc/Utf.3            |  2 +-
 generic/tclUtf.c     | 74 ++++++++++++++++++++++++++++++++++++++++------------
 4 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/doc/ToUpper.3 b/doc/ToUpper.3
index b933e9c..be614e7 100644
--- a/doc/ToUpper.3
+++ b/doc/ToUpper.3
@@ -33,7 +33,7 @@ int
 .SH ARGUMENTS
 .AS char *str in/out
 .AP int ch in
-The Tcl_UniChar to be converted.
+The Unicode character to be converted.
 .AP char *str in/out
 Pointer to UTF-8 string to be converted in place.
 .BE
diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3
index 2336c34..5ba3fc9 100644
--- a/doc/UniCharIsAlpha.3
+++ b/doc/UniCharIsAlpha.3
@@ -53,14 +53,11 @@ The Tcl_UniChar to be examined.
 
 .SH DESCRIPTION
 .PP
-All of the routines described examine Tcl_UniChars and return a
+All of the routines described examine Unicode characters and return a
 boolean value. A non-zero return value means that the character does
 belong to the character class associated with the called routine. The
 rest of this document just describes the character classes associated
 with the various routines.
-.PP
-Note: A Tcl_UniChar is a Unicode character represented as an unsigned,
-fixed-size quantity.
 
 .SH "CHARACTER CLASSES"
 .PP
diff --git a/doc/Utf.3 b/doc/Utf.3
index 378c806..9d0c617 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -77,7 +77,7 @@ int
 Buffer in which the UTF-8 representation of the Tcl_UniChar is stored.  At most
 \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
 .AP int ch in
-The Tcl_UniChar to be converted or examined.
+The Unicode character to be converted or examined.
 .AP Tcl_UniChar *chPtr out
 Filled with the Tcl_UniChar represented by the head of the UTF-8 string.
 .AP "const char" *src in
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 3b39226..17f769d 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -404,7 +404,7 @@ Tcl_UtfToUniCharDString(
 				 * appended to this previously initialized
 				 * DString. */
 {
-    Tcl_UniChar ch, *w, *wString;
+    Tcl_UniChar ch = 0, *w, *wString;
     const char *p, *end;
     int oldLength;
 
@@ -528,13 +528,13 @@ Tcl_NumUtfChars(
  *
  * Tcl_UtfFindFirst --
  *
- *	Returns a pointer to the first occurance of the given Tcl_UniChar in
- *	the NULL-terminated UTF-8 string. The NULL terminator is considered
+ *	Returns a pointer to the first occurrence of the given Unicode character
+ *	in the NULL-terminated UTF-8 string. The NULL terminator is considered
  *	part of the UTF-8 string. Equivalent to Plan 9 utfrune().
  *
  * Results:
- *	As above. If the Tcl_UniChar does not exist in the given string, the
- *	return value is NULL.
+ *	As above. If the Unicode character does not exist in the given string,
+ *	the return value is NULL.
  *
  * Side effects:
  *	None.
@@ -545,14 +545,21 @@ Tcl_NumUtfChars(
 const char *
 Tcl_UtfFindFirst(
     const char *src,		/* The UTF-8 string to be searched. */
-    int ch)			/* The Tcl_UniChar to search for. */
+    int ch)			/* The Unicode character to search for. */
 {
-    int len;
+    int len, fullchar;
     Tcl_UniChar find = 0;
 
     while (1) {
 	len = TclUtfToUniChar(src, &find);
-	if (find == ch) {
+	fullchar = find;
+#if TCL_UTF_MAX == 4
+	if (!len) {
+	    len += TclUtfToUniChar(src, &find);
+	    fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+#endif
+	if (find == fullchar) {
 	    return src;
 	}
 	if (*src == '\0') {
@@ -567,8 +574,8 @@ Tcl_UtfFindFirst(
  *
  * Tcl_UtfFindLast --
  *
- *	Returns a pointer to the last occurance of the given Tcl_UniChar in
- *	the NULL-terminated UTF-8 string. The NULL terminator is considered
+ *	Returns a pointer to the last occurrence of the given Unicode character
+ *	in the NULL-terminated UTF-8 string. The NULL terminator is considered
  *	part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
  *
  * Results:
@@ -584,16 +591,23 @@ Tcl_UtfFindFirst(
 const char *
 Tcl_UtfFindLast(
     const char *src,		/* The UTF-8 string to be searched. */
-    int ch)			/* The Tcl_UniChar to search for. */
+    int ch)			/* The Unicode character to search for. */
 {
-    int len;
+    int len, fullchar;
     Tcl_UniChar find = 0;
     const char *last;
 
     last = NULL;
     while (1) {
 	len = TclUtfToUniChar(src, &find);
-	if (find == ch) {
+	fullchar = find;
+#if TCL_UTF_MAX == 4
+	if (!len) {
+	    len += TclUtfToUniChar(src, &find);
+	    fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+#endif
+	if (find == fullchar) {
 	    last = src;
 	}
 	if (*src == '\0') {
@@ -1058,6 +1072,15 @@ Tcl_UtfNcmp(
 
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+    /* map high surrogate characters to values > 0xffff */
+    if ((ch1 & 0xFC00) == 0xD800) {
+	ch1 += 0x4000;
+    }
+    if ((ch2 & 0xFC00) == 0xD800) {
+	ch2 += 0x4000;
+    }
+#endif
 	if (ch1 != ch2) {
 	    return (ch1 - ch2);
 	}
@@ -1090,6 +1113,7 @@ Tcl_UtfNcasecmp(
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1 = 0, ch2 = 0;
+
     while (numChars-- > 0) {
 	/*
 	 * n must be interpreted as chars, not bytes.
@@ -1098,6 +1122,15 @@ Tcl_UtfNcasecmp(
 	 */
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+    /* map high surrogate characters to values > 0xffff */
+    if ((ch1 & 0xFC00) == 0xD800) {
+	ch1 += 0x4000;
+    }
+    if ((ch2 & 0xFC00) == 0xD800) {
+	ch2 += 0x4000;
+    }
+#endif
 	if (ch1 != ch2) {
 	    ch1 = Tcl_UniCharToLower(ch1);
 	    ch2 = Tcl_UniCharToLower(ch2);
@@ -1112,7 +1145,7 @@ Tcl_UtfNcasecmp(
 /*
  *----------------------------------------------------------------------
  *
- * Tcl_UtfNcasecmp --
+ * TclUtfCasecmp --
  *
  *	Compare UTF chars of string cs to string ct case insensitively.
  *	Replacement for strcasecmp in Tcl core, in places where UTF-8 should
@@ -1132,11 +1165,20 @@ TclUtfCasecmp(
     const char *cs,		/* UTF string to compare to ct. */
     const char *ct)		/* UTF string cs is compared to. */
 {
-    while (*cs && *ct) {
-	Tcl_UniChar ch1, ch2;
+    Tcl_UniChar ch1 = 0, ch2 = 0;
 
+    while (*cs && *ct) {
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
+#if TCL_UTF_MAX == 4
+    /* map high surrogate characters to values > 0xffff */
+    if ((ch1 & 0xFC00) == 0xD800) {
+	ch1 += 0x4000;
+    }
+    if ((ch2 & 0xFC00) == 0xD800) {
+	ch2 += 0x4000;
+    }
+#endif
 	if (ch1 != ch2) {
 	    ch1 = Tcl_UniCharToLower(ch1);
 	    ch2 = Tcl_UniCharToLower(ch2);
-- 
cgit v0.12