From 48b529209c87473364215e8aef740e331f88415a Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Tue, 7 Nov 2017 12:15:30 +0000
Subject: Somewhat simplified implementation of TIP #389, in which the "string
 length" if characters > U+FFFF is considered to be 2, not 1.

---
 doc/StringObj.3        |  2 +-
 doc/ToUpper.3          |  2 +-
 doc/Utf.3              |  2 +-
 generic/tcl.decls      | 10 +++---
 generic/tcl.h          |  2 +-
 generic/tclCmdMZ.c     | 18 ++++++++--
 generic/tclDecls.h     | 20 +++++------
 generic/tclScan.c      | 12 +++++--
 generic/tclStringObj.c | 12 +++----
 generic/tclUtf.c       | 95 +++++++++++++++++++++++++++++++++++---------------
 tests/string.test      | 26 +++++++-------
 tests/utf.test         |  4 +--
 12 files changed, 132 insertions(+), 73 deletions(-)

diff --git a/doc/StringObj.3 b/doc/StringObj.3
index 7042cc8..8d9bb56 100644
--- a/doc/StringObj.3
+++ b/doc/StringObj.3
@@ -37,7 +37,7 @@ Tcl_UniChar *
 Tcl_UniChar *
 \fBTcl_GetUnicode\fR(\fIobjPtr\fR)
 .sp
-Tcl_UniChar
+int
 \fBTcl_GetUniChar\fR(\fIobjPtr, index\fR)
 .sp
 int
diff --git a/doc/ToUpper.3 b/doc/ToUpper.3
index b933e9c..14766da 100644
--- a/doc/ToUpper.3
+++ b/doc/ToUpper.3
@@ -13,7 +13,7 @@ Tcl_UniCharToUpper, Tcl_UniCharToLower, Tcl_UniCharToTitle, Tcl_UtfToUpper, Tcl_
 .nf
 \fB#include <tcl.h>\fR
 .sp
-Tcl_UniChar
+int
 \fBTcl_UniCharToUpper\fR(\fIch\fR)
 .sp
 Tcl_UniChar
diff --git a/doc/Utf.3 b/doc/Utf.3
index 378c806..5cd6b7df 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -63,7 +63,7 @@ const char *
 const char *
 \fBTcl_UtfPrev\fR(\fIsrc, start\fR)
 .sp
-Tcl_UniChar
+int
 \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR)
 .sp
 const char *
diff --git a/generic/tcl.decls b/generic/tcl.decls
index b2b91a9..e3ea9bc 100644
--- a/generic/tcl.decls
+++ b/generic/tcl.decls
@@ -1148,16 +1148,16 @@ declare 319 {
 	    Tcl_QueuePosition position)
 }
 declare 320 {
-    Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index)
+    int Tcl_UniCharAtIndex(const char *src, int index)
 }
 declare 321 {
-    Tcl_UniChar Tcl_UniCharToLower(int ch)
+    int Tcl_UniCharToLower(int ch)
 }
 declare 322 {
-    Tcl_UniChar Tcl_UniCharToTitle(int ch)
+    int Tcl_UniCharToTitle(int ch)
 }
 declare 323 {
-    Tcl_UniChar Tcl_UniCharToUpper(int ch)
+    int Tcl_UniCharToUpper(int ch)
 }
 declare 324 {
     int Tcl_UniCharToUtf(int ch, char *buf)
@@ -1351,7 +1351,7 @@ declare 380 {
     int Tcl_GetCharLength(Tcl_Obj *objPtr)
 }
 declare 381 {
-    Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index)
+    int Tcl_GetUniChar(Tcl_Obj *objPtr, int index)
 }
 declare 382 {
     Tcl_UniChar *Tcl_GetUnicode(Tcl_Obj *objPtr)
diff --git a/generic/tcl.h b/generic/tcl.h
index 07d841d..f874997 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -2201,7 +2201,7 @@ typedef struct Tcl_EncodingType {
  */
 
 #ifndef TCL_UTF_MAX
-#define TCL_UTF_MAX		3
+#define TCL_UTF_MAX		4
 #endif
 
 /*
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 2195aa1..b6a8fe9 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -309,7 +309,7 @@ Tcl_RegexpObjCmd(
 	    eflags = 0;
 	} else if (offset > stringLength) {
 	    eflags = TCL_REG_NOTBOL;
-	} else if (Tcl_GetUniChar(objPtr, offset-1) == (Tcl_UniChar)'\n') {
+	} else if (Tcl_GetUniChar(objPtr, offset-1) == '\n') {
 	    eflags = 0;
 	} else {
 	    eflags = TCL_REG_NOTBOL;
@@ -1218,6 +1218,12 @@ Tcl_SplitObjCmd(
 	for ( ; stringPtr < end; stringPtr += len) {
 	    len = TclUtfToUniChar(stringPtr, &ch);
 
+#if TCL_UTF_MAX == 4
+	    if (!len) {
+		continue;
+	    }
+#endif
+
 	    /*
 	     * Assume Tcl_UniChar is an integral type...
 	     */
@@ -1814,8 +1820,16 @@ StringIsCmd(
 	}
 	end = string1 + length1;
 	for (; string1 < end; string1 += length2, failat++) {
+	    int fullchar;
 	    length2 = TclUtfToUniChar(string1, &ch);
-	    if (!chcomp(ch)) {
+	    fullchar = ch;
+#if TCL_UTF_MAX == 4
+	    if (!length2) {
+	    	length2 = TclUtfToUniChar(string1, &ch);
+	    	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	    }
+#endif
+	    if (!chcomp(fullchar)) {
 		result = 0;
 		break;
 	    }
diff --git a/generic/tclDecls.h b/generic/tclDecls.h
index 464fc0f..5f83636 100644
--- a/generic/tclDecls.h
+++ b/generic/tclDecls.h
@@ -959,13 +959,13 @@ EXTERN void		Tcl_ThreadAlert(Tcl_ThreadId threadId);
 EXTERN void		Tcl_ThreadQueueEvent(Tcl_ThreadId threadId,
 				Tcl_Event *evPtr, Tcl_QueuePosition position);
 /* 320 */
-EXTERN Tcl_UniChar	Tcl_UniCharAtIndex(const char *src, int index);
+EXTERN int		Tcl_UniCharAtIndex(const char *src, int index);
 /* 321 */
-EXTERN Tcl_UniChar	Tcl_UniCharToLower(int ch);
+EXTERN int		Tcl_UniCharToLower(int ch);
 /* 322 */
-EXTERN Tcl_UniChar	Tcl_UniCharToTitle(int ch);
+EXTERN int		Tcl_UniCharToTitle(int ch);
 /* 323 */
-EXTERN Tcl_UniChar	Tcl_UniCharToUpper(int ch);
+EXTERN int		Tcl_UniCharToUpper(int ch);
 /* 324 */
 EXTERN int		Tcl_UniCharToUtf(int ch, char *buf);
 /* 325 */
@@ -1117,7 +1117,7 @@ EXTERN void		Tcl_SetUnicodeObj(Tcl_Obj *objPtr,
 /* 380 */
 EXTERN int		Tcl_GetCharLength(Tcl_Obj *objPtr);
 /* 381 */
-EXTERN Tcl_UniChar	Tcl_GetUniChar(Tcl_Obj *objPtr, int index);
+EXTERN int		Tcl_GetUniChar(Tcl_Obj *objPtr, int index);
 /* 382 */
 EXTERN Tcl_UniChar *	Tcl_GetUnicode(Tcl_Obj *objPtr);
 /* 383 */
@@ -2186,10 +2186,10 @@ typedef struct TclStubs {
     Tcl_Obj * (*tcl_SetVar2Ex) (Tcl_Interp *interp, const char *part1, const char *part2, Tcl_Obj *newValuePtr, int flags); /* 317 */
     void (*tcl_ThreadAlert) (Tcl_ThreadId threadId); /* 318 */
     void (*tcl_ThreadQueueEvent) (Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 319 */
-    Tcl_UniChar (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */
-    Tcl_UniChar (*tcl_UniCharToLower) (int ch); /* 321 */
-    Tcl_UniChar (*tcl_UniCharToTitle) (int ch); /* 322 */
-    Tcl_UniChar (*tcl_UniCharToUpper) (int ch); /* 323 */
+    int (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */
+    int (*tcl_UniCharToLower) (int ch); /* 321 */
+    int (*tcl_UniCharToTitle) (int ch); /* 322 */
+    int (*tcl_UniCharToUpper) (int ch); /* 323 */
     int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */
     CONST84_RETURN char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */
     int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */
@@ -2247,7 +2247,7 @@ typedef struct TclStubs {
     Tcl_Obj * (*tcl_NewUnicodeObj) (const Tcl_UniChar *unicode, int numChars); /* 378 */
     void (*tcl_SetUnicodeObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); /* 379 */
     int (*tcl_GetCharLength) (Tcl_Obj *objPtr); /* 380 */
-    Tcl_UniChar (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */
+    int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */
     Tcl_UniChar * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */
     Tcl_Obj * (*tcl_GetRange) (Tcl_Obj *objPtr, int first, int last); /* 383 */
     void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int length); /* 384 */
diff --git a/generic/tclScan.c b/generic/tclScan.c
index e1fcad4..7f71262 100644
--- a/generic/tclScan.c
+++ b/generic/tclScan.c
@@ -885,9 +885,17 @@ Tcl_ScanObjCmd(
 	     * Scan a single Unicode character.
 	     */
 
-	    string += TclUtfToUniChar(string, &sch);
+	    offset = TclUtfToUniChar(string, &sch);
+	    i = (int)sch;
+#if TCL_UTF_MAX == 4
+	    if (!offset) {
+		offset = Tcl_UtfToUniChar(string, &sch);
+		i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF);
+	    }
+#endif
+	    string += offset;
 	    if (!(flags & SCAN_SUPPRESS)) {
-		objPtr = Tcl_NewIntObj((int)sch);
+		objPtr = Tcl_NewIntObj(i);
 		Tcl_IncrRefCount(objPtr);
 		CLANG_ASSERT(objs);
 		objs[objIndex++] = objPtr;
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 3a35bcf..1ccd778 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -466,7 +466,7 @@ Tcl_GetCharLength(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_GetUniChar(
     Tcl_Obj *objPtr,		/* The object to get the Unicode charater
 				 * from. */
@@ -483,7 +483,7 @@ Tcl_GetUniChar(
     if (TclIsPureByteArray(objPtr)) {
 	unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL);
 
-	return (Tcl_UniChar) bytes[index];
+	return (int) bytes[index];
     }
 
     /*
@@ -507,7 +507,7 @@ Tcl_GetUniChar(
 	FillUnicodeRep(objPtr);
 	stringPtr = GET_STRING(objPtr);
     }
-    return stringPtr->unicode[index];
+    return (int) stringPtr->unicode[index];
 }
 
 /*
@@ -3462,7 +3462,6 @@ TclStringObjReverse(
 	     * Tcl_SetObjLength into growing the unicode rep buffer.
 	     */
 
-	    ch = 0;
 	    objPtr = Tcl_NewUnicodeObj(&ch, 1);
 	    Tcl_SetObjLength(objPtr, stringPtr->numChars);
 	    to = Tcl_GetUnicode(objPtr);
@@ -3565,7 +3564,7 @@ ExtendUnicodeRepWithString(
 {
     String *stringPtr = GET_STRING(objPtr);
     int needed, numOrigChars = 0;
-    Tcl_UniChar *dst;
+    Tcl_UniChar *dst, unichar = 0;
 
     if (stringPtr->hasUnicode) {
 	numOrigChars = stringPtr->numChars;
@@ -3588,7 +3587,8 @@ ExtendUnicodeRepWithString(
 	numAppendChars = 0;
     }
     for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) {
-	bytes += TclUtfToUniChar(bytes, dst);
+	bytes += TclUtfToUniChar(bytes, &unichar);
+	*dst = unichar;
     }
     *dst = 0;
 }
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 25cc2d1..859fe78 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -699,7 +699,7 @@ Tcl_UtfPrev(
  *---------------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_UniCharAtIndex(
     register const char *src,	/* The UTF-8 string to dereference. */
     register int index)		/* The position of the desired character. */
@@ -819,7 +819,8 @@ int
 Tcl_UtfToUpper(
     char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch = 0, upChar;
+    Tcl_UniChar ch = 0;
+    int upChar;
     char *src, *dst;
     int bytes;
 
@@ -830,7 +831,14 @@ Tcl_UtfToUpper(
     src = dst = str;
     while (*src) {
 	bytes = TclUtfToUniChar(src, &ch);
-	upChar = Tcl_UniCharToUpper(ch);
+	upChar = ch;
+	if (!bytes) {
+	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+	    bytes = TclUtfToUniChar(src, &ch);
+	    /* Combine surrogates */
+	    upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+	upChar = Tcl_UniCharToUpper(upChar);
 
 	/*
 	 * To keep badly formed Utf strings from getting inflated by the
@@ -872,7 +880,8 @@ int
 Tcl_UtfToLower(
     char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch = 0, lowChar;
+    Tcl_UniChar ch = 0;
+    int lowChar;
     char *src, *dst;
     int bytes;
 
@@ -883,7 +892,14 @@ Tcl_UtfToLower(
     src = dst = str;
     while (*src) {
 	bytes = TclUtfToUniChar(src, &ch);
-	lowChar = Tcl_UniCharToLower(ch);
+	lowChar = ch;
+	if (!bytes) {
+	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+	    bytes = TclUtfToUniChar(src, &ch);
+	    /* Combine surrogates */
+	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+	lowChar = Tcl_UniCharToLower(lowChar);
 
 	/*
 	 * To keep badly formed Utf strings from getting inflated by the
@@ -926,7 +942,8 @@ int
 Tcl_UtfToTitle(
     char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch = 0, titleChar, lowChar;
+    Tcl_UniChar ch = 0;
+    int titleChar, lowChar;
     char *src, *dst;
     int bytes;
 
@@ -939,7 +956,14 @@ Tcl_UtfToTitle(
 
     if (*src) {
 	bytes = TclUtfToUniChar(src, &ch);
-	titleChar = Tcl_UniCharToTitle(ch);
+	titleChar = ch;
+	if (!bytes) {
+	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+	    bytes = TclUtfToUniChar(src, &ch);
+	    /* Combine surrogates */
+	    titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+	titleChar = Tcl_UniCharToTitle(titleChar);
 
 	if (bytes < TclUtfCount(titleChar)) {
 	    memcpy(dst, src, (size_t) bytes);
@@ -951,7 +975,14 @@ Tcl_UtfToTitle(
     }
     while (*src) {
 	bytes = TclUtfToUniChar(src, &ch);
-	lowChar = Tcl_UniCharToLower(ch);
+	lowChar = ch;
+	if (!bytes) {
+	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+	    bytes = TclUtfToUniChar(src, &ch);
+	    /* Combine surrogates */
+	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+	lowChar = Tcl_UniCharToLower(lowChar);
 
 	if (bytes < TclUtfCount(lowChar)) {
 	    memcpy(dst, src, (size_t) bytes);
@@ -1159,16 +1190,18 @@ TclUtfCasecmp(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_UniCharToUpper(
     int ch)			/* Unicode character to convert. */
 {
-    int info = GetUniCharInfo(ch);
+    if (!UNICODE_OUT_OF_RANGE(ch)) {
+	int info = GetUniCharInfo(ch);
 
-    if (GetCaseType(info) & 0x04) {
-	ch -= GetDelta(info);
+	if (GetCaseType(info) & 0x04) {
+	    ch -= GetDelta(info);
+	}
     }
-    return (Tcl_UniChar) ch;
+    return ch & 0x1FFFFF;
 }
 
 /*
@@ -1187,16 +1220,18 @@ Tcl_UniCharToUpper(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_UniCharToLower(
     int ch)			/* Unicode character to convert. */
 {
-    int info = GetUniCharInfo(ch);
+    if (!UNICODE_OUT_OF_RANGE(ch)) {
+	int info = GetUniCharInfo(ch);
 
-    if (GetCaseType(info) & 0x02) {
-	ch += GetDelta(info);
+	if (GetCaseType(info) & 0x02) {
+	    ch += GetDelta(info);
+	}
     }
-    return (Tcl_UniChar) ch;
+    return ch & 0x1FFFFF;
 }
 
 /*
@@ -1215,23 +1250,25 @@ Tcl_UniCharToLower(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_UniCharToTitle(
     int ch)			/* Unicode character to convert. */
 {
-    int info = GetUniCharInfo(ch);
-    int mode = GetCaseType(info);
+    if (!UNICODE_OUT_OF_RANGE(ch)) {
+	int info = GetUniCharInfo(ch);
+	int mode = GetCaseType(info);
 
-    if (mode & 0x1) {
-	/*
-	 * Subtract or add one depending on the original case.
-	 */
+	if (mode & 0x1) {
+	    /*
+	     * Subtract or add one depending on the original case.
+	     */
 
-	ch += ((mode & 0x4) ? -1 : 1);
-    } else if (mode == 0x4) {
-	ch -= GetDelta(info);
+	    ch += ((mode & 0x4) ? -1 : 1);
+	} else if (mode == 0x4) {
+	    ch -= GetDelta(info);
+	}
     }
-    return (Tcl_UniChar) ch;
+    return ch & 0x1FFFFF;
 }
 
 /*
diff --git a/tests/string.test b/tests/string.test
index cb901b9..cebaf4c 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -1697,40 +1697,40 @@ test string-24.4 {string reverse command - unshared string} {
     string reverse $x$y
 } edcba
 test string-24.5 {string reverse command - shared unicode string} {
-    set x abcde\udead
+    set x abcde\ud0ad
     string reverse $x
-} \udeadedcba
+} \ud0adedcba
 test string-24.6 {string reverse command - unshared string} {
     set x abc
-    set y de\udead
+    set y de\ud0ad
     string reverse $x$y
-} \udeadedcba
+} \ud0adedcba
 test string-24.7 {string reverse command - simple case} {
     string reverse a
 } a
 test string-24.8 {string reverse command - simple case} {
-    string reverse \udead
-} \udead
+    string reverse \ud0ad
+} \ud0ad
 test string-24.9 {string reverse command - simple case} {
     string reverse {}
 } {}
 test string-24.10 {string reverse command - corner case} {
-    set x \ubeef\udead
+    set x \ubeef\ud0ad
     string reverse $x
-} \udead\ubeef
+} \ud0ad\ubeef
 test string-24.11 {string reverse command - corner case} {
     set x \ubeef
-    set y \udead
+    set y \ud0ad
     string reverse $x$y
-} \udead\ubeef
+} \ud0ad\ubeef
 test string-24.12 {string reverse command - corner case} {
     set x \ubeef
-    set y \udead
+    set y \ud0ad
     string is ascii [string reverse $x$y]
 } 0
 test string-24.13 {string reverse command - pure Unicode string} {
-    string reverse [string range \ubeef\udead\ubeef\udead\ubeef\udead 1 5]
-} \udead\ubeef\udead\ubeef\udead
+    string reverse [string range \ubeef\ud0ad\ubeef\ud0ad\ubeef\ud0ad 1 5]
+} \ud0ad\ubeef\ud0ad\ubeef\ud0ad
 test string-24.14 {string reverse command - pure bytearray} {
     binary scan [string reverse [binary format H* 010203]] H* x
     set x
diff --git a/tests/utf.test b/tests/utf.test
index 422ab08..45f9c0c 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -68,10 +68,10 @@ test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestrin
 } {1}
 test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body {
     string length [testbytestring "\xF0\x90\x80\x80"]
-} -result {1}
+} -result {2}
 test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {fullutf testbytestring} -body {
     string length [testbytestring "\xF4\x8F\xBF\xBF"]
-} -result {1}
+} -result {2}
 test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring {
     string length [testbytestring "\xF0\x8F\xBF\xBF"]
 } {4}
-- 
cgit v0.12