From a5a30fa3e2b30971c18a067291c9144bdd22199f Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 10 Mar 2021 12:55:25 +0000
Subject: TIP #597 implementation: "string is unicode" and new wtf-8 encoding

---
 doc/UniCharIsAlpha.3    |  7 ++++-
 generic/tcl.decls       |  4 +--
 generic/tclCmdMZ.c      | 11 ++++---
 generic/tclCompCmdsSZ.c | 24 +++++++++------
 generic/tclCompile.h    |  3 +-
 generic/tclDecls.h      | 13 +++++---
 generic/tclEncoding.c   | 19 ++++++++++--
 generic/tclStubInit.c   |  3 +-
 generic/tclUtf.c        | 30 ++++++++++++++++++
 tests/encoding.test     | 82 ++++++++++++++++++++++++++++++++++++++++++-------
 tests/string.test       | 26 ++++++++++++++--
 11 files changed, 182 insertions(+), 40 deletions(-)

diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3
index 61490ed..20828e4 100644
--- a/doc/UniCharIsAlpha.3
+++ b/doc/UniCharIsAlpha.3
@@ -8,7 +8,7 @@
 .so man.macros
 .BS
 .SH NAME
-Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters
+Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsUnicode, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters
 .SH SYNOPSIS
 .nf
 \fB#include <tcl.h>\fR
@@ -44,6 +44,9 @@ int
 \fBTcl_UniCharIsUpper\fR(\fIch\fR)
 .sp
 int
+\fBTcl_UniCharIsUnicode\fR(\fIch\fR)
+.sp
+int
 \fBTcl_UniCharIsWordChar\fR(\fIch\fR)
 .SH ARGUMENTS
 .AS int ch
@@ -81,6 +84,8 @@ with the various routines.
 .PP
 \fBTcl_UniCharIsUpper\fR tests if the character is an uppercase Unicode character.
 .PP
+\fBTcl_UniCharIsUpper\fR tests if the character is a Unicode character.
+.PP
 \fBTcl_UniCharIsWordChar\fR tests if the character is alphanumeric or
 a connector punctuation mark.
 
diff --git a/generic/tcl.decls b/generic/tcl.decls
index 51ece1a..03d43ce 100644
--- a/generic/tcl.decls
+++ b/generic/tcl.decls
@@ -2412,8 +2412,8 @@ declare 652 {
 declare 653 {
     unsigned char *TclGetByteArrayFromObj(Tcl_Obj *objPtr, size_t *lengthPtr)
 }
-declare 656 {
-    void TclUnusedStubEntry(void)
+declare 657 {
+    int Tcl_UniCharIsUnicode(int ch)
 }
 
 # ----- BASELINE -- FOR -- 8.7.0 ----- #
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index d020a93..61d1010 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1533,16 +1533,16 @@ StringIsCmd(
 	"boolean",	"dict",		"digit",	"double",
 	"entier",	"false",	"graph",	"integer",
 	"list",		"lower",	"print",	"punct",
-	"space",	"true",		"upper",	"wideinteger",
-	"wordchar",	"xdigit",	NULL
+	"space",	"true",		"upper",	"unicode",
+	"wideinteger", "wordchar",	"xdigit",	NULL
     };
     enum isClassesEnum {
 	STR_IS_ALNUM,	STR_IS_ALPHA,	STR_IS_ASCII,	STR_IS_CONTROL,
 	STR_IS_BOOL,	STR_IS_DICT,	STR_IS_DIGIT,	STR_IS_DOUBLE,
 	STR_IS_ENTIER,	STR_IS_FALSE,	STR_IS_GRAPH,	STR_IS_INT,
 	STR_IS_LIST,	STR_IS_LOWER,	STR_IS_PRINT,	STR_IS_PUNCT,
-	STR_IS_SPACE,	STR_IS_TRUE,	STR_IS_UPPER,	STR_IS_WIDE,
-	STR_IS_WORD,	STR_IS_XDIGIT
+	STR_IS_SPACE,	STR_IS_TRUE,	STR_IS_UPPER,	STR_IS_UNICODE,
+	STR_IS_WIDE,	STR_IS_WORD,	STR_IS_XDIGIT
     };
     static const char *const isOptions[] = {
 	"-strict", "-failindex", NULL
@@ -1872,6 +1872,9 @@ StringIsCmd(
     case STR_IS_UPPER:
 	chcomp = Tcl_UniCharIsUpper;
 	break;
+    case STR_IS_UNICODE:
+	chcomp = Tcl_UniCharIsUnicode;
+	break;
     case STR_IS_WORD:
 	chcomp = Tcl_UniCharIsWordChar;
 	break;
diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c
index 0bac52b..a160625 100644
--- a/generic/tclCompCmdsSZ.c
+++ b/generic/tclCompCmdsSZ.c
@@ -505,19 +505,19 @@ TclCompileStringIsCmd(
     Tcl_Token *tokenPtr = TokenAfter(parsePtr->tokenPtr);
     static const char *const isClasses[] = {
 	"alnum",	"alpha",	"ascii",	"control",
-	"boolean",	"dict", "digit",	"double",	"entier",
-	"false",	"graph",	"integer",	"list",
-	"lower",	"print",	"punct",	"space",
-	"true",		"upper",	"wideinteger",	"wordchar",
-	"xdigit",	NULL
+	"boolean",	"dict",		"digit",	"double",
+	"entier",	"false",	"graph",	"integer",
+	"list",		"lower",	"print",	"punct",
+	"space",	"true",		"upper",	"unicode",
+	"wideinteger", "wordchar",	"xdigit",	NULL
     };
     enum isClassesEnum {
 	STR_IS_ALNUM,	STR_IS_ALPHA,	STR_IS_ASCII,	STR_IS_CONTROL,
-	STR_IS_BOOL,	STR_IS_DICT, STR_IS_DIGIT,	STR_IS_DOUBLE,	STR_IS_ENTIER,
-	STR_IS_FALSE,	STR_IS_GRAPH,	STR_IS_INT,	STR_IS_LIST,
-	STR_IS_LOWER,	STR_IS_PRINT,	STR_IS_PUNCT,	STR_IS_SPACE,
-	STR_IS_TRUE,	STR_IS_UPPER,	STR_IS_WIDE,	STR_IS_WORD,
-	STR_IS_XDIGIT
+	STR_IS_BOOL,	STR_IS_DICT,	STR_IS_DIGIT,	STR_IS_DOUBLE,
+	STR_IS_ENTIER,	STR_IS_FALSE,	STR_IS_GRAPH,	STR_IS_INT,
+	STR_IS_LIST,	STR_IS_LOWER,	STR_IS_PRINT,	STR_IS_PUNCT,
+	STR_IS_SPACE,	STR_IS_TRUE,	STR_IS_UPPER,	STR_IS_UNICODE,
+	STR_IS_WIDE,	STR_IS_WORD,	STR_IS_XDIGIT
     };
     int t, range, allowEmpty = 0, end;
     InstStringClassType strClassType;
@@ -609,6 +609,9 @@ TclCompileStringIsCmd(
     case STR_IS_UPPER:
 	strClassType = STR_CLASS_UPPER;
 	goto compileStrClass;
+    case STR_IS_UNICODE:
+	strClassType = STR_CLASS_UNICODE;
+	goto compileStrClass;
     case STR_IS_WORD:
 	strClassType = STR_CLASS_WORD;
 	goto compileStrClass;
@@ -1415,6 +1418,7 @@ StringClassDesc const tclStringClassTable[] = {
     {"upper",	Tcl_UniCharIsUpper},
     {"word",	Tcl_UniCharIsWordChar},
     {"xdigit",	UniCharIsHexDigit},
+    {"unicode",	Tcl_UniCharIsUnicode},
     {"",	NULL}
 };
 
diff --git a/generic/tclCompile.h b/generic/tclCompile.h
index 21a27f7..6a5faaf 100644
--- a/generic/tclCompile.h
+++ b/generic/tclCompile.h
@@ -922,8 +922,9 @@ typedef enum InstStringClassType {
     STR_CLASS_UPPER,		/* Unicode upper-case alphabet characters. */
     STR_CLASS_WORD,		/* Unicode word (alphabetic, digit, connector
 				 * punctuation) characters. */
-    STR_CLASS_XDIGIT		/* Characters that can be used as digits in
+    STR_CLASS_XDIGIT,		/* Characters that can be used as digits in
 				 * hexadecimal numbers ([0-9A-Fa-f]). */
+    STR_CLASS_UNICODE		/* Unicode characters. */
 } InstStringClassType;
 
 typedef struct StringClassDesc {
diff --git a/generic/tclDecls.h b/generic/tclDecls.h
index 4b91817..bb31355 100644
--- a/generic/tclDecls.h
+++ b/generic/tclDecls.h
@@ -1933,8 +1933,9 @@ EXTERN unsigned char *	TclGetByteArrayFromObj(Tcl_Obj *objPtr,
 				size_t *lengthPtr);
 /* Slot 654 is reserved */
 /* Slot 655 is reserved */
-/* 656 */
-EXTERN void		TclUnusedStubEntry(void);
+/* Slot 656 is reserved */
+/* 657 */
+EXTERN int		Tcl_UniCharIsUnicode(int ch);
 
 typedef struct {
     const struct TclPlatStubs *tclPlatStubs;
@@ -2626,7 +2627,8 @@ typedef struct TclStubs {
     unsigned char * (*tclGetByteArrayFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 653 */
     void (*reserved654)(void);
     void (*reserved655)(void);
-    void (*tclUnusedStubEntry) (void); /* 656 */
+    void (*reserved656)(void);
+    int (*tcl_UniCharIsUnicode) (int ch); /* 657 */
 } TclStubs;
 
 extern const TclStubs *tclStubsPtr;
@@ -3965,8 +3967,9 @@ extern const TclStubs *tclStubsPtr;
 	(tclStubsPtr->tclGetByteArrayFromObj) /* 653 */
 /* Slot 654 is reserved */
 /* Slot 655 is reserved */
-#define TclUnusedStubEntry \
-	(tclStubsPtr->tclUnusedStubEntry) /* 656 */
+/* Slot 656 is reserved */
+#define Tcl_UniCharIsUnicode \
+	(tclStubsPtr->tcl_UniCharIsUnicode) /* 657 */
 
 #endif /* defined(USE_TCL_STUBS) */
 
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index c4ef159..d994f10 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -517,6 +517,9 @@ FillEncodingFileMap(void)
  *---------------------------------------------------------------------------
  */
 
+#define FLAG_LE 1
+#define FLAG_WTF 2
+
 void
 TclInitEncodingSubsystem(void)
 {
@@ -559,13 +562,16 @@ TclInitEncodingSubsystem(void)
     type.nullSize	= 1;
     type.clientData	= NULL;
     Tcl_CreateEncoding(&type);
+    type.clientData	= INT2PTR(FLAG_WTF);
+    type.encodingName	= "wtf-8";
+    Tcl_CreateEncoding(&type);
 
     type.toUtfProc	= Utf16ToUtfProc;
     type.fromUtfProc    = UtfToUcs2Proc;
     type.freeProc	= NULL;
     type.nullSize	= 2;
     type.encodingName   = "ucs-2le";
-    type.clientData	= INT2PTR(1);
+    type.clientData	= INT2PTR(FLAG_LE);
     Tcl_CreateEncoding(&type);
     type.encodingName   = "ucs-2be";
     type.clientData	= INT2PTR(0);
@@ -579,7 +585,7 @@ TclInitEncodingSubsystem(void)
     type.freeProc	= NULL;
     type.nullSize	= 2;
     type.encodingName   = "utf-16le";
-    type.clientData	= INT2PTR(1);
+    type.clientData	= INT2PTR(FLAG_LE);
     Tcl_CreateEncoding(&type);
     type.encodingName   = "utf-16be";
     type.clientData	= INT2PTR(0);
@@ -2271,7 +2277,7 @@ UtfExtToUtfIntProc(
 
 static int
 UtfToUtfProc(
-    TCL_UNUSED(ClientData),
+    void *clientData,	/* flags */
     const char *src,		/* Source string in UTF-8. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
@@ -2302,6 +2308,7 @@ UtfToUtfProc(
     const char *srcStart, *srcEnd, *srcClose;
     const char *dstStart, *dstEnd;
     int result, numChars, charLimit = INT_MAX;
+    int encflags = PTR2INT(clientData);
     int *chPtr = (int *) statePtr;
 
     if (flags & TCL_ENCODING_START) {
@@ -2370,6 +2377,9 @@ UtfToUtfProc(
 		int low = *chPtr;
 		size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
 		if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) {
+		    if ((pureNullMode == 1) && !(encflags & FLAG_WTF)) {
+			*chPtr = 0xFFFD;
+		    }
 			*dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
 			*dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
 			*dst++ = (char) ((*chPtr | 0x80) & 0xBF);
@@ -2378,6 +2388,9 @@ UtfToUtfProc(
 		src += len;
 		dst += Tcl_UniCharToUtf(*chPtr, dst);
 		*chPtr = low;
+	    } else if ((pureNullMode == 1) && !(encflags & FLAG_WTF)
+		    && !Tcl_UniCharIsUnicode(*chPtr)) {
+		*chPtr = 0xFFFD;
 	    }
 	    dst += Tcl_UniCharToUtf(*chPtr, dst);
 	}
diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c
index fa5562d..cd1ce81 100644
--- a/generic/tclStubInit.c
+++ b/generic/tclStubInit.c
@@ -1905,7 +1905,8 @@ const TclStubs tclStubs = {
     TclGetByteArrayFromObj, /* 653 */
     0, /* 654 */
     0, /* 655 */
-    TclUnusedStubEntry, /* 656 */
+    0, /* 656 */
+    Tcl_UniCharIsUnicode, /* 657 */
 };
 
 /* !END!: Do not edit above this line. */
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e096c06..2687a1d 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2187,6 +2187,36 @@ Tcl_UniCharIsUpper(
 /*
  *----------------------------------------------------------------------
  *
+ * Tcl_UniCharIsUnicode --
+ *
+ *	Test if a character is a Unicode character.
+ *
+ * Results:
+ *	Returns non-zero if character belongs to the Unicode set.
+ *
+ *	Excluded are:
+ *	  1) All characters > U+10FFFF
+ *	  2) Surrogates U+D800 - U+DFFF
+ *	  3) Last 2 characters of each plane, so U+??FFFE  and U+??FFFF
+ *	  4) The characters in the range U+FDD0 - U+FDEF
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsUnicode(
+    int ch)			/* Unicode character to test. */
+{
+    return ((unsigned int)ch <= 0x10FFFF) && ((ch & 0xFFF800) != 0xD800)
+	    && ((ch & 0xFFFE) != 0xFFFE) && ((unsigned int)(ch - 0xFDD0) >= 32);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
  * Tcl_UniCharIsWordChar --
  *
  *	Test if a character is alphanumeric or a connector punctuation mark.
diff --git a/tests/encoding.test b/tests/encoding.test
index b1150c6..76b830d 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -340,61 +340,61 @@ test encoding-15.5 {UtfToUtfProc emoji character input} {
 } "4 😂"
 test encoding-15.6 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\uDE02\uD83D
-    set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D]
+    set y [encoding convertto wtf-8 \uDE02\uD83D\uDE02\uD83D]
     binary scan $y H* z
     list [string length $y] $z
 } {10 edb882f09f9882eda0bd}
 test encoding-15.7 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\uD83D
-    set y [encoding convertto utf-8 \uDE02\uD83D\uD83D]
+    set y [encoding convertto wtf-8 \uDE02\uD83D\uD83D]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 9 edb882eda0bdeda0bd}
 test encoding-15.8 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\xE9
-    set y [encoding convertto utf-8 \uDE02\uD83D\xE9]
+    set y [encoding convertto wtf-8 \uDE02\uD83D\xE9]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 8 edb882eda0bdc3a9}
 test encoding-15.9 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83DX
-    set y [encoding convertto utf-8 \uDE02\uD83DX]
+    set y [encoding convertto wtf-8 \uDE02\uD83DX]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 7 edb882eda0bd58}
 test encoding-15.10 {UtfToUtfProc high surrogate character output} {
     set x \uDE02\xE9
-    set y [encoding convertto utf-8 \uDE02\xE9]
+    set y [encoding convertto wtf-8 \uDE02\xE9]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 5 edb882c3a9}
 test encoding-15.11 {UtfToUtfProc low surrogate character output} {
     set x \uDA02\xE9
-    set y [encoding convertto utf-8 \uDA02\xE9]
+    set y [encoding convertto wtf-8 \uDA02\xE9]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 5 eda882c3a9}
 test encoding-15.12 {UtfToUtfProc high surrogate character output} {
     set x \uDE02Y
-    set y [encoding convertto utf-8 \uDE02Y]
+    set y [encoding convertto wtf-8 \uDE02Y]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 4 edb88259}
 test encoding-15.13 {UtfToUtfProc low surrogate character output} {
     set x \uDA02Y
-    set y [encoding convertto utf-8 \uDA02Y]
+    set y [encoding convertto wtf-8 \uDA02Y]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 4 eda88259}
 test encoding-15.14 {UtfToUtfProc high surrogate character output} {
     set x \uDE02
-    set y [encoding convertto utf-8 \uDE02]
+    set y [encoding convertto wtf-8 \uDE02]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {1 3 edb882}
 test encoding-15.15 {UtfToUtfProc low surrogate character output} {
     set x \uDA02
-    set y [encoding convertto utf-8 \uDA02]
+    set y [encoding convertto wtf-8 \uDA02]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {1 3 eda882}
@@ -409,6 +409,66 @@ test encoding-15.17 {UtfToUtfProc emoji character output} {
     binary scan $y H* z
     list [string length $y] $z
 } {4 f09f9882}
+test encoding-15.18 {UtfToUtfProc emoji character output} {
+    set x \uDE02\uD83D\uDE02\uD83D
+    set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D]
+    binary scan $y H* z
+    list [string length $y] $z
+} {10 efbfbdf09f9882efbfbd}
+test encoding-15.19 {UtfToUtfProc emoji character output} {
+    set x \uDE02\uD83D\uD83D
+    set y [encoding convertto utf-8 \uDE02\uD83D\uD83D]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {3 9 efbfbdefbfbdefbfbd}
+test encoding-15.20 {UtfToUtfProc emoji character output} {
+    set x \uDE02\uD83D\xE9
+    set y [encoding convertto utf-8 \uDE02\uD83D\xE9]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {3 8 efbfbdefbfbdc3a9}
+test encoding-15.21 {UtfToUtfProc emoji character output} {
+    set x \uDE02\uD83DX
+    set y [encoding convertto utf-8 \uDE02\uD83DX]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {3 7 efbfbdefbfbd58}
+test encoding-15.22 {UtfToUtfProc high surrogate character output} {
+    set x \uDE02\xE9
+    set y [encoding convertto utf-8 \uDE02\xE9]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {2 5 efbfbdc3a9}
+test encoding-15.23 {UtfToUtfProc low surrogate character output} {
+    set x \uDA02\xE9
+    set y [encoding convertto utf-8 \uDA02\xE9]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {2 5 efbfbdc3a9}
+test encoding-15.24 {UtfToUtfProc high surrogate character output} {
+    set x \uDE02Y
+    set y [encoding convertto utf-8 \uDE02Y]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {2 4 efbfbd59}
+test encoding-15.25 {UtfToUtfProc low surrogate character output} {
+    set x \uDA02Y
+    set y [encoding convertto utf-8 \uDA02Y]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {2 4 efbfbd59}
+test encoding-15.26 {UtfToUtfProc high surrogate character output} {
+    set x \uDE02
+    set y [encoding convertto utf-8 \uDE02]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {1 3 efbfbd}
+test encoding-15.27 {UtfToUtfProc low surrogate character output} {
+    set x \uDA02
+    set y [encoding convertto utf-8 \uDA02]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {1 3 efbfbd}
 
 test encoding-16.1 {Utf16ToUtfProc} -body {
     set val [encoding convertfrom utf-16 NN]
@@ -742,7 +802,7 @@ test encoding-28.0 {all encodings load} -body {
 		llength $name
 	}
 	return $count
-} -result [expr {[info exists ::tcl_precision] ? 86 : 85}]
+} -result [expr {[info exists ::tcl_precision] ? 87 : 86}]
 
 runtests
 
diff --git a/tests/string.test b/tests/string.test
index 0eaa3da..b3b278c 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -525,10 +525,10 @@ test string-6.4.$noComp {string is, too many args} {
 } {1 {wrong # args: should be "string is class ?-strict? ?-failindex var? str"}}
 test string-6.5.$noComp {string is, class check} {
     list [catch {run {string is bogus str}} msg] $msg
-} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}}
+} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}}
 test string-6.6.$noComp {string is, ambiguous class} {
     list [catch {run {string is al str}} msg] $msg
-} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}}
+} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}}
 test string-6.7.$noComp {string is alpha, all ok} {
     run {string is alpha -strict -failindex var abc}
 } 1
@@ -961,6 +961,28 @@ test string-6.130.1.$noComp {string is entier, false on bad octal} {
 test string-6.131.$noComp {string is entier, false on bad hex} {
     list [run {string is entier -fail var 0X12345611234123456123456562345612345612345612345612345612345612345612345612345612345345XYZ}] $var
 } {0 88}
+test string-6.132.$noComp {string is unicode} {
+    run {string is unicode \U10FFFD\uD7FF\uE000\uFDCF\uFDF0}
+} 1
+test string-6.133.$noComp {string is unicode, upper surrogate} {
+    run {string is unicode \uD800}
+} 0
+test string-6.134.$noComp {string is unicode, lower surrogate} {
+    run {string is unicode \uDFFF}
+} 0
+test string-6.135.$noComp {string is unicode, noncharacter} {
+    run {string is unicode \uFFFE}
+} 0
+test string-6.136.$noComp {string is unicode, noncharacter} {
+    run {string is unicode \uFFFF}
+} 0
+test string-6.137.$noComp {string is unicode, noncharacter} {
+    run {string is unicode \uFDD0}
+} 0
+test string-6.138.$noComp {string is unicode, noncharacter} {
+    run {string is unicode \uFDEF}
+} 0
+
 
 test string-7.1.$noComp {string last, not enough args} {
     list [catch {run {string last a}} msg] $msg
-- 
cgit v0.12


From b51e777fe970a9ffcf0919f7262a6d474aeeaf2b Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 10 Mar 2021 15:00:44 +0000
Subject: Fix documentation

---
 doc/UniCharIsAlpha.3 | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3
index 20828e4..a07af9a 100644
--- a/doc/UniCharIsAlpha.3
+++ b/doc/UniCharIsAlpha.3
@@ -84,7 +84,8 @@ with the various routines.
 .PP
 \fBTcl_UniCharIsUpper\fR tests if the character is an uppercase Unicode character.
 .PP
-\fBTcl_UniCharIsUpper\fR tests if the character is a Unicode character.
+\fBTcl_UniCharIsUnicode\fR tests if the character is a Unicode character, not being
+a surrogate or noncharacter.
 .PP
 \fBTcl_UniCharIsWordChar\fR tests if the character is alphanumeric or
 a connector punctuation mark.
-- 
cgit v0.12


From 74f570670c3ca45a4ff87a051d9ecdadb396dc56 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 17 Mar 2021 12:46:31 +0000
Subject: Add wtf-16 encodings to the set. With testcases

---
 generic/tclEncoding.c | 119 ++++++++++++++++++++++----------------------------
 tests/encoding.test   |  12 +++--
 2 files changed, 62 insertions(+), 69 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 2198c33..2707972 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -587,12 +587,21 @@ TclInitEncodingSubsystem(void)
     type.encodingName   = "utf-16le";
     type.clientData	= INT2PTR(FLAG_LE);
     Tcl_CreateEncoding(&type);
+    type.encodingName   = "wtf-16le";
+    type.clientData	= INT2PTR(FLAG_LE + FLAG_WTF);
+    Tcl_CreateEncoding(&type);
     type.encodingName   = "utf-16be";
     type.clientData	= INT2PTR(0);
     Tcl_CreateEncoding(&type);
+    type.encodingName   = "wtf-16be";
+    type.clientData	= INT2PTR(FLAG_WTF);
+    Tcl_CreateEncoding(&type);
     type.encodingName   = "utf-16";
     type.clientData	= INT2PTR(isLe.c);
     Tcl_CreateEncoding(&type);
+    type.encodingName   = "wtf-16";
+    type.clientData	= INT2PTR(isLe.c + FLAG_WTF);
+    Tcl_CreateEncoding(&type);
 
 #ifndef TCL_NO_DEPRECATED
     type.encodingName   = "unicode";
@@ -2281,11 +2290,7 @@ UtfToUtfProc(
     const char *src,		/* Source string in UTF-8. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
-    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
-				 * information used during a piecewise
-				 * conversion. Contents of statePtr are
-				 * initialized and/or reset by conversion
-				 * routine under control of flags argument. */
+    TCL_UNUSED(Tcl_EncodingState *),
     char *dst,			/* Output buffer in which converted string is
 				 * stored. */
     int dstLen,			/* The maximum length of output buffer in
@@ -2309,11 +2314,8 @@ UtfToUtfProc(
     const char *dstStart, *dstEnd;
     int result, numChars, charLimit = INT_MAX;
     int encflags = PTR2INT(clientData);
-    int *chPtr = (int *) statePtr;
+    int ch;
 
-    if (flags & TCL_ENCODING_START) {
-    	*statePtr = 0;
-    }
     result = TCL_OK;
 
     srcStart = src;
@@ -2350,7 +2352,6 @@ UtfToUtfProc(
 	     */
 
 	    *dst++ = *src++;
-	    *chPtr = 0; /* reset surrogate handling */
 	} else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 &&
 		(src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) {
 	    /*
@@ -2358,7 +2359,6 @@ UtfToUtfProc(
 	     */
 
 	    *dst++ = 0;
-	    *chPtr = 0; /* reset surrogate handling */
 	    src += 2;
 	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
 	    /*
@@ -2367,32 +2367,32 @@ UtfToUtfProc(
 	     * incomplete char its bytes are made to represent themselves.
 	     */
 
-	    *chPtr = UCHAR(*src);
+	    ch = UCHAR(*src);
 	    src += 1;
-	    dst += Tcl_UniCharToUtf(*chPtr, dst);
+	    dst += Tcl_UniCharToUtf(ch, dst);
 	} else {
-	    src += TclUtfToUCS4(src, chPtr);
-	    if ((*chPtr | 0x7FF) == 0xDFFF) {
+	    src += TclUtfToUCS4(src, &ch);
+	    if ((ch | 0x7FF) == 0xDFFF) {
 		/* A surrogate character is detected, handle especially */
-		int low = *chPtr;
+		int low = ch;
 		size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
-		if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) {
+		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
 		    if ((pureNullMode == 1) && !(encflags & FLAG_WTF)) {
-			*chPtr = 0xFFFD;
+			ch = 0xFFFD;
 		    }
-			*dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
-			*dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
-			*dst++ = (char) ((*chPtr | 0x80) & 0xBF);
+			*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
+			*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
+			*dst++ = (char) ((ch | 0x80) & 0xBF);
 			continue;
 		}
 		src += len;
-		dst += Tcl_UniCharToUtf(*chPtr, dst);
-		*chPtr = low;
+		dst += Tcl_UniCharToUtf(ch, dst);
+		ch = low;
 	    } else if ((pureNullMode == 1) && !(encflags & FLAG_WTF)
-		    && !Tcl_UniCharIsUnicode(*chPtr)) {
-		*chPtr = 0xFFFD;
+		    && !Tcl_UniCharIsUnicode(ch)) {
+		ch = 0xFFFD;
 	    }
-	    dst += Tcl_UniCharToUtf(*chPtr, dst);
+	    dst += Tcl_UniCharToUtf(ch, dst);
 	}
     }
 
@@ -2420,7 +2420,7 @@ UtfToUtfProc(
 
 static int
 Utf16ToUtfProc(
-    ClientData clientData,	/* != NULL means LE, == NUL means BE */
+    ClientData clientData,	/* flags */
     const char *src,		/* Source string in Unicode. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
@@ -2444,6 +2444,7 @@ Utf16ToUtfProc(
     const char *srcStart, *srcEnd;
     const char *dstEnd, *dstStart;
     int result, numChars, charLimit = INT_MAX;
+    int encflags = PTR2INT(clientData);
     unsigned short ch;
 
     if (flags & TCL_ENCODING_CHAR_LIMIT) {
@@ -2457,7 +2458,7 @@ Utf16ToUtfProc(
 	srcLen--;
     }
     /* If last code point is a high surrogate, we cannot handle that yet */
-    if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) {
+    if ((srcLen >= 2) && ((src[srcLen - ((encflags & FLAG_LE)?1:2)] & 0xFC) == 0xD8)) {
 	result = TCL_CONVERT_MULTIBYTE;
 	srcLen-= 2;
     }
@@ -2474,7 +2475,7 @@ Utf16ToUtfProc(
 	    break;
 	}
 
-	if (clientData) {
+	if (encflags & FLAG_LE) {
 	    ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
 	} else {
 	    ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
@@ -2515,15 +2516,11 @@ Utf16ToUtfProc(
 
 static int
 UtfToUtf16Proc(
-    ClientData clientData,	/* != NULL means LE, == NUL means BE */
+    ClientData clientData1,	/* flags */
     const char *src,		/* Source string in UTF-8. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
-    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
-				 * information used during a piecewise
-				 * conversion. Contents of statePtr are
-				 * initialized and/or reset by conversion
-				 * routine under control of flags argument. */
+    TCL_UNUSED(Tcl_EncodingState *),
     char *dst,			/* Output buffer in which converted string is
 				 * stored. */
     int dstLen,			/* The maximum length of output buffer in
@@ -2542,11 +2539,9 @@ UtfToUtf16Proc(
 {
     const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
     int result, numChars;
-    Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+    int encflags = PTR2INT(clientData1);
+    int ch;
 
-    if (flags & TCL_ENCODING_START) {
-    	*statePtr = 0;
-    }
     srcStart = src;
     srcEnd = src + srcLen;
     srcClose = srcEnd;
@@ -2572,38 +2567,30 @@ UtfToUtf16Proc(
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
-	src += TclUtfToUniChar(src, chPtr);
-
-	if (clientData) {
-#if TCL_UTF_MAX > 3
-	    if (*chPtr <= 0xFFFF) {
-		*dst++ = (*chPtr & 0xFF);
-		*dst++ = (*chPtr >> 8);
+	src += TclUtfToUCS4(src, &ch);
+	if (!(encflags & FLAG_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+	    ch = 0xFFFD;
+	}
+	if (encflags & FLAG_LE) {
+	    if (ch <= 0xFFFF) {
+		*dst++ = (ch & 0xFF);
+		*dst++ = (ch >> 8);
 	    } else {
-		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
-		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
-		*dst++ = (*chPtr & 0xFF);
-		*dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
+		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+		*dst++ = (ch & 0xFF);
+		*dst++ = ((ch >> 8) & 0x3) | 0xDC;
 	    }
-#else
-	    *dst++ = (*chPtr & 0xFF);
-	    *dst++ = (*chPtr >> 8);
-#endif
 	} else {
-#if TCL_UTF_MAX > 3
-	    if (*chPtr <= 0xFFFF) {
-		*dst++ = (*chPtr >> 8);
-		*dst++ = (*chPtr & 0xFF);
+	    if (ch <= 0xFFFF) {
+		*dst++ = (ch >> 8);
+		*dst++ = (ch & 0xFF);
 	    } else {
-		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
-		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
-		*dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
-		*dst++ = (*chPtr & 0xFF);
+		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
+		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
+		*dst++ = ((ch >> 8) & 0x3) | 0xDC;
+		*dst++ = (ch & 0xFF);
 	    }
-#else
-	    *dst++ = (*chPtr >> 8);
-	    *dst++ = (*chPtr & 0xFF);
-#endif
 	}
     }
     *srcReadPtr = src - srcStart;
diff --git a/tests/encoding.test b/tests/encoding.test
index 76b830d..43aecbb 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -495,14 +495,20 @@ test encoding-17.1 {UtfToUtf16Proc} -body {
     encoding convertto utf-16 "\U460DC"
 } -result "\xD8\xD8\xDC\xDC"
 test encoding-17.2 {UtfToUtf16Proc} -body {
-    encoding convertto utf-16 "\uDCDC"
+    encoding convertto wtf-16 "\uDCDC"
 } -result "\xDC\xDC"
 test encoding-17.3 {UtfToUtf16Proc} -body {
-    encoding convertto utf-16 "\uD8D8"
+    encoding convertto wtf-16 "\uD8D8"
 } -result "\xD8\xD8"
 test encoding-17.4 {UtfToUcs2Proc} -body {
     encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"]
 } -result "\uFFFD"
+test encoding-17.5 {UtfToUtf16Proc} -body {
+    encoding convertto utf-16be "\uDCDC"
+} -result "\xFF\xFD"
+test encoding-17.6 {UtfToUtf16Proc} -body {
+    encoding convertto utf-16le "\uD8D8"
+} -result "\xFD\xFF"
 
 test encoding-18.1 {TableToUtfProc} {
 } {}
@@ -802,7 +808,7 @@ test encoding-28.0 {all encodings load} -body {
 		llength $name
 	}
 	return $count
-} -result [expr {[info exists ::tcl_precision] ? 87 : 86}]
+} -result [expr {[info exists ::tcl_precision] ? 90 : 89}]
 
 runtests
 
-- 
cgit v0.12


From 1806e5755f1240beb778c171d3b7a2797276adf2 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Sun, 28 Mar 2021 22:01:51 +0000
Subject: Make a start with CESU-8 encoder/decoder. Not finished yet

---
 generic/tclEncoding.c | 25 +++++++++++++++++--------
 tests/encoding.test   |  2 +-
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index b69f7fc..a158269 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -513,7 +513,8 @@ FillEncodingFileMap(void)
 /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
 #define TCL_ENCODING_MODIFIED	0x20	/* Converting NULL bytes to 0xC0 0x80 */
 #define TCL_ENCODING_LE		0x80	/* Little-endian encoding, for ucs-2/utf-16 only */
-#define TCL_ENCODING_WTF	0x100	/* For wtf-8 encoding */
+#define TCL_ENCODING_WTF	0x100	/* For WTF-8 encoding, don't check for surrogates/noncharacters */
+#define TCL_ENCODING_UTF	0x200	/* For UTF-8 encoding, allow 4-byte output sequences */
 
 void
 TclInitEncodingSubsystem(void)
@@ -555,11 +556,14 @@ TclInitEncodingSubsystem(void)
     type.fromUtfProc	= UtfToUtfProc;
     type.freeProc	= NULL;
     type.nullSize	= 1;
-    type.clientData	= NULL;
+    type.clientData	= INT2PTR(TCL_ENCODING_UTF);
     Tcl_CreateEncoding(&type);
-    type.clientData	= INT2PTR(TCL_ENCODING_WTF);
+    type.clientData	= INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF);
     type.encodingName	= "wtf-8";
     Tcl_CreateEncoding(&type);
+    type.clientData	= INT2PTR(0);
+    type.encodingName	= "cesu-8";
+    Tcl_CreateEncoding(&type);
 
     type.toUtfProc	= Utf16ToUtfProc;
     type.fromUtfProc    = UtfToUcs2Proc;
@@ -1150,7 +1154,7 @@ Tcl_ExternalToUtfDString(
 	srcLen = encodingPtr->lengthProc(src);
     }
 
-    flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED;
+    flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
 
     while (1) {
 	result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
@@ -1266,7 +1270,7 @@ Tcl_ExternalToUtf(
 
 	dstLen--;
     }
-    flags |= TCL_ENCODING_MODIFIED;
+    flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
     do {
 	Tcl_EncodingState savedState = *statePtr;
 
@@ -2221,8 +2225,8 @@ UtfToUtfProc(
     }
 
     dstStart = dst;
-    dstEnd = dst + dstLen - TCL_UTF_MAX;
     flags |= PTR2INT(clientData);
+    dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
 
     for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
 	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
@@ -2269,18 +2273,22 @@ UtfToUtfProc(
 	    src += 1;
 	    dst += Tcl_UniCharToUtf(ch, dst);
 	} else {
+	    int low;
 	    size_t len = TclUtfToUCS4(src, &ch);
 	    if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) {
 		result = TCL_CONVERT_SYNTAX;
 		break;
 	    }
 	    src += len;
-	    if ((ch | 0x7FF) == 0xDFFF) {
+	    if (!(flags & TCL_ENCODING_UTF)) {
+		// TODO : handle chars > U+FFFF
+		goto cesu8;
+	    } else if ((ch | 0x7FF) == 0xDFFF) {
 		/*
 		 * A surrogate character is detected, handle especially.
 		 */
 
-		int low = ch;
+		low = ch;
 		len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
 
 		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
@@ -2293,6 +2301,7 @@ UtfToUtfProc(
 			    ch = 0xFFFD;
 			}
 		    }
+		cesu8:
 		    *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
 		    *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
 		    *dst++ = (char) ((ch | 0x80) & 0xBF);
diff --git a/tests/encoding.test b/tests/encoding.test
index 43aecbb..0ce009f 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -808,7 +808,7 @@ test encoding-28.0 {all encodings load} -body {
 		llength $name
 	}
 	return $count
-} -result [expr {[info exists ::tcl_precision] ? 90 : 89}]
+} -result [expr {[info exists ::tcl_precision] ? 91 : 90}]
 
 runtests
 
-- 
cgit v0.12


From 958892fdd0e13a651556c8f6d06207ed5f759974 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Mon, 29 Mar 2021 09:51:54 +0000
Subject: Finish CESU-8 encoder/decoder

---
 generic/tclEncoding.c | 9 ++++++++-
 tests/encoding.test   | 6 ++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index a158269..79f9896 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2281,7 +2281,14 @@ UtfToUtfProc(
 	    }
 	    src += len;
 	    if (!(flags & TCL_ENCODING_UTF)) {
-		// TODO : handle chars > U+FFFF
+		if (ch > 0xFFFF) {
+		    /* CESU-8 6-byte sequence for chars > U+FFFF */
+		    ch -= 0x10000;
+		    *dst++ = 0xED;
+		    *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
+		    *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
+		    ch = (ch & 0x0CFF) | 0xDC00;
+		}
 		goto cesu8;
 	    } else if ((ch | 0x7FF) == 0xDFFF) {
 		/*
diff --git a/tests/encoding.test b/tests/encoding.test
index 0ce009f..2964a56 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -469,6 +469,12 @@ test encoding-15.27 {UtfToUtfProc low surrogate character output} {
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {1 3 efbfbd}
+test encoding-15.28 {UtfToUtfProc CESU-8 6-byte sequence} {
+    set x \U10000
+    set y [encoding convertto cesu-8 \U10000]
+    binary scan $y H* z
+    list [string length $x] [string length $y] $z
+} {2 6 eda080edb080}
 
 test encoding-16.1 {Utf16ToUtfProc} -body {
     set val [encoding convertfrom utf-16 NN]
-- 
cgit v0.12


From 48a57e0116ab6513050d1e280014bde2010c8903 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Tue, 30 Mar 2021 14:14:02 +0000
Subject: Bugfix (backported from  encodings-with-flags branch): Use correct
 byte/char positions in error-situation

---
 generic/tclEncoding.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 8f4612b..445e613 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -1161,13 +1161,12 @@ Tcl_ExternalToUtfDString(
 		flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars);
 	soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
 
+	src += srcRead;
 	if (result != TCL_CONVERT_NOSPACE) {
 	    Tcl_DStringSetLength(dstPtr, soFar);
 	    return Tcl_DStringValue(dstPtr);
 	}
-
 	flags &= ~TCL_ENCODING_START;
-	src += srcRead;
 	srcLen -= srcRead;
 	if (Tcl_DStringLength(dstPtr) == 0) {
 	    Tcl_DStringSetLength(dstPtr, dstLen);
@@ -1348,6 +1347,7 @@ Tcl_UtfToExternalDString(
 		&srcRead, &dstWrote, &dstChars);
 	soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
 
+	src += srcRead;
 	if (result != TCL_CONVERT_NOSPACE) {
 	    if (encodingPtr->nullSize == 2) {
 		Tcl_DStringSetLength(dstPtr, soFar + 1);
@@ -1357,7 +1357,6 @@ Tcl_UtfToExternalDString(
 	}
 
 	flags &= ~TCL_ENCODING_START;
-	src += srcRead;
 	srcLen -= srcRead;
 	if (Tcl_DStringLength(dstPtr) == 0) {
 	    Tcl_DStringSetLength(dstPtr, dstLen);
@@ -2274,6 +2273,7 @@ UtfToUtfProc(
 	    dst += Tcl_UniCharToUtf(ch, dst);
 	} else {
 	    int low;
+	    const char *saveSrc = src;
 	    size_t len = TclUtfToUCS4(src, &ch);
 	    if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) {
 		result = TCL_CONVERT_SYNTAX;
@@ -2302,6 +2302,7 @@ UtfToUtfProc(
 		    if (!(flags & TCL_ENCODING_WTF)) {
 			if (flags & TCL_ENCODING_STOPONERROR) {
 			    result = TCL_CONVERT_UNKNOWN;
+			    src = saveSrc;
 			    break;
 			}
 			if (!(flags & TCL_ENCODING_MODIFIED)) {
@@ -2320,6 +2321,7 @@ UtfToUtfProc(
 	    } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
 		if (flags & TCL_ENCODING_STOPONERROR) {
 		    result = TCL_CONVERT_UNKNOWN;
+		    src = saveSrc;
 		    break;
 		}
 		if (!(flags & TCL_ENCODING_MODIFIED)) {
@@ -2483,7 +2485,7 @@ UtfToUtf16Proc(
 {
     const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
     int result, numChars;
-    int ch;
+    int ch, len;
 
     srcStart = src;
     srcEnd = src + srcLen;
@@ -2511,7 +2513,7 @@ UtfToUtf16Proc(
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
 	}
-	src += TclUtfToUCS4(src, &ch);
+	len = TclUtfToUCS4(src, &ch);
 	if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
 	    if (flags & TCL_ENCODING_STOPONERROR) {
 		result = TCL_CONVERT_UNKNOWN;
@@ -2519,6 +2521,7 @@ UtfToUtf16Proc(
 	    }
 	    ch = 0xFFFD;
 	}
+	src += len;
 	if (flags & TCL_ENCODING_LE) {
 	    if (ch <= 0xFFFF) {
 		*dst++ = (ch & 0xFF);
-- 
cgit v0.12


From 3175cb43a5f12eff36cc942e3b096164a5622574 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 31 Mar 2021 06:40:02 +0000
Subject: Re-use TCL_ENCODING_MODIFIED flag value for TCL_ENCODING_LE too,
 since they are used for different encoders.

---
 generic/tclEncoding.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 445e613..0100326 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -511,8 +511,10 @@ FillEncodingFileMap(void)
  */
 
 /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
+/* Since TCL_ENCODING_MODIFIED is only used for utf-8/wtf-8/cesu-8 and
+ * TCL_ENCODING_LE is only used for  utf-16/wtf-16/ucs-2. re-use the same value */
 #define TCL_ENCODING_MODIFIED	0x20	/* Converting NULL bytes to 0xC0 0x80 */
-#define TCL_ENCODING_LE		0x80	/* Little-endian encoding, for ucs-2/utf-16 only */
+#define TCL_ENCODING_LE		TCL_ENCODING_MODIFIED	/* Little-endian encoding */
 #define TCL_ENCODING_WTF	0x100	/* For WTF-8 encoding, don't check for surrogates/noncharacters */
 #define TCL_ENCODING_UTF	0x200	/* For UTF-8 encoding, allow 4-byte output sequences */
 
@@ -524,7 +526,7 @@ TclInitEncodingSubsystem(void)
     unsigned size;
     unsigned short i;
     union {
-        unsigned char c;
+        char c;
         short s;
     } isLe;
 
@@ -1154,7 +1156,10 @@ Tcl_ExternalToUtfDString(
 	srcLen = encodingPtr->lengthProc(src);
     }
 
-    flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
+    flags = TCL_ENCODING_START | TCL_ENCODING_END;
+    if (encodingPtr->toUtfProc == UtfToUtfProc) {
+	flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
+    }
 
     while (1) {
 	result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
@@ -1269,7 +1274,9 @@ Tcl_ExternalToUtf(
 
 	dstLen--;
     }
-    flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
+    if (encodingPtr->toUtfProc == UtfToUtfProc) {
+	flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
+    }
     do {
 	Tcl_EncodingState savedState = *statePtr;
 
@@ -2185,7 +2192,7 @@ BinaryProc(
 
 static int
 UtfToUtfProc(
-    ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_LE */
+    ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_MODIFIED */
     const char *src,		/* Source string in UTF-8. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* Conversion control flags. */
-- 
cgit v0.12


From 6b90b986b2b9db1fa7d0d34fb406ce0b66f6889d Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Wed, 31 Mar 2021 06:53:27 +0000
Subject: Fix testcase for TCL_UTF_MAX=4

---
 tests/encoding.test | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/encoding.test b/tests/encoding.test
index 0a40805..071ac27 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -470,11 +470,10 @@ test encoding-15.27 {UtfToUtfProc low surrogate character output} {
     list [string length $x] [string length $y] $z
 } {1 3 efbfbd}
 test encoding-15.28 {UtfToUtfProc CESU-8 6-byte sequence} {
-    set x \U10000
     set y [encoding convertto cesu-8 \U10000]
     binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {2 6 eda080edb080}
+    list [string length $y] $z
+} {6 eda080edb080}
 
 test encoding-16.1 {Utf16ToUtfProc} -body {
     set val [encoding convertfrom utf-16 NN]
-- 
cgit v0.12


From 2e7b2436f1c5ea439341ba19e85964ae9444f76c Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Thu, 8 Apr 2021 11:14:56 +0000
Subject: Add "tcl-8" encoding

---
 generic/tclEncoding.c | 3 +++
 tests/encoding.test   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 0632839..29aeefd 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -566,6 +566,9 @@ TclInitEncodingSubsystem(void)
     type.clientData	= INT2PTR(0);
     type.encodingName	= "cesu-8";
     Tcl_CreateEncoding(&type);
+    type.clientData	= INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF|TCL_ENCODING_MODIFIED);
+    type.encodingName	= "tcl-8";
+    Tcl_CreateEncoding(&type);
 
     type.toUtfProc	= Utf16ToUtfProc;
     type.fromUtfProc    = UtfToUcs2Proc;
diff --git a/tests/encoding.test b/tests/encoding.test
index 071ac27..9924886 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -813,7 +813,7 @@ test encoding-28.0 {all encodings load} -body {
 		llength $name
 	}
 	return $count
-} -result [expr {[info exists ::tcl_precision] ? 91 : 90}]
+} -result [expr {[info exists ::tcl_precision] ? 92 : 91}]
 
 runtests
 
-- 
cgit v0.12


From aa4f7bcd47ef35e83a3a5a6c3f16398bf3758e49 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Fri, 9 Apr 2021 15:19:51 +0000
Subject: Deprecate/remove "string bytelength"

---
 doc/string.n          | 12 ++++++------
 generic/tclCmdMZ.c    |  4 ++++
 library/init.tcl      |  4 ++--
 tests/info.test       |  4 ++--
 tests/regexp.test     |  3 ++-
 tests/regexpComp.test |  4 +++-
 tests/string.test     | 19 ++++++++++---------
 7 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/doc/string.n b/doc/string.n
index 7cd53ca..f1a0592 100644
--- a/doc/string.n
+++ b/doc/string.n
@@ -404,7 +404,7 @@ Formally, the \fBstring bytelength\fR operation returns the content of
 the \fIlength\fR field of the \fBTcl_Obj\fR structure, after calling
 \fBTcl_GetString\fR to ensure that the \fIbytes\fR field is populated.
 This is highly unlikely to be useful to Tcl scripts, as Tcl's internal
-encoding is not strict UTF\-8, but rather a modified CESU\-8 with a
+encoding is not strict UTF\-8, but rather a modified WTF\-8 with a
 denormalized NUL (identical to that used in a number of places by
 Java's serialization mechanism) to enable basic processing with
 non-Unicode-aware C functions.  As this representation should only
@@ -413,13 +413,13 @@ store the representation is of very low value (except to C extension
 code, which has direct access for the purpose of memory management,
 etc.)
 .PP
-\fICompatibility note:\fR it is likely that this subcommand will be
-withdrawn in a future version of Tcl. It is better to use the
-\fBencoding convertto\fR command to convert a string to a known
-encoding and then apply \fBstring length\fR to that.
+\fICompatibility note:\fR This subcommand is deprecated and will
+be removed in Tcl 9.0. It is better to use the \fBencoding convertto\fR
+command to convert a string to a known encoding (e.g. "wtf-8" or "tcl-8")
+and then apply \fBstring length\fR to that.
 .PP
 .CS
-\fBstring length\fR [encoding convertto utf-8 $theString]
+\fBstring length\fR [encoding convertto wtf-8 $theString]
 .CE
 .RE
 .TP
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 61d1010..6f71198 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -2835,6 +2835,7 @@ StringCatCmd(
  *
  *----------------------------------------------------------------------
  */
+#if TCL_MAJOR_VERSION < 9 && !defined(TCL_NO_DEPRECATED)
 static int
 StringBytesCmd(
     TCL_UNUSED(ClientData),
@@ -2853,6 +2854,7 @@ StringBytesCmd(
     Tcl_SetObjResult(interp, Tcl_NewWideIntObj(length));
     return TCL_OK;
 }
+#endif
 
 /*
  *----------------------------------------------------------------------
@@ -3309,7 +3311,9 @@ TclInitStringCmd(
     Tcl_Interp *interp)		/* Current interpreter. */
 {
     static const EnsembleImplMap stringImplMap[] = {
+#if TCL_MAJOR_VERSION < 9 && !defined(TCL_NO_DEPRECATED)
 	{"bytelength",	StringBytesCmd,	TclCompileBasic1ArgCmd, NULL, NULL, 0},
+#endif
 	{"cat",		StringCatCmd,	TclCompileStringCatCmd, NULL, NULL, 0},
 	{"compare",	StringCmpCmd,	TclCompileStringCmpCmd, NULL, NULL, 0},
 	{"equal",	StringEqualCmd,	TclCompileStringEqualCmd, NULL, NULL, 0},
diff --git a/library/init.tcl b/library/init.tcl
index c9bfff6..749eed9 100644
--- a/library/init.tcl
+++ b/library/init.tcl
@@ -214,9 +214,9 @@ proc unknown args {
 		set errInfo [dict get $opts -errorinfo]
 		set errCode [dict get $opts -errorcode]
 		set cinfo $args
-		if {[string bytelength $cinfo] > 150} {
+		if {[string length [encoding convertto wtf-8 $cinfo]] > 150} {
 		    set cinfo [string range $cinfo 0 150]
-		    while {[string bytelength $cinfo] > 150} {
+		    while {[string length [encoding convertto wtf-8 $cinfo]] > 150} {
 			set cinfo [string range $cinfo 0 end-1]
 		    }
 		    append cinfo ...
diff --git a/tests/info.test b/tests/info.test
index d9a4f54..ced4435 100644
--- a/tests/info.test
+++ b/tests/info.test
@@ -22,7 +22,7 @@ if {{::tcltest} ni [namespace children]} {
 ::tcltest::loadTestedCommands
 catch [list package require -exact tcl::test [info patchlevel]]
 testConstraint zlib [llength [info commands zlib]]
-
+testConstraint nodep [info exists tcl_precision]
 # Set up namespaces needed to test operation of "info args", "info body",
 # "info default", and "info procs" with imported procedures.
 
@@ -101,7 +101,7 @@ test info-2.5 {info body option, returning bytecompiled bodies} -body {
 # Fix for problem tested for in info-2.5 caused problems when
 # procedure body had no string rep (i.e. was not yet bytecode)
 # causing an empty string to be returned [Bug #545644]
-test info-2.6 {info body option, returning list bodies} {
+test info-2.6 {info body option, returning list bodies} nodep {
     proc foo args [list subst bar]
     list [string bytelength [info body foo]] \
 	    [foo; string bytelength [info body foo]]
diff --git a/tests/regexp.test b/tests/regexp.test
index e788b7f..6bed21e 100644
--- a/tests/regexp.test
+++ b/tests/regexp.test
@@ -19,6 +19,7 @@ if {"::tcltest" ni [namespace children]} {
 unset -nocomplain foo
 
 testConstraint exec [llength [info commands exec]]
+testConstraint nodep [info exists tcl_precision]
 
 # Used for constraining memory leak tests
 testConstraint memory [llength [info commands memory]]
@@ -765,7 +766,7 @@ test regexp-19.2 {regsub null replacement} {
     string equal $result $expected
 } 1
 
-test regexp-20.1 {regsub shared object shimmering} -body {
+test regexp-20.1 {regsub shared object shimmering} -constraints nodep -body {
     # Bug #461322
     set a abcdefghijklmnopqurstuvwxyz
     set b $a
diff --git a/tests/regexpComp.test b/tests/regexpComp.test
index 76e708d..1587c72 100644
--- a/tests/regexpComp.test
+++ b/tests/regexpComp.test
@@ -16,6 +16,8 @@ if {"::tcltest" ni [namespace children]} {
     namespace import -force ::tcltest::*
 }
 
+testConstraint nodep [info exists tcl_precision]
+
 # Procedure to evaluate a script within a proc, to test compilation
 # functionality
 
@@ -791,7 +793,7 @@ test regexpComp-19.1 {regsub null replacement} {
     }
 } "\0a\0hel\0a\0lo\0a\0 14"
 
-test regexpComp-20.1 {regsub shared object shimmering} {
+test regexpComp-20.1 {regsub shared object shimmering} nodep {
     evalInProc {
 	# Bug #461322
 	set a abcdefghijklmnopqurstuvwxyz
diff --git a/tests/string.test b/tests/string.test
index 72aeb43..99c1517 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -33,6 +33,7 @@ testConstraint testindexobj [expr {[info commands testindexobj] ne {}}]
 testConstraint testevalex [expr {[info commands testevalex] ne {}}]
 testConstraint utf16 [expr {[string length \U010000] == 2}]
 testConstraint testbytestring   [llength [info commands testbytestring]]
+testConstraint nodep [info exists tcl_precision]
 
 # Used for constraining memory leak tests
 testConstraint memory [llength [info commands memory]]
@@ -72,9 +73,9 @@ if {$noComp} {
 }
 
 
-test string-1.1.$noComp {error conditions} {
+test string-1.1.$noComp {error conditions} -body {
     list [catch {run {string gorp a b}} msg] $msg
-} {1 {unknown or ambiguous subcommand "gorp": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+} -match glob -result {1 {unknown or ambiguous subcommand "gorp": must be *cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
 test string-1.2.$noComp {error conditions} {
     list [catch {run {string}} msg] $msg
 } {1 {wrong # args: should be "string subcommand ?arg ...?"}}
@@ -1035,16 +1036,16 @@ test string-7.16.$noComp {string last, start index} {
     run {string last Üa ÜadÜad end-1}
 } 3
 
-test string-8.1.$noComp {string bytelength} {
+test string-8.1.$noComp {string bytelength} nodep {
     list [catch {run {string bytelength}} msg] $msg
 } {1 {wrong # args: should be "string bytelength string"}}
-test string-8.2.$noComp {string bytelength} {
+test string-8.2.$noComp {string bytelength} nodep {
     list [catch {run {string bytelength a b}} msg] $msg
 } {1 {wrong # args: should be "string bytelength string"}}
-test string-8.3.$noComp {string bytelength} {
+test string-8.3.$noComp {string bytelength} nodep {
     run {string bytelength "\xC7"}
 } 2
-test string-8.4.$noComp {string bytelength} {
+test string-8.4.$noComp {string bytelength} nodep {
     run {string b ""}
 } 0
 
@@ -1827,9 +1828,9 @@ test string-19.3.$noComp {string trimleft, unicode default} {
 test string-20.1.$noComp {string trimright errors} {
     list [catch {run {string trimright}} msg] $msg
 } {1 {wrong # args: should be "string trimright string ?chars?"}}
-test string-20.2.$noComp {string trimright errors} {
+test string-20.2.$noComp {string trimright errors} -body {
     list [catch {run {string trimg a}} msg] $msg
-} {1 {unknown or ambiguous subcommand "trimg": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+} -match glob -result {1 {unknown or ambiguous subcommand "trimg": must be *cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
 test string-20.3.$noComp {string trimright} {
     run {string trimright "    XYZ      "}
 } {    XYZ}
@@ -1949,7 +1950,7 @@ test string-21.25.$noComp {string trimright, unicode} {
 
 test string-22.1.$noComp {string wordstart} -body {
     list [catch {run {string word a}} msg] $msg
-} -result {1 {unknown or ambiguous subcommand "word": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
+} -match glob -result {1 {unknown or ambiguous subcommand "word": must be *cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
 test string-22.2.$noComp {string wordstart} -body {
     list [catch {run {string wordstart a}} msg] $msg
 } -result {1 {wrong # args: should be "string wordstart string index"}}
-- 
cgit v0.12


From 3c9c7e062138b5f21935974d667eec0ae10c346c Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Fri, 16 Apr 2021 20:34:33 +0000
Subject: Remove wtf-8/wtf-16/tcl-8 encodings

---
 doc/string.n          |   4 +-
 generic/tclEncoding.c |  40 ++++++--------------
 library/init.tcl      |   4 +-
 tests/encoding.test   | 100 +++++++++-----------------------------------------
 4 files changed, 32 insertions(+), 116 deletions(-)

diff --git a/doc/string.n b/doc/string.n
index f1a0592..f3d7616 100644
--- a/doc/string.n
+++ b/doc/string.n
@@ -415,11 +415,11 @@ etc.)
 .PP
 \fICompatibility note:\fR This subcommand is deprecated and will
 be removed in Tcl 9.0. It is better to use the \fBencoding convertto\fR
-command to convert a string to a known encoding (e.g. "wtf-8" or "tcl-8")
+command to convert a string to a known encoding (e.g. "utf-8" or "cesu-8")
 and then apply \fBstring length\fR to that.
 .PP
 .CS
-\fBstring length\fR [encoding convertto wtf-8 $theString]
+\fBstring length\fR [encoding convertto utf-8 $theString]
 .CE
 .RE
 .TP
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 29aeefd..21c254e 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -511,11 +511,10 @@ FillEncodingFileMap(void)
  */
 
 /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
-/* Since TCL_ENCODING_MODIFIED is only used for utf-8/wtf-8/cesu-8 and
- * TCL_ENCODING_LE is only used for  utf-16/wtf-16/ucs-2. re-use the same value */
+/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and
+ * TCL_ENCODING_LE is only used for  utf-16/ucs-2. re-use the same value */
 #define TCL_ENCODING_MODIFIED	0x20	/* Converting NULL bytes to 0xC0 0x80 */
 #define TCL_ENCODING_LE		TCL_ENCODING_MODIFIED	/* Little-endian encoding */
-#define TCL_ENCODING_WTF	0x100	/* For WTF-8 encoding, don't check for surrogates/noncharacters */
 #define TCL_ENCODING_UTF	0x200	/* For UTF-8 encoding, allow 4-byte output sequences */
 
 void
@@ -560,15 +559,9 @@ TclInitEncodingSubsystem(void)
     type.nullSize	= 1;
     type.clientData	= INT2PTR(TCL_ENCODING_UTF);
     Tcl_CreateEncoding(&type);
-    type.clientData	= INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF);
-    type.encodingName	= "wtf-8";
-    Tcl_CreateEncoding(&type);
     type.clientData	= INT2PTR(0);
     type.encodingName	= "cesu-8";
     Tcl_CreateEncoding(&type);
-    type.clientData	= INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF|TCL_ENCODING_MODIFIED);
-    type.encodingName	= "tcl-8";
-    Tcl_CreateEncoding(&type);
 
     type.toUtfProc	= Utf16ToUtfProc;
     type.fromUtfProc    = UtfToUcs2Proc;
@@ -591,21 +584,12 @@ TclInitEncodingSubsystem(void)
     type.encodingName   = "utf-16le";
     type.clientData	= INT2PTR(TCL_ENCODING_LE);
     Tcl_CreateEncoding(&type);
-    type.encodingName   = "wtf-16le";
-    type.clientData	= INT2PTR(TCL_ENCODING_LE + TCL_ENCODING_WTF);
-    Tcl_CreateEncoding(&type);
     type.encodingName   = "utf-16be";
     type.clientData	= INT2PTR(0);
     Tcl_CreateEncoding(&type);
-    type.encodingName   = "wtf-16be";
-    type.clientData	= INT2PTR(TCL_ENCODING_WTF);
-    Tcl_CreateEncoding(&type);
     type.encodingName   = "utf-16";
     type.clientData	= INT2PTR(isLe.c);
     Tcl_CreateEncoding(&type);
-    type.encodingName   = "wtf-16";
-    type.clientData	= INT2PTR(isLe.c + TCL_ENCODING_WTF);
-    Tcl_CreateEncoding(&type);
 
 #ifndef TCL_NO_DEPRECATED
     type.encodingName   = "unicode";
@@ -2315,15 +2299,13 @@ UtfToUtfProc(
 		len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
 
 		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
-		    if (!(flags & TCL_ENCODING_WTF)) {
-			if (flags & TCL_ENCODING_STOPONERROR) {
-			    result = TCL_CONVERT_UNKNOWN;
-			    src = saveSrc;
-			    break;
-			}
-			if (!(flags & TCL_ENCODING_MODIFIED)) {
-			    ch = 0xFFFD;
-			}
+		    if (flags & TCL_ENCODING_STOPONERROR) {
+			result = TCL_CONVERT_UNKNOWN;
+			src = saveSrc;
+			break;
+		    }
+		    if (!(flags & TCL_ENCODING_MODIFIED)) {
+			ch = 0xFFFD;
 		    }
 		cesu8:
 		    *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
@@ -2334,7 +2316,7 @@ UtfToUtfProc(
 		src += len;
 		dst += Tcl_UniCharToUtf(ch, dst);
 		ch = low;
-	    } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+	    } else if (!Tcl_UniCharIsUnicode(ch)) {
 		if (flags & TCL_ENCODING_STOPONERROR) {
 		    result = TCL_CONVERT_UNKNOWN;
 		    src = saveSrc;
@@ -2530,7 +2512,7 @@ UtfToUtf16Proc(
 	    break;
 	}
 	len = TclUtfToUCS4(src, &ch);
-	if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+	if (!Tcl_UniCharIsUnicode(ch)) {
 	    if (flags & TCL_ENCODING_STOPONERROR) {
 		result = TCL_CONVERT_UNKNOWN;
 		break;
diff --git a/library/init.tcl b/library/init.tcl
index 749eed9..e30296e 100644
--- a/library/init.tcl
+++ b/library/init.tcl
@@ -214,9 +214,9 @@ proc unknown args {
 		set errInfo [dict get $opts -errorinfo]
 		set errCode [dict get $opts -errorcode]
 		set cinfo $args
-		if {[string length [encoding convertto wtf-8 $cinfo]] > 150} {
+		if {[string length [encoding convertto utf-8 $cinfo]] > 150} {
 		    set cinfo [string range $cinfo 0 150]
-		    while {[string length [encoding convertto wtf-8 $cinfo]] > 150} {
+		    while {[string length [encoding convertto utf-8 $cinfo]] > 150} {
 			set cinfo [string range $cinfo 0 end-1]
 		    }
 		    append cinfo ...
diff --git a/tests/encoding.test b/tests/encoding.test
index 9924886..82a2d6b 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -338,138 +338,78 @@ test encoding-15.5 {UtfToUtfProc emoji character input} {
     set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
     list [string length $x] $y
 } "4 😂"
-test encoding-15.6 {UtfToUtfProc emoji character output} {
-    set x \uDE02\uD83D\uDE02\uD83D
-    set y [encoding convertto wtf-8 \uDE02\uD83D\uDE02\uD83D]
-    binary scan $y H* z
-    list [string length $y] $z
-} {10 edb882f09f9882eda0bd}
-test encoding-15.7 {UtfToUtfProc emoji character output} {
-    set x \uDE02\uD83D\uD83D
-    set y [encoding convertto wtf-8 \uDE02\uD83D\uD83D]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {3 9 edb882eda0bdeda0bd}
-test encoding-15.8 {UtfToUtfProc emoji character output} {
-    set x \uDE02\uD83Dé
-    set y [encoding convertto wtf-8 \uDE02\uD83Dé]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {3 8 edb882eda0bdc3a9}
-test encoding-15.9 {UtfToUtfProc emoji character output} {
-    set x \uDE02\uD83DX
-    set y [encoding convertto wtf-8 \uDE02\uD83DX]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {3 7 edb882eda0bd58}
-test encoding-15.10 {UtfToUtfProc high surrogate character output} {
-    set x \uDE02é
-    set y [encoding convertto wtf-8 \uDE02é]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {2 5 edb882c3a9}
-test encoding-15.11 {UtfToUtfProc low surrogate character output} {
-    set x \uDA02é
-    set y [encoding convertto wtf-8 \uDA02é]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {2 5 eda882c3a9}
-test encoding-15.12 {UtfToUtfProc high surrogate character output} {
-    set x \uDE02Y
-    set y [encoding convertto wtf-8 \uDE02Y]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {2 4 edb88259}
-test encoding-15.13 {UtfToUtfProc low surrogate character output} {
-    set x \uDA02Y
-    set y [encoding convertto wtf-8 \uDA02Y]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {2 4 eda88259}
-test encoding-15.14 {UtfToUtfProc high surrogate character output} {
-    set x \uDE02
-    set y [encoding convertto wtf-8 \uDE02]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {1 3 edb882}
-test encoding-15.15 {UtfToUtfProc low surrogate character output} {
-    set x \uDA02
-    set y [encoding convertto wtf-8 \uDA02]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {1 3 eda882}
-test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} {
+test encoding-15.6 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} {
     set x \xF0\xA0\xA1\xC2
     set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2]
     list [string length $x] $y
 } "4 \xF0\xA0\xA1\xC2"
-test encoding-15.17 {UtfToUtfProc emoji character output} {
+test encoding-15.7 {UtfToUtfProc emoji character output} {
     set x 😂
     set y [encoding convertto utf-8 😂]
     binary scan $y H* z
     list [string length $y] $z
 } {4 f09f9882}
-test encoding-15.18 {UtfToUtfProc emoji character output} {
+test encoding-15.8 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\uDE02\uD83D
     set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D]
     binary scan $y H* z
     list [string length $y] $z
 } {10 efbfbdf09f9882efbfbd}
-test encoding-15.19 {UtfToUtfProc emoji character output} {
+test encoding-15.9 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\uD83D
     set y [encoding convertto utf-8 \uDE02\uD83D\uD83D]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 9 efbfbdefbfbdefbfbd}
-test encoding-15.20 {UtfToUtfProc emoji character output} {
+test encoding-15.10 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\xE9
     set y [encoding convertto utf-8 \uDE02\uD83D\xE9]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 8 efbfbdefbfbdc3a9}
-test encoding-15.21 {UtfToUtfProc emoji character output} {
+test encoding-15.11 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83DX
     set y [encoding convertto utf-8 \uDE02\uD83DX]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 7 efbfbdefbfbd58}
-test encoding-15.22 {UtfToUtfProc high surrogate character output} {
+test encoding-15.12 {UtfToUtfProc high surrogate character output} {
     set x \uDE02\xE9
     set y [encoding convertto utf-8 \uDE02\xE9]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 5 efbfbdc3a9}
-test encoding-15.23 {UtfToUtfProc low surrogate character output} {
+test encoding-15.13 {UtfToUtfProc low surrogate character output} {
     set x \uDA02\xE9
     set y [encoding convertto utf-8 \uDA02\xE9]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 5 efbfbdc3a9}
-test encoding-15.24 {UtfToUtfProc high surrogate character output} {
+test encoding-15.14 {UtfToUtfProc high surrogate character output} {
     set x \uDE02Y
     set y [encoding convertto utf-8 \uDE02Y]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 4 efbfbd59}
-test encoding-15.25 {UtfToUtfProc low surrogate character output} {
+test encoding-15.15 {UtfToUtfProc low surrogate character output} {
     set x \uDA02Y
     set y [encoding convertto utf-8 \uDA02Y]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 4 efbfbd59}
-test encoding-15.26 {UtfToUtfProc high surrogate character output} {
+test encoding-15.16 {UtfToUtfProc high surrogate character output} {
     set x \uDE02
     set y [encoding convertto utf-8 \uDE02]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {1 3 efbfbd}
-test encoding-15.27 {UtfToUtfProc low surrogate character output} {
+test encoding-15.17 {UtfToUtfProc low surrogate character output} {
     set x \uDA02
     set y [encoding convertto utf-8 \uDA02]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {1 3 efbfbd}
-test encoding-15.28 {UtfToUtfProc CESU-8 6-byte sequence} {
+test encoding-15.18 {UtfToUtfProc CESU-8 6-byte sequence} {
     set y [encoding convertto cesu-8 \U10000]
     binary scan $y H* z
     list [string length $y] $z
@@ -499,19 +439,13 @@ test encoding-16.4 {Ucs2ToUtfProc} -body {
 test encoding-17.1 {UtfToUtf16Proc} -body {
     encoding convertto utf-16 "\U460DC"
 } -result "\xD8\xD8\xDC\xDC"
-test encoding-17.2 {UtfToUtf16Proc} -body {
-    encoding convertto wtf-16 "\uDCDC"
-} -result "\xDC\xDC"
-test encoding-17.3 {UtfToUtf16Proc} -body {
-    encoding convertto wtf-16 "\uD8D8"
-} -result "\xD8\xD8"
-test encoding-17.4 {UtfToUcs2Proc} -body {
+test encoding-17.2 {UtfToUcs2Proc} -body {
     encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"]
 } -result "\uFFFD"
-test encoding-17.5 {UtfToUtf16Proc} -body {
+test encoding-17.3 {UtfToUtf16Proc} -body {
     encoding convertto utf-16be "\uDCDC"
 } -result "\xFF\xFD"
-test encoding-17.6 {UtfToUtf16Proc} -body {
+test encoding-17.4 {UtfToUtf16Proc} -body {
     encoding convertto utf-16le "\uD8D8"
 } -result "\xFD\xFF"
 
@@ -813,7 +747,7 @@ test encoding-28.0 {all encodings load} -body {
 		llength $name
 	}
 	return $count
-} -result [expr {[info exists ::tcl_precision] ? 92 : 91}]
+} -result [expr {[info exists ::tcl_precision] ? 87 : 86}]
 
 runtests
 
-- 
cgit v0.12


From 82d0339a6592bd7855cc08df656c252dda1352c5 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Fri, 16 Apr 2021 20:45:56 +0000
Subject: renumber testcases

---
 tests/encoding.test | 54 ++++++++++++++++++++++++++---------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/tests/encoding.test b/tests/encoding.test
index 82a2d6b..e1c55d7 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -338,77 +338,77 @@ test encoding-15.5 {UtfToUtfProc emoji character input} {
     set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
     list [string length $x] $y
 } "4 😂"
-test encoding-15.6 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} {
-    set x \xF0\xA0\xA1\xC2
-    set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2]
-    list [string length $x] $y
-} "4 \xF0\xA0\xA1\xC2"
-test encoding-15.7 {UtfToUtfProc emoji character output} {
-    set x 😂
-    set y [encoding convertto utf-8 😂]
-    binary scan $y H* z
-    list [string length $y] $z
-} {4 f09f9882}
-test encoding-15.8 {UtfToUtfProc emoji character output} {
+test encoding-15.6 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\uDE02\uD83D
     set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D]
     binary scan $y H* z
     list [string length $y] $z
 } {10 efbfbdf09f9882efbfbd}
-test encoding-15.9 {UtfToUtfProc emoji character output} {
+test encoding-15.7 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\uD83D
     set y [encoding convertto utf-8 \uDE02\uD83D\uD83D]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 9 efbfbdefbfbdefbfbd}
-test encoding-15.10 {UtfToUtfProc emoji character output} {
-    set x \uDE02\uD83D\xE9
-    set y [encoding convertto utf-8 \uDE02\uD83D\xE9]
+test encoding-15.8 {UtfToUtfProc emoji character output} {
+    set x \uDE02\uD83Dé
+    set y [encoding convertto utf-8 \uDE02\uD83Dé]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 8 efbfbdefbfbdc3a9}
-test encoding-15.11 {UtfToUtfProc emoji character output} {
+test encoding-15.9 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83DX
     set y [encoding convertto utf-8 \uDE02\uD83DX]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 7 efbfbdefbfbd58}
-test encoding-15.12 {UtfToUtfProc high surrogate character output} {
-    set x \uDE02\xE9
-    set y [encoding convertto utf-8 \uDE02\xE9]
+test encoding-15.10 {UtfToUtfProc high surrogate character output} {
+    set x \uDE02é
+    set y [encoding convertto utf-8 \uDE02é]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 5 efbfbdc3a9}
-test encoding-15.13 {UtfToUtfProc low surrogate character output} {
-    set x \uDA02\xE9
-    set y [encoding convertto utf-8 \uDA02\xE9]
+test encoding-15.11 {UtfToUtfProc low surrogate character output} {
+    set x \uDA02é
+    set y [encoding convertto utf-8 \uDA02é]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 5 efbfbdc3a9}
-test encoding-15.14 {UtfToUtfProc high surrogate character output} {
+test encoding-15.12 {UtfToUtfProc high surrogate character output} {
     set x \uDE02Y
     set y [encoding convertto utf-8 \uDE02Y]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 4 efbfbd59}
-test encoding-15.15 {UtfToUtfProc low surrogate character output} {
+test encoding-15.13 {UtfToUtfProc low surrogate character output} {
     set x \uDA02Y
     set y [encoding convertto utf-8 \uDA02Y]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 4 efbfbd59}
-test encoding-15.16 {UtfToUtfProc high surrogate character output} {
+test encoding-15.14 {UtfToUtfProc high surrogate character output} {
     set x \uDE02
     set y [encoding convertto utf-8 \uDE02]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {1 3 efbfbd}
-test encoding-15.17 {UtfToUtfProc low surrogate character output} {
+test encoding-15.15 {UtfToUtfProc low surrogate character output} {
     set x \uDA02
     set y [encoding convertto utf-8 \uDA02]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {1 3 efbfbd}
+test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} {
+    set x \xF0\xA0\xA1\xC2
+    set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2]
+    list [string length $x] $y
+} "4 \xF0\xA0\xA1\xC2"
+test encoding-15.17 {UtfToUtfProc emoji character output} {
+    set x 😂
+    set y [encoding convertto utf-8 😂]
+    binary scan $y H* z
+    list [string length $y] $z
+} {4 f09f9882}
 test encoding-15.18 {UtfToUtfProc CESU-8 6-byte sequence} {
     set y [encoding convertto cesu-8 \U10000]
     binary scan $y H* z
-- 
cgit v0.12


From 3bfe6dba24b7c5f8f9f8669cb99e2bfb80b0e5da Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Mon, 19 Apr 2021 07:24:18 +0000
Subject: More testcases (cesu-8)

---
 tests/encoding.test | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tests/encoding.test b/tests/encoding.test
index e1c55d7..0e80c09 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -414,6 +414,21 @@ test encoding-15.18 {UtfToUtfProc CESU-8 6-byte sequence} {
     binary scan $y H* z
     list [string length $y] $z
 } {6 eda080edb080}
+test encoding-15.19 {UtfToUtfProc CESU-8 upper surrogate} {
+    set y [encoding convertto cesu-8 \uD800]
+    binary scan $y H* z
+    list [string length $y] $z
+} {3 eda080}
+test encoding-15.20 {UtfToUtfProc CESU-8 lower surrogate} {
+    set y [encoding convertto cesu-8 \uDC00]
+    binary scan $y H* z
+    list [string length $y] $z
+} {3 edb080}
+test encoding-15.21 {UtfToUtfProc CESU-8 noncharacter} {
+    set y [encoding convertto cesu-8 \uFFFF]
+    binary scan $y H* z
+    list [string length $y] $z
+} {3 efbfbf}
 
 test encoding-16.1 {Utf16ToUtfProc} -body {
     set val [encoding convertfrom utf-16 NN]
-- 
cgit v0.12