From 3c9c7e062138b5f21935974d667eec0ae10c346c Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Fri, 16 Apr 2021 20:34:33 +0000
Subject: Remove wtf-8/wtf-16/tcl-8 encodings

---
 doc/string.n          |   4 +-
 generic/tclEncoding.c |  40 ++++++--------------
 library/init.tcl      |   4 +-
 tests/encoding.test   | 100 +++++++++-----------------------------------------
 4 files changed, 32 insertions(+), 116 deletions(-)

diff --git a/doc/string.n b/doc/string.n
index f1a0592..f3d7616 100644
--- a/doc/string.n
+++ b/doc/string.n
@@ -415,11 +415,11 @@ etc.)
 .PP
 \fICompatibility note:\fR This subcommand is deprecated and will
 be removed in Tcl 9.0. It is better to use the \fBencoding convertto\fR
-command to convert a string to a known encoding (e.g. "wtf-8" or "tcl-8")
+command to convert a string to a known encoding (e.g. "utf-8" or "cesu-8")
 and then apply \fBstring length\fR to that.
 .PP
 .CS
-\fBstring length\fR [encoding convertto wtf-8 $theString]
+\fBstring length\fR [encoding convertto utf-8 $theString]
 .CE
 .RE
 .TP
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 29aeefd..21c254e 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -511,11 +511,10 @@ FillEncodingFileMap(void)
  */
 
 /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
-/* Since TCL_ENCODING_MODIFIED is only used for utf-8/wtf-8/cesu-8 and
- * TCL_ENCODING_LE is only used for  utf-16/wtf-16/ucs-2. re-use the same value */
+/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and
+ * TCL_ENCODING_LE is only used for utf-16/ucs-2, re-use the same value. */
 #define TCL_ENCODING_MODIFIED	0x20	/* Converting NULL bytes to 0xC0 0x80 */
 #define TCL_ENCODING_LE		TCL_ENCODING_MODIFIED	/* Little-endian encoding */
-#define TCL_ENCODING_WTF	0x100	/* For WTF-8 encoding, don't check for surrogates/noncharacters */
 #define TCL_ENCODING_UTF	0x200	/* For UTF-8 encoding, allow 4-byte output sequences */
 
 void
@@ -560,15 +559,9 @@ TclInitEncodingSubsystem(void)
     type.nullSize	= 1;
     type.clientData	= INT2PTR(TCL_ENCODING_UTF);
     Tcl_CreateEncoding(&type);
-    type.clientData	= INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF);
-    type.encodingName	= "wtf-8";
-    Tcl_CreateEncoding(&type);
     type.clientData	= INT2PTR(0);
     type.encodingName	= "cesu-8";
     Tcl_CreateEncoding(&type);
-    type.clientData	= INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF|TCL_ENCODING_MODIFIED);
-    type.encodingName	= "tcl-8";
-    Tcl_CreateEncoding(&type);
 
     type.toUtfProc	= Utf16ToUtfProc;
     type.fromUtfProc    = UtfToUcs2Proc;
@@ -591,21 +584,12 @@ TclInitEncodingSubsystem(void)
     type.encodingName   = "utf-16le";
     type.clientData	= INT2PTR(TCL_ENCODING_LE);
     Tcl_CreateEncoding(&type);
-    type.encodingName   = "wtf-16le";
-    type.clientData	= INT2PTR(TCL_ENCODING_LE + TCL_ENCODING_WTF);
-    Tcl_CreateEncoding(&type);
     type.encodingName   = "utf-16be";
     type.clientData	= INT2PTR(0);
     Tcl_CreateEncoding(&type);
-    type.encodingName   = "wtf-16be";
-    type.clientData	= INT2PTR(TCL_ENCODING_WTF);
-    Tcl_CreateEncoding(&type);
     type.encodingName   = "utf-16";
     type.clientData	= INT2PTR(isLe.c);
     Tcl_CreateEncoding(&type);
-    type.encodingName   = "wtf-16";
-    type.clientData	= INT2PTR(isLe.c + TCL_ENCODING_WTF);
-    Tcl_CreateEncoding(&type);
 
 #ifndef TCL_NO_DEPRECATED
     type.encodingName   = "unicode";
@@ -2315,15 +2299,13 @@ UtfToUtfProc(
 		len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
 
 		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
-		    if (!(flags & TCL_ENCODING_WTF)) {
-			if (flags & TCL_ENCODING_STOPONERROR) {
-			    result = TCL_CONVERT_UNKNOWN;
-			    src = saveSrc;
-			    break;
-			}
-			if (!(flags & TCL_ENCODING_MODIFIED)) {
-			    ch = 0xFFFD;
-			}
+		    if (flags & TCL_ENCODING_STOPONERROR) {
+			result = TCL_CONVERT_UNKNOWN;
+			src = saveSrc;
+			break;
+		    }
+		    if (!(flags & TCL_ENCODING_MODIFIED)) {
+			ch = 0xFFFD;
 		    }
 		cesu8:
 		    *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
@@ -2334,7 +2316,7 @@ UtfToUtfProc(
 		src += len;
 		dst += Tcl_UniCharToUtf(ch, dst);
 		ch = low;
-	    } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+	    } else if (!Tcl_UniCharIsUnicode(ch)) {
 		if (flags & TCL_ENCODING_STOPONERROR) {
 		    result = TCL_CONVERT_UNKNOWN;
 		    src = saveSrc;
@@ -2530,7 +2512,7 @@ UtfToUtf16Proc(
 	    break;
 	}
 	len = TclUtfToUCS4(src, &ch);
-	if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+	if (!Tcl_UniCharIsUnicode(ch)) {
 	    if (flags & TCL_ENCODING_STOPONERROR) {
 		result = TCL_CONVERT_UNKNOWN;
 		break;
diff --git a/library/init.tcl b/library/init.tcl
index 749eed9..e30296e 100644
--- a/library/init.tcl
+++ b/library/init.tcl
@@ -214,9 +214,9 @@ proc unknown args {
 		set errInfo [dict get $opts -errorinfo]
 		set errCode [dict get $opts -errorcode]
 		set cinfo $args
-		if {[string length [encoding convertto wtf-8 $cinfo]] > 150} {
+		if {[string length [encoding convertto utf-8 $cinfo]] > 150} {
 		    set cinfo [string range $cinfo 0 150]
-		    while {[string length [encoding convertto wtf-8 $cinfo]] > 150} {
+		    while {[string length [encoding convertto utf-8 $cinfo]] > 150} {
 			set cinfo [string range $cinfo 0 end-1]
 		    }
 		    append cinfo ...
diff --git a/tests/encoding.test b/tests/encoding.test
index 9924886..82a2d6b 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -338,138 +338,78 @@ test encoding-15.5 {UtfToUtfProc emoji character input} {
     set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
     list [string length $x] $y
 } "4 😂"
-test encoding-15.6 {UtfToUtfProc emoji character output} {
-    set x \uDE02\uD83D\uDE02\uD83D
-    set y [encoding convertto wtf-8 \uDE02\uD83D\uDE02\uD83D]
-    binary scan $y H* z
-    list [string length $y] $z
-} {10 edb882f09f9882eda0bd}
-test encoding-15.7 {UtfToUtfProc emoji character output} {
-    set x \uDE02\uD83D\uD83D
-    set y [encoding convertto wtf-8 \uDE02\uD83D\uD83D]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {3 9 edb882eda0bdeda0bd}
-test encoding-15.8 {UtfToUtfProc emoji character output} {
-    set x \uDE02\uD83Dé
-    set y [encoding convertto wtf-8 \uDE02\uD83Dé]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {3 8 edb882eda0bdc3a9}
-test encoding-15.9 {UtfToUtfProc emoji character output} {
-    set x \uDE02\uD83DX
-    set y [encoding convertto wtf-8 \uDE02\uD83DX]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {3 7 edb882eda0bd58}
-test encoding-15.10 {UtfToUtfProc high surrogate character output} {
-    set x \uDE02é
-    set y [encoding convertto wtf-8 \uDE02é]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {2 5 edb882c3a9}
-test encoding-15.11 {UtfToUtfProc low surrogate character output} {
-    set x \uDA02é
-    set y [encoding convertto wtf-8 \uDA02é]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {2 5 eda882c3a9}
-test encoding-15.12 {UtfToUtfProc high surrogate character output} {
-    set x \uDE02Y
-    set y [encoding convertto wtf-8 \uDE02Y]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {2 4 edb88259}
-test encoding-15.13 {UtfToUtfProc low surrogate character output} {
-    set x \uDA02Y
-    set y [encoding convertto wtf-8 \uDA02Y]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {2 4 eda88259}
-test encoding-15.14 {UtfToUtfProc high surrogate character output} {
-    set x \uDE02
-    set y [encoding convertto wtf-8 \uDE02]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {1 3 edb882}
-test encoding-15.15 {UtfToUtfProc low surrogate character output} {
-    set x \uDA02
-    set y [encoding convertto wtf-8 \uDA02]
-    binary scan $y H* z
-    list [string length $x] [string length $y] $z
-} {1 3 eda882}
-test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} {
+test encoding-15.6 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} {
     set x \xF0\xA0\xA1\xC2
     set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2]
     list [string length $x] $y
 } "4 \xF0\xA0\xA1\xC2"
-test encoding-15.17 {UtfToUtfProc emoji character output} {
+test encoding-15.7 {UtfToUtfProc emoji character output} {
     set x 😂
     set y [encoding convertto utf-8 😂]
     binary scan $y H* z
     list [string length $y] $z
 } {4 f09f9882}
-test encoding-15.18 {UtfToUtfProc emoji character output} {
+test encoding-15.8 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\uDE02\uD83D
     set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D]
     binary scan $y H* z
     list [string length $y] $z
 } {10 efbfbdf09f9882efbfbd}
-test encoding-15.19 {UtfToUtfProc emoji character output} {
+test encoding-15.9 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\uD83D
     set y [encoding convertto utf-8 \uDE02\uD83D\uD83D]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 9 efbfbdefbfbdefbfbd}
-test encoding-15.20 {UtfToUtfProc emoji character output} {
+test encoding-15.10 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83D\xE9
     set y [encoding convertto utf-8 \uDE02\uD83D\xE9]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 8 efbfbdefbfbdc3a9}
-test encoding-15.21 {UtfToUtfProc emoji character output} {
+test encoding-15.11 {UtfToUtfProc emoji character output} {
     set x \uDE02\uD83DX
     set y [encoding convertto utf-8 \uDE02\uD83DX]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {3 7 efbfbdefbfbd58}
-test encoding-15.22 {UtfToUtfProc high surrogate character output} {
+test encoding-15.12 {UtfToUtfProc high surrogate character output} {
     set x \uDE02\xE9
     set y [encoding convertto utf-8 \uDE02\xE9]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 5 efbfbdc3a9}
-test encoding-15.23 {UtfToUtfProc low surrogate character output} {
+test encoding-15.13 {UtfToUtfProc low surrogate character output} {
     set x \uDA02\xE9
     set y [encoding convertto utf-8 \uDA02\xE9]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 5 efbfbdc3a9}
-test encoding-15.24 {UtfToUtfProc high surrogate character output} {
+test encoding-15.14 {UtfToUtfProc high surrogate character output} {
     set x \uDE02Y
     set y [encoding convertto utf-8 \uDE02Y]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 4 efbfbd59}
-test encoding-15.25 {UtfToUtfProc low surrogate character output} {
+test encoding-15.15 {UtfToUtfProc low surrogate character output} {
     set x \uDA02Y
     set y [encoding convertto utf-8 \uDA02Y]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {2 4 efbfbd59}
-test encoding-15.26 {UtfToUtfProc high surrogate character output} {
+test encoding-15.16 {UtfToUtfProc high surrogate character output} {
     set x \uDE02
     set y [encoding convertto utf-8 \uDE02]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {1 3 efbfbd}
-test encoding-15.27 {UtfToUtfProc low surrogate character output} {
+test encoding-15.17 {UtfToUtfProc low surrogate character output} {
     set x \uDA02
     set y [encoding convertto utf-8 \uDA02]
     binary scan $y H* z
     list [string length $x] [string length $y] $z
 } {1 3 efbfbd}
-test encoding-15.28 {UtfToUtfProc CESU-8 6-byte sequence} {
+test encoding-15.18 {UtfToUtfProc CESU-8 6-byte sequence} {
     set y [encoding convertto cesu-8 \U10000]
     binary scan $y H* z
     list [string length $y] $z
@@ -499,19 +439,13 @@ test encoding-16.4 {Ucs2ToUtfProc} -body {
 test encoding-17.1 {UtfToUtf16Proc} -body {
     encoding convertto utf-16 "\U460DC"
 } -result "\xD8\xD8\xDC\xDC"
-test encoding-17.2 {UtfToUtf16Proc} -body {
-    encoding convertto wtf-16 "\uDCDC"
-} -result "\xDC\xDC"
-test encoding-17.3 {UtfToUtf16Proc} -body {
-    encoding convertto wtf-16 "\uD8D8"
-} -result "\xD8\xD8"
-test encoding-17.4 {UtfToUcs2Proc} -body {
+test encoding-17.2 {UtfToUcs2Proc} -body {
     encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"]
 } -result "\uFFFD"
-test encoding-17.5 {UtfToUtf16Proc} -body {
+test encoding-17.3 {UtfToUtf16Proc} -body {
     encoding convertto utf-16be "\uDCDC"
 } -result "\xFF\xFD"
-test encoding-17.6 {UtfToUtf16Proc} -body {
+test encoding-17.4 {UtfToUtf16Proc} -body {
     encoding convertto utf-16le "\uD8D8"
 } -result "\xFD\xFF"
 
@@ -813,7 +747,7 @@ test encoding-28.0 {all encodings load} -body {
 		llength $name
 	}
 	return $count
-} -result [expr {[info exists ::tcl_precision] ? 92 : 91}]
+} -result [expr {[info exists ::tcl_precision] ? 87 : 86}]
 
 runtests
 
-- 
cgit v0.12