From 3c9c7e062138b5f21935974d667eec0ae10c346c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 16 Apr 2021 20:34:33 +0000 Subject: Remove wtf-8/wtf-16/tcl-8 encodings --- doc/string.n | 4 +- generic/tclEncoding.c | 40 ++++++-------------- library/init.tcl | 4 +- tests/encoding.test | 100 +++++++++----------------------------------------- 4 files changed, 32 insertions(+), 116 deletions(-) diff --git a/doc/string.n b/doc/string.n index f1a0592..f3d7616 100644 --- a/doc/string.n +++ b/doc/string.n @@ -415,11 +415,11 @@ etc.) .PP \fICompatibility note:\fR This subcommand is deprecated and will be removed in Tcl 9.0. It is better to use the \fBencoding convertto\fR -command to convert a string to a known encoding (e.g. "wtf-8" or "tcl-8") +command to convert a string to a known encoding (e.g. "utf-8" or "cesu-8") and then apply \fBstring length\fR to that. .PP .CS -\fBstring length\fR [encoding convertto wtf-8 $theString] +\fBstring length\fR [encoding convertto utf-8 $theString] .CE .RE .TP diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 29aeefd..21c254e 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -511,11 +511,10 @@ FillEncodingFileMap(void) */ /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ -/* Since TCL_ENCODING_MODIFIED is only used for utf-8/wtf-8/cesu-8 and - * TCL_ENCODING_LE is only used for utf-16/wtf-16/ucs-2. re-use the same value */ +/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and + * TCL_ENCODING_LE is only used for utf-16/ucs-2. re-use the same value */ #define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ -#define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */ #define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ void @@ -560,15 +559,9 @@ TclInitEncodingSubsystem(void) type.nullSize = 1; type.clientData = INT2PTR(TCL_ENCODING_UTF); Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF); - type.encodingName = "wtf-8"; - Tcl_CreateEncoding(&type); type.clientData = INT2PTR(0); type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF|TCL_ENCODING_MODIFIED); - type.encodingName = "tcl-8"; - Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUcs2Proc; @@ -591,21 +584,12 @@ TclInitEncodingSubsystem(void) type.encodingName = "utf-16le"; type.clientData = INT2PTR(TCL_ENCODING_LE); Tcl_CreateEncoding(&type); - type.encodingName = "wtf-16le"; - type.clientData = INT2PTR(TCL_ENCODING_LE + TCL_ENCODING_WTF); - Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); - type.encodingName = "wtf-16be"; - type.clientData = INT2PTR(TCL_ENCODING_WTF); - Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); - type.encodingName = "wtf-16"; - type.clientData = INT2PTR(isLe.c + TCL_ENCODING_WTF); - Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED type.encodingName = "unicode"; @@ -2315,15 +2299,13 @@ UtfToUtfProc( len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { - if (!(flags & TCL_ENCODING_WTF)) { - if (flags & TCL_ENCODING_STOPONERROR) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; - } - if (!(flags & TCL_ENCODING_MODIFIED)) { - ch = 0xFFFD; - } + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; } cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); @@ -2334,7 +2316,7 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; - } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { + } else if (!Tcl_UniCharIsUnicode(ch)) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; @@ -2530,7 +2512,7 @@ UtfToUtf16Proc( break; } len = TclUtfToUCS4(src, &ch); - if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { + if (!Tcl_UniCharIsUnicode(ch)) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; diff --git a/library/init.tcl b/library/init.tcl index 749eed9..e30296e 100644 --- a/library/init.tcl +++ b/library/init.tcl @@ -214,9 +214,9 @@ proc unknown args { set errInfo [dict get $opts -errorinfo] set errCode [dict get $opts -errorcode] set cinfo $args - if {[string length [encoding convertto wtf-8 $cinfo]] > 150} { + if {[string length [encoding convertto utf-8 $cinfo]] > 150} { set cinfo [string range $cinfo 0 150] - while {[string length [encoding convertto wtf-8 $cinfo]] > 150} { + while {[string length [encoding convertto utf-8 $cinfo]] > 150} { set cinfo [string range $cinfo 0 end-1] } append cinfo ... diff --git a/tests/encoding.test b/tests/encoding.test index 9924886..82a2d6b 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -338,138 +338,78 @@ test encoding-15.5 {UtfToUtfProc emoji character input} { set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] list [string length $x] $y } "4 😂" -test encoding-15.6 {UtfToUtfProc emoji character output} { - set x \uDE02\uD83D\uDE02\uD83D - set y [encoding convertto wtf-8 \uDE02\uD83D\uDE02\uD83D] - binary scan $y H* z - list [string length $y] $z -} {10 edb882f09f9882eda0bd} -test encoding-15.7 {UtfToUtfProc emoji character output} { - set x \uDE02\uD83D\uD83D - set y [encoding convertto wtf-8 \uDE02\uD83D\uD83D] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {3 9 edb882eda0bdeda0bd} -test encoding-15.8 {UtfToUtfProc emoji character output} { - set x \uDE02\uD83Dé - set y [encoding convertto wtf-8 \uDE02\uD83Dé] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {3 8 edb882eda0bdc3a9} -test encoding-15.9 {UtfToUtfProc emoji character output} { - set x \uDE02\uD83DX - set y [encoding convertto wtf-8 \uDE02\uD83DX] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {3 7 edb882eda0bd58} -test encoding-15.10 {UtfToUtfProc high surrogate character output} { - set x \uDE02é - set y [encoding convertto wtf-8 \uDE02é] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {2 5 edb882c3a9} -test encoding-15.11 {UtfToUtfProc low surrogate character output} { - set x \uDA02é - set y [encoding convertto wtf-8 \uDA02é] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {2 5 eda882c3a9} -test encoding-15.12 {UtfToUtfProc high surrogate character output} { - set x \uDE02Y - set y [encoding convertto wtf-8 \uDE02Y] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {2 4 edb88259} -test encoding-15.13 {UtfToUtfProc low surrogate character output} { - set x \uDA02Y - set y [encoding convertto wtf-8 \uDA02Y] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {2 4 eda88259} -test encoding-15.14 {UtfToUtfProc high surrogate character output} { - set x \uDE02 - set y [encoding convertto wtf-8 \uDE02] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {1 3 edb882} -test encoding-15.15 {UtfToUtfProc low surrogate character output} { - set x \uDA02 - set y [encoding convertto wtf-8 \uDA02] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {1 3 eda882} -test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { +test encoding-15.6 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { set x \xF0\xA0\xA1\xC2 set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2] list [string length $x] $y } "4 \xF0\xA0\xA1\xC2" -test encoding-15.17 {UtfToUtfProc emoji character output} { +test encoding-15.7 {UtfToUtfProc emoji character output} { set x 😂 set y [encoding convertto utf-8 😂] binary scan $y H* z list [string length $y] $z } {4 f09f9882} -test encoding-15.18 {UtfToUtfProc emoji character output} { +test encoding-15.8 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uDE02\uD83D set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z } {10 efbfbdf09f9882efbfbd} -test encoding-15.19 {UtfToUtfProc emoji character output} { +test encoding-15.9 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uD83D set y [encoding convertto utf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z } {3 9 efbfbdefbfbdefbfbd} -test encoding-15.20 {UtfToUtfProc emoji character output} { +test encoding-15.10 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\xE9 set y [encoding convertto utf-8 \uDE02\uD83D\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {3 8 efbfbdefbfbdc3a9} -test encoding-15.21 {UtfToUtfProc emoji character output} { +test encoding-15.11 {UtfToUtfProc emoji character output} { set x \uDE02\uD83DX set y [encoding convertto utf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z } {3 7 efbfbdefbfbd58} -test encoding-15.22 {UtfToUtfProc high surrogate character output} { +test encoding-15.12 {UtfToUtfProc high surrogate character output} { set x \uDE02\xE9 set y [encoding convertto utf-8 \uDE02\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 efbfbdc3a9} -test encoding-15.23 {UtfToUtfProc low surrogate character output} { +test encoding-15.13 {UtfToUtfProc low surrogate character output} { set x \uDA02\xE9 set y [encoding convertto utf-8 \uDA02\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 efbfbdc3a9} -test encoding-15.24 {UtfToUtfProc high surrogate character output} { +test encoding-15.14 {UtfToUtfProc high surrogate character output} { set x \uDE02Y set y [encoding convertto utf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 efbfbd59} -test encoding-15.25 {UtfToUtfProc low surrogate character output} { +test encoding-15.15 {UtfToUtfProc low surrogate character output} { set x \uDA02Y set y [encoding convertto utf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 efbfbd59} -test encoding-15.26 {UtfToUtfProc high surrogate character output} { +test encoding-15.16 {UtfToUtfProc high surrogate character output} { set x \uDE02 set y [encoding convertto utf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 efbfbd} -test encoding-15.27 {UtfToUtfProc low surrogate character output} { +test encoding-15.17 {UtfToUtfProc low surrogate character output} { set x \uDA02 set y [encoding convertto utf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 efbfbd} -test encoding-15.28 {UtfToUtfProc CESU-8 6-byte sequence} { +test encoding-15.18 {UtfToUtfProc CESU-8 6-byte sequence} { set y [encoding convertto cesu-8 \U10000] binary scan $y H* z list [string length $y] $z @@ -499,19 +439,13 @@ test encoding-16.4 {Ucs2ToUtfProc} -body { test encoding-17.1 {UtfToUtf16Proc} -body { encoding convertto utf-16 "\U460DC" } -result "\xD8\xD8\xDC\xDC" -test encoding-17.2 {UtfToUtf16Proc} -body { - encoding convertto wtf-16 "\uDCDC" -} -result "\xDC\xDC" -test encoding-17.3 {UtfToUtf16Proc} -body { - encoding convertto wtf-16 "\uD8D8" -} -result "\xD8\xD8" -test encoding-17.4 {UtfToUcs2Proc} -body { +test encoding-17.2 {UtfToUcs2Proc} -body { encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"] } -result "\uFFFD" -test encoding-17.5 {UtfToUtf16Proc} -body { +test encoding-17.3 {UtfToUtf16Proc} -body { encoding convertto utf-16be "\uDCDC" } -result "\xFF\xFD" -test encoding-17.6 {UtfToUtf16Proc} -body { +test encoding-17.4 {UtfToUtf16Proc} -body { encoding convertto utf-16le "\uD8D8" } -result "\xFD\xFF" @@ -813,7 +747,7 @@ test encoding-28.0 {all encodings load} -body { llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 92 : 91}] +} -result [expr {[info exists ::tcl_precision] ? 87 : 86}] runtests -- cgit v0.12