From a5a30fa3e2b30971c18a067291c9144bdd22199f Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 10 Mar 2021 12:55:25 +0000 Subject: TIP #597 implementation: "string is unicode" and new wtf-8 encoding --- doc/UniCharIsAlpha.3 | 7 ++++- generic/tcl.decls | 4 +-- generic/tclCmdMZ.c | 11 ++++--- generic/tclCompCmdsSZ.c | 24 +++++++++------ generic/tclCompile.h | 3 +- generic/tclDecls.h | 13 +++++--- generic/tclEncoding.c | 19 ++++++++++-- generic/tclStubInit.c | 3 +- generic/tclUtf.c | 30 ++++++++++++++++++ tests/encoding.test | 82 ++++++++++++++++++++++++++++++++++++++++++------- tests/string.test | 26 ++++++++++++++-- 11 files changed, 182 insertions(+), 40 deletions(-) diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3 index 61490ed..20828e4 100644 --- a/doc/UniCharIsAlpha.3 +++ b/doc/UniCharIsAlpha.3 @@ -8,7 +8,7 @@ .so man.macros .BS .SH NAME -Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters +Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsUnicode, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters .SH SYNOPSIS .nf \fB#include \fR @@ -44,6 +44,9 @@ int \fBTcl_UniCharIsUpper\fR(\fIch\fR) .sp int +\fBTcl_UniCharIsUnicode\fR(\fIch\fR) +.sp +int \fBTcl_UniCharIsWordChar\fR(\fIch\fR) .SH ARGUMENTS .AS int ch @@ -81,6 +84,8 @@ with the various routines. .PP \fBTcl_UniCharIsUpper\fR tests if the character is an uppercase Unicode character. .PP +\fBTcl_UniCharIsUpper\fR tests if the character is a Unicode character. +.PP \fBTcl_UniCharIsWordChar\fR tests if the character is alphanumeric or a connector punctuation mark. diff --git a/generic/tcl.decls b/generic/tcl.decls index 51ece1a..03d43ce 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2412,8 +2412,8 @@ declare 652 { declare 653 { unsigned char *TclGetByteArrayFromObj(Tcl_Obj *objPtr, size_t *lengthPtr) } -declare 656 { - void TclUnusedStubEntry(void) +declare 657 { + int Tcl_UniCharIsUnicode(int ch) } # ----- BASELINE -- FOR -- 8.7.0 ----- # diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d020a93..61d1010 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1533,16 +1533,16 @@ StringIsCmd( "boolean", "dict", "digit", "double", "entier", "false", "graph", "integer", "list", "lower", "print", "punct", - "space", "true", "upper", "wideinteger", - "wordchar", "xdigit", NULL + "space", "true", "upper", "unicode", + "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL, STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, - STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, - STR_IS_WORD, STR_IS_XDIGIT + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT }; static const char *const isOptions[] = { "-strict", "-failindex", NULL @@ -1872,6 +1872,9 @@ StringIsCmd( case STR_IS_UPPER: chcomp = Tcl_UniCharIsUpper; break; + case STR_IS_UNICODE: + chcomp = Tcl_UniCharIsUnicode; + break; case STR_IS_WORD: chcomp = Tcl_UniCharIsWordChar; break; diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c index 0bac52b..a160625 100644 --- a/generic/tclCompCmdsSZ.c +++ b/generic/tclCompCmdsSZ.c @@ -505,19 +505,19 @@ TclCompileStringIsCmd( Tcl_Token *tokenPtr = TokenAfter(parsePtr->tokenPtr); static const char *const isClasses[] = { "alnum", "alpha", "ascii", "control", - "boolean", "dict", "digit", "double", "entier", - "false", "graph", "integer", "list", - "lower", "print", "punct", "space", - "true", "upper", "wideinteger", "wordchar", - "xdigit", NULL + "boolean", "dict", "digit", "double", + "entier", "false", "graph", "integer", + "list", "lower", "print", "punct", + "space", "true", "upper", "unicode", + "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL, - STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, - STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, - STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, STR_IS_SPACE, - STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, STR_IS_WORD, - STR_IS_XDIGIT + STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, + STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, + STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT }; int t, range, allowEmpty = 0, end; InstStringClassType strClassType; @@ -609,6 +609,9 @@ TclCompileStringIsCmd( case STR_IS_UPPER: strClassType = STR_CLASS_UPPER; goto compileStrClass; + case STR_IS_UNICODE: + strClassType = STR_CLASS_UNICODE; + goto compileStrClass; case STR_IS_WORD: strClassType = STR_CLASS_WORD; goto compileStrClass; @@ -1415,6 +1418,7 @@ StringClassDesc const tclStringClassTable[] = { {"upper", Tcl_UniCharIsUpper}, {"word", Tcl_UniCharIsWordChar}, {"xdigit", UniCharIsHexDigit}, + {"unicode", Tcl_UniCharIsUnicode}, {"", NULL} }; diff --git a/generic/tclCompile.h b/generic/tclCompile.h index 21a27f7..6a5faaf 100644 --- a/generic/tclCompile.h +++ b/generic/tclCompile.h @@ -922,8 +922,9 @@ typedef enum InstStringClassType { STR_CLASS_UPPER, /* Unicode upper-case alphabet characters. */ STR_CLASS_WORD, /* Unicode word (alphabetic, digit, connector * punctuation) characters. */ - STR_CLASS_XDIGIT /* Characters that can be used as digits in + STR_CLASS_XDIGIT, /* Characters that can be used as digits in * hexadecimal numbers ([0-9A-Fa-f]). */ + STR_CLASS_UNICODE /* Unicode characters. */ } InstStringClassType; typedef struct StringClassDesc { diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 4b91817..bb31355 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1933,8 +1933,9 @@ EXTERN unsigned char * TclGetByteArrayFromObj(Tcl_Obj *objPtr, size_t *lengthPtr); /* Slot 654 is reserved */ /* Slot 655 is reserved */ -/* 656 */ -EXTERN void TclUnusedStubEntry(void); +/* Slot 656 is reserved */ +/* 657 */ +EXTERN int Tcl_UniCharIsUnicode(int ch); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2626,7 +2627,8 @@ typedef struct TclStubs { unsigned char * (*tclGetByteArrayFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 653 */ void (*reserved654)(void); void (*reserved655)(void); - void (*tclUnusedStubEntry) (void); /* 656 */ + void (*reserved656)(void); + int (*tcl_UniCharIsUnicode) (int ch); /* 657 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -3965,8 +3967,9 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tclGetByteArrayFromObj) /* 653 */ /* Slot 654 is reserved */ /* Slot 655 is reserved */ -#define TclUnusedStubEntry \ - (tclStubsPtr->tclUnusedStubEntry) /* 656 */ +/* Slot 656 is reserved */ +#define Tcl_UniCharIsUnicode \ + (tclStubsPtr->tcl_UniCharIsUnicode) /* 657 */ #endif /* defined(USE_TCL_STUBS) */ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index c4ef159..d994f10 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -517,6 +517,9 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ +#define FLAG_LE 1 +#define FLAG_WTF 2 + void TclInitEncodingSubsystem(void) { @@ -559,13 +562,16 @@ TclInitEncodingSubsystem(void) type.nullSize = 1; type.clientData = NULL; Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(FLAG_WTF); + type.encodingName = "wtf-8"; + Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUcs2Proc; type.freeProc = NULL; type.nullSize = 2; type.encodingName = "ucs-2le"; - type.clientData = INT2PTR(1); + type.clientData = INT2PTR(FLAG_LE); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2be"; type.clientData = INT2PTR(0); @@ -579,7 +585,7 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "utf-16le"; - type.clientData = INT2PTR(1); + type.clientData = INT2PTR(FLAG_LE); Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; type.clientData = INT2PTR(0); @@ -2271,7 +2277,7 @@ UtfExtToUtfIntProc( static int UtfToUtfProc( - TCL_UNUSED(ClientData), + void *clientData, /* flags */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2302,6 +2308,7 @@ UtfToUtfProc( const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; + int encflags = PTR2INT(clientData); int *chPtr = (int *) statePtr; if (flags & TCL_ENCODING_START) { @@ -2370,6 +2377,9 @@ UtfToUtfProc( int low = *chPtr; size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) { + if ((pureNullMode == 1) && !(encflags & FLAG_WTF)) { + *chPtr = 0xFFFD; + } *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); *dst++ = (char) ((*chPtr | 0x80) & 0xBF); @@ -2378,6 +2388,9 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(*chPtr, dst); *chPtr = low; + } else if ((pureNullMode == 1) && !(encflags & FLAG_WTF) + && !Tcl_UniCharIsUnicode(*chPtr)) { + *chPtr = 0xFFFD; } dst += Tcl_UniCharToUtf(*chPtr, dst); } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index fa5562d..cd1ce81 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -1905,7 +1905,8 @@ const TclStubs tclStubs = { TclGetByteArrayFromObj, /* 653 */ 0, /* 654 */ 0, /* 655 */ - TclUnusedStubEntry, /* 656 */ + 0, /* 656 */ + Tcl_UniCharIsUnicode, /* 657 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e096c06..2687a1d 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2187,6 +2187,36 @@ Tcl_UniCharIsUpper( /* *---------------------------------------------------------------------- * + * Tcl_UniCharIsUnicode -- + * + * Test if a character is a Unicode character. + * + * Results: + * Returns non-zero if character belongs to the Unicode set. + * + * Excluded are: + * 1) All characters > U+10FFFF + * 2) Surrogates U+D800 - U+DFFF + * 3) Last 2 characters of each plane, so U+??FFFE and U+??FFFF + * 4) The characters in the range U+FDD0 - U+FDEF + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsUnicode( + int ch) /* Unicode character to test. */ +{ + return ((unsigned int)ch <= 0x10FFFF) && ((ch & 0xFFF800) != 0xD800) + && ((ch & 0xFFFE) != 0xFFFE) && ((unsigned int)(ch - 0xFDD0) >= 32); +} + +/* + *---------------------------------------------------------------------- + * * Tcl_UniCharIsWordChar -- * * Test if a character is alphanumeric or a connector punctuation mark. diff --git a/tests/encoding.test b/tests/encoding.test index b1150c6..76b830d 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -340,61 +340,61 @@ test encoding-15.5 {UtfToUtfProc emoji character input} { } "4 😂" test encoding-15.6 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uDE02\uD83D - set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D] + set y [encoding convertto wtf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z } {10 edb882f09f9882eda0bd} test encoding-15.7 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uD83D - set y [encoding convertto utf-8 \uDE02\uD83D\uD83D] + set y [encoding convertto wtf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z } {3 9 edb882eda0bdeda0bd} test encoding-15.8 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\xE9 - set y [encoding convertto utf-8 \uDE02\uD83D\xE9] + set y [encoding convertto wtf-8 \uDE02\uD83D\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {3 8 edb882eda0bdc3a9} test encoding-15.9 {UtfToUtfProc emoji character output} { set x \uDE02\uD83DX - set y [encoding convertto utf-8 \uDE02\uD83DX] + set y [encoding convertto wtf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z } {3 7 edb882eda0bd58} test encoding-15.10 {UtfToUtfProc high surrogate character output} { set x \uDE02\xE9 - set y [encoding convertto utf-8 \uDE02\xE9] + set y [encoding convertto wtf-8 \uDE02\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 edb882c3a9} test encoding-15.11 {UtfToUtfProc low surrogate character output} { set x \uDA02\xE9 - set y [encoding convertto utf-8 \uDA02\xE9] + set y [encoding convertto wtf-8 \uDA02\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 eda882c3a9} test encoding-15.12 {UtfToUtfProc high surrogate character output} { set x \uDE02Y - set y [encoding convertto utf-8 \uDE02Y] + set y [encoding convertto wtf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 edb88259} test encoding-15.13 {UtfToUtfProc low surrogate character output} { set x \uDA02Y - set y [encoding convertto utf-8 \uDA02Y] + set y [encoding convertto wtf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 eda88259} test encoding-15.14 {UtfToUtfProc high surrogate character output} { set x \uDE02 - set y [encoding convertto utf-8 \uDE02] + set y [encoding convertto wtf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 edb882} test encoding-15.15 {UtfToUtfProc low surrogate character output} { set x \uDA02 - set y [encoding convertto utf-8 \uDA02] + set y [encoding convertto wtf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 eda882} @@ -409,6 +409,66 @@ test encoding-15.17 {UtfToUtfProc emoji character output} { binary scan $y H* z list [string length $y] $z } {4 f09f9882} +test encoding-15.18 {UtfToUtfProc emoji character output} { + set x \uDE02\uD83D\uDE02\uD83D + set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D] + binary scan $y H* z + list [string length $y] $z +} {10 efbfbdf09f9882efbfbd} +test encoding-15.19 {UtfToUtfProc emoji character output} { + set x \uDE02\uD83D\uD83D + set y [encoding convertto utf-8 \uDE02\uD83D\uD83D] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {3 9 efbfbdefbfbdefbfbd} +test encoding-15.20 {UtfToUtfProc emoji character output} { + set x \uDE02\uD83D\xE9 + set y [encoding convertto utf-8 \uDE02\uD83D\xE9] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {3 8 efbfbdefbfbdc3a9} +test encoding-15.21 {UtfToUtfProc emoji character output} { + set x \uDE02\uD83DX + set y [encoding convertto utf-8 \uDE02\uD83DX] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {3 7 efbfbdefbfbd58} +test encoding-15.22 {UtfToUtfProc high surrogate character output} { + set x \uDE02\xE9 + set y [encoding convertto utf-8 \uDE02\xE9] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 5 efbfbdc3a9} +test encoding-15.23 {UtfToUtfProc low surrogate character output} { + set x \uDA02\xE9 + set y [encoding convertto utf-8 \uDA02\xE9] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 5 efbfbdc3a9} +test encoding-15.24 {UtfToUtfProc high surrogate character output} { + set x \uDE02Y + set y [encoding convertto utf-8 \uDE02Y] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 4 efbfbd59} +test encoding-15.25 {UtfToUtfProc low surrogate character output} { + set x \uDA02Y + set y [encoding convertto utf-8 \uDA02Y] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 4 efbfbd59} +test encoding-15.26 {UtfToUtfProc high surrogate character output} { + set x \uDE02 + set y [encoding convertto utf-8 \uDE02] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {1 3 efbfbd} +test encoding-15.27 {UtfToUtfProc low surrogate character output} { + set x \uDA02 + set y [encoding convertto utf-8 \uDA02] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {1 3 efbfbd} test encoding-16.1 {Utf16ToUtfProc} -body { set val [encoding convertfrom utf-16 NN] @@ -742,7 +802,7 @@ test encoding-28.0 {all encodings load} -body { llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 86 : 85}] +} -result [expr {[info exists ::tcl_precision] ? 87 : 86}] runtests diff --git a/tests/string.test b/tests/string.test index 0eaa3da..b3b278c 100644 --- a/tests/string.test +++ b/tests/string.test @@ -525,10 +525,10 @@ test string-6.4.$noComp {string is, too many args} { } {1 {wrong # args: should be "string is class ?-strict? ?-failindex var? str"}} test string-6.5.$noComp {string is, class check} { list [catch {run {string is bogus str}} msg] $msg -} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}} +} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}} test string-6.6.$noComp {string is, ambiguous class} { list [catch {run {string is al str}} msg] $msg -} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}} +} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}} test string-6.7.$noComp {string is alpha, all ok} { run {string is alpha -strict -failindex var abc} } 1 @@ -961,6 +961,28 @@ test string-6.130.1.$noComp {string is entier, false on bad octal} { test string-6.131.$noComp {string is entier, false on bad hex} { list [run {string is entier -fail var 0X12345611234123456123456562345612345612345612345612345612345612345612345612345612345345XYZ}] $var } {0 88} +test string-6.132.$noComp {string is unicode} { + run {string is unicode \U10FFFD\uD7FF\uE000\uFDCF\uFDF0} +} 1 +test string-6.133.$noComp {string is unicode, upper surrogate} { + run {string is unicode \uD800} +} 0 +test string-6.134.$noComp {string is unicode, lower surrogate} { + run {string is unicode \uDFFF} +} 0 +test string-6.135.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFFFE} +} 0 +test string-6.136.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFFFF} +} 0 +test string-6.137.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFDD0} +} 0 +test string-6.138.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFDEF} +} 0 + test string-7.1.$noComp {string last, not enough args} { list [catch {run {string last a}} msg] $msg -- cgit v0.12 From b51e777fe970a9ffcf0919f7262a6d474aeeaf2b Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 10 Mar 2021 15:00:44 +0000 Subject: Fix documentation --- doc/UniCharIsAlpha.3 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3 index 20828e4..a07af9a 100644 --- a/doc/UniCharIsAlpha.3 +++ b/doc/UniCharIsAlpha.3 @@ -84,7 +84,8 @@ with the various routines. .PP \fBTcl_UniCharIsUpper\fR tests if the character is an uppercase Unicode character. .PP -\fBTcl_UniCharIsUpper\fR tests if the character is a Unicode character. +\fBTcl_UniCharIsUnicode\fR tests if the character is a Unicode character, not being +a surrogate or noncharacter. .PP \fBTcl_UniCharIsWordChar\fR tests if the character is alphanumeric or a connector punctuation mark. -- cgit v0.12 From 74f570670c3ca45a4ff87a051d9ecdadb396dc56 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 17 Mar 2021 12:46:31 +0000 Subject: Add wtf-16 encodings to the set. With testcases --- generic/tclEncoding.c | 119 ++++++++++++++++++++++---------------------------- tests/encoding.test | 12 +++-- 2 files changed, 62 insertions(+), 69 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 2198c33..2707972 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -587,12 +587,21 @@ TclInitEncodingSubsystem(void) type.encodingName = "utf-16le"; type.clientData = INT2PTR(FLAG_LE); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16le"; + type.clientData = INT2PTR(FLAG_LE + FLAG_WTF); + Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16be"; + type.clientData = INT2PTR(FLAG_WTF); + Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16"; + type.clientData = INT2PTR(isLe.c + FLAG_WTF); + Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED type.encodingName = "unicode"; @@ -2281,11 +2290,7 @@ UtfToUtfProc( const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2309,11 +2314,8 @@ UtfToUtfProc( const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; int encflags = PTR2INT(clientData); - int *chPtr = (int *) statePtr; + int ch; - if (flags & TCL_ENCODING_START) { - *statePtr = 0; - } result = TCL_OK; srcStart = src; @@ -2350,7 +2352,6 @@ UtfToUtfProc( */ *dst++ = *src++; - *chPtr = 0; /* reset surrogate handling */ } else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 && (src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) { /* @@ -2358,7 +2359,6 @@ UtfToUtfProc( */ *dst++ = 0; - *chPtr = 0; /* reset surrogate handling */ src += 2; } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* @@ -2367,32 +2367,32 @@ UtfToUtfProc( * incomplete char its bytes are made to represent themselves. */ - *chPtr = UCHAR(*src); + ch = UCHAR(*src); src += 1; - dst += Tcl_UniCharToUtf(*chPtr, dst); + dst += Tcl_UniCharToUtf(ch, dst); } else { - src += TclUtfToUCS4(src, chPtr); - if ((*chPtr | 0x7FF) == 0xDFFF) { + src += TclUtfToUCS4(src, &ch); + if ((ch | 0x7FF) == 0xDFFF) { /* A surrogate character is detected, handle especially */ - int low = *chPtr; + int low = ch; size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; - if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) { + if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { if ((pureNullMode == 1) && !(encflags & FLAG_WTF)) { - *chPtr = 0xFFFD; + ch = 0xFFFD; } - *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((*chPtr | 0x80) & 0xBF); + *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); + *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((ch | 0x80) & 0xBF); continue; } src += len; - dst += Tcl_UniCharToUtf(*chPtr, dst); - *chPtr = low; + dst += Tcl_UniCharToUtf(ch, dst); + ch = low; } else if ((pureNullMode == 1) && !(encflags & FLAG_WTF) - && !Tcl_UniCharIsUnicode(*chPtr)) { - *chPtr = 0xFFFD; + && !Tcl_UniCharIsUnicode(ch)) { + ch = 0xFFFD; } - dst += Tcl_UniCharToUtf(*chPtr, dst); + dst += Tcl_UniCharToUtf(ch, dst); } } @@ -2420,7 +2420,7 @@ UtfToUtfProc( static int Utf16ToUtfProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* flags */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2444,6 +2444,7 @@ Utf16ToUtfProc( const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; + int encflags = PTR2INT(clientData); unsigned short ch; if (flags & TCL_ENCODING_CHAR_LIMIT) { @@ -2457,7 +2458,7 @@ Utf16ToUtfProc( srcLen--; } /* If last code point is a high surrogate, we cannot handle that yet */ - if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { + if ((srcLen >= 2) && ((src[srcLen - ((encflags & FLAG_LE)?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; srcLen-= 2; } @@ -2474,7 +2475,7 @@ Utf16ToUtfProc( break; } - if (clientData) { + if (encflags & FLAG_LE) { ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); } else { ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); @@ -2515,15 +2516,11 @@ Utf16ToUtfProc( static int UtfToUtf16Proc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData1, /* flags */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ - Tcl_EncodingState *statePtr,/* Place for conversion routine to store state - * information used during a piecewise - * conversion. Contents of statePtr are - * initialized and/or reset by conversion - * routine under control of flags argument. */ + TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2542,11 +2539,9 @@ UtfToUtf16Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; + int encflags = PTR2INT(clientData1); + int ch; - if (flags & TCL_ENCODING_START) { - *statePtr = 0; - } srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; @@ -2572,38 +2567,30 @@ UtfToUtf16Proc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUniChar(src, chPtr); - - if (clientData) { -#if TCL_UTF_MAX > 3 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); + src += TclUtfToUCS4(src, &ch); + if (!(encflags & FLAG_WTF) && !Tcl_UniCharIsUnicode(ch)) { + ch = 0xFFFD; + } + if (encflags & FLAG_LE) { + if (ch <= 0xFFFF) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); } else { - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (*chPtr & 0xFF); - *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0x3) | 0xDC; } -#else - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); -#endif } else { -#if TCL_UTF_MAX > 3 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); + if (ch <= 0xFFFF) { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); } else { - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = ((*chPtr >> 8) & 0x3) | 0xDC; - *dst++ = (*chPtr & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = ((ch >> 8) & 0x3) | 0xDC; + *dst++ = (ch & 0xFF); } -#else - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); -#endif } } *srcReadPtr = src - srcStart; diff --git a/tests/encoding.test b/tests/encoding.test index 76b830d..43aecbb 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -495,14 +495,20 @@ test encoding-17.1 {UtfToUtf16Proc} -body { encoding convertto utf-16 "\U460DC" } -result "\xD8\xD8\xDC\xDC" test encoding-17.2 {UtfToUtf16Proc} -body { - encoding convertto utf-16 "\uDCDC" + encoding convertto wtf-16 "\uDCDC" } -result "\xDC\xDC" test encoding-17.3 {UtfToUtf16Proc} -body { - encoding convertto utf-16 "\uD8D8" + encoding convertto wtf-16 "\uD8D8" } -result "\xD8\xD8" test encoding-17.4 {UtfToUcs2Proc} -body { encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"] } -result "\uFFFD" +test encoding-17.5 {UtfToUtf16Proc} -body { + encoding convertto utf-16be "\uDCDC" +} -result "\xFF\xFD" +test encoding-17.6 {UtfToUtf16Proc} -body { + encoding convertto utf-16le "\uD8D8" +} -result "\xFD\xFF" test encoding-18.1 {TableToUtfProc} { } {} @@ -802,7 +808,7 @@ test encoding-28.0 {all encodings load} -body { llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 87 : 86}] +} -result [expr {[info exists ::tcl_precision] ? 90 : 89}] runtests -- cgit v0.12 From 1806e5755f1240beb778c171d3b7a2797276adf2 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 28 Mar 2021 22:01:51 +0000 Subject: Make a start with CESU-8 encoder/decoder. Not finished yet --- generic/tclEncoding.c | 25 +++++++++++++++++-------- tests/encoding.test | 2 +- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index b69f7fc..a158269 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -513,7 +513,8 @@ FillEncodingFileMap(void) /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ #define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ #define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */ -#define TCL_ENCODING_WTF 0x100 /* For wtf-8 encoding */ +#define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */ +#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ void TclInitEncodingSubsystem(void) @@ -555,11 +556,14 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = NULL; + type.clientData = INT2PTR(TCL_ENCODING_UTF); Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(TCL_ENCODING_WTF); + type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF); type.encodingName = "wtf-8"; Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(0); + type.encodingName = "cesu-8"; + Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUcs2Proc; @@ -1150,7 +1154,7 @@ Tcl_ExternalToUtfDString( srcLen = encodingPtr->lengthProc(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED; + flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; while (1) { result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, @@ -1266,7 +1270,7 @@ Tcl_ExternalToUtf( dstLen--; } - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; do { Tcl_EncodingState savedState = *statePtr; @@ -2221,8 +2225,8 @@ UtfToUtfProc( } dstStart = dst; - dstEnd = dst + dstLen - TCL_UTF_MAX; flags |= PTR2INT(clientData); + dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { @@ -2269,18 +2273,22 @@ UtfToUtfProc( src += 1; dst += Tcl_UniCharToUtf(ch, dst); } else { + int low; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) { result = TCL_CONVERT_SYNTAX; break; } src += len; - if ((ch | 0x7FF) == 0xDFFF) { + if (!(flags & TCL_ENCODING_UTF)) { + // TODO : handle chars > U+FFFF + goto cesu8; + } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ - int low = ch; + low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { @@ -2293,6 +2301,7 @@ UtfToUtfProc( ch = 0xFFFD; } } + cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); *dst++ = (char) ((ch | 0x80) & 0xBF); diff --git a/tests/encoding.test b/tests/encoding.test index 43aecbb..0ce009f 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -808,7 +808,7 @@ test encoding-28.0 {all encodings load} -body { llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 90 : 89}] +} -result [expr {[info exists ::tcl_precision] ? 91 : 90}] runtests -- cgit v0.12 From 958892fdd0e13a651556c8f6d06207ed5f759974 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 29 Mar 2021 09:51:54 +0000 Subject: Finish CESU-8 encoder/decoder --- generic/tclEncoding.c | 9 ++++++++- tests/encoding.test | 6 ++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index a158269..79f9896 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2281,7 +2281,14 @@ UtfToUtfProc( } src += len; if (!(flags & TCL_ENCODING_UTF)) { - // TODO : handle chars > U+FFFF + if (ch > 0xFFFF) { + /* CESU-8 6-byte sequence for chars > U+FFFF */ + ch -= 0x10000; + *dst++ = 0xED; + *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); + *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); + ch = (ch & 0x0CFF) | 0xDC00; + } goto cesu8; } else if ((ch | 0x7FF) == 0xDFFF) { /* diff --git a/tests/encoding.test b/tests/encoding.test index 0ce009f..2964a56 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -469,6 +469,12 @@ test encoding-15.27 {UtfToUtfProc low surrogate character output} { binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 efbfbd} +test encoding-15.28 {UtfToUtfProc CESU-8 6-byte sequence} { + set x \U10000 + set y [encoding convertto cesu-8 \U10000] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 6 eda080edb080} test encoding-16.1 {Utf16ToUtfProc} -body { set val [encoding convertfrom utf-16 NN] -- cgit v0.12 From 48a57e0116ab6513050d1e280014bde2010c8903 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 30 Mar 2021 14:14:02 +0000 Subject: Bugfix (backported from encodings-with-flags branch): Use correct byte/char positions in error-situation --- generic/tclEncoding.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 8f4612b..445e613 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -1161,13 +1161,12 @@ Tcl_ExternalToUtfDString( flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + src += srcRead; if (result != TCL_CONVERT_NOSPACE) { Tcl_DStringSetLength(dstPtr, soFar); return Tcl_DStringValue(dstPtr); } - flags &= ~TCL_ENCODING_START; - src += srcRead; srcLen -= srcRead; if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); @@ -1348,6 +1347,7 @@ Tcl_UtfToExternalDString( &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + src += srcRead; if (result != TCL_CONVERT_NOSPACE) { if (encodingPtr->nullSize == 2) { Tcl_DStringSetLength(dstPtr, soFar + 1); @@ -1357,7 +1357,6 @@ Tcl_UtfToExternalDString( } flags &= ~TCL_ENCODING_START; - src += srcRead; srcLen -= srcRead; if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); @@ -2274,6 +2273,7 @@ UtfToUtfProc( dst += Tcl_UniCharToUtf(ch, dst); } else { int low; + const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) { result = TCL_CONVERT_SYNTAX; @@ -2302,6 +2302,7 @@ UtfToUtfProc( if (!(flags & TCL_ENCODING_WTF)) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; + src = saveSrc; break; } if (!(flags & TCL_ENCODING_MODIFIED)) { @@ -2320,6 +2321,7 @@ UtfToUtfProc( } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; + src = saveSrc; break; } if (!(flags & TCL_ENCODING_MODIFIED)) { @@ -2483,7 +2485,7 @@ UtfToUtf16Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - int ch; + int ch, len; srcStart = src; srcEnd = src + srcLen; @@ -2511,7 +2513,7 @@ UtfToUtf16Proc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUCS4(src, &ch); + len = TclUtfToUCS4(src, &ch); if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; @@ -2519,6 +2521,7 @@ UtfToUtf16Proc( } ch = 0xFFFD; } + src += len; if (flags & TCL_ENCODING_LE) { if (ch <= 0xFFFF) { *dst++ = (ch & 0xFF); -- cgit v0.12 From 3175cb43a5f12eff36cc942e3b096164a5622574 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 31 Mar 2021 06:40:02 +0000 Subject: Re-use TCL_ENCODING_MODIFIED flag value for TCL_ENCODING_LE too, since they are used for different encoders. --- generic/tclEncoding.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 445e613..0100326 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -511,8 +511,10 @@ FillEncodingFileMap(void) */ /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ +/* Since TCL_ENCODING_MODIFIED is only used for utf-8/wtf-8/cesu-8 and + * TCL_ENCODING_LE is only used for utf-16/wtf-16/ucs-2. re-use the same value */ #define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ -#define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */ +#define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ #define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */ #define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ @@ -524,7 +526,7 @@ TclInitEncodingSubsystem(void) unsigned size; unsigned short i; union { - unsigned char c; + char c; short s; } isLe; @@ -1154,7 +1156,10 @@ Tcl_ExternalToUtfDString( srcLen = encodingPtr->lengthProc(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + flags = TCL_ENCODING_START | TCL_ENCODING_END; + if (encodingPtr->toUtfProc == UtfToUtfProc) { + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + } while (1) { result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, @@ -1269,7 +1274,9 @@ Tcl_ExternalToUtf( dstLen--; } - flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + if (encodingPtr->toUtfProc == UtfToUtfProc) { + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + } do { Tcl_EncodingState savedState = *statePtr; @@ -2185,7 +2192,7 @@ BinaryProc( static int UtfToUtfProc( - ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ -- cgit v0.12 From 6b90b986b2b9db1fa7d0d34fb406ce0b66f6889d Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 31 Mar 2021 06:53:27 +0000 Subject: Fix testcase for TCL_UTF_MAX=4 --- tests/encoding.test | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/encoding.test b/tests/encoding.test index 0a40805..071ac27 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -470,11 +470,10 @@ test encoding-15.27 {UtfToUtfProc low surrogate character output} { list [string length $x] [string length $y] $z } {1 3 efbfbd} test encoding-15.28 {UtfToUtfProc CESU-8 6-byte sequence} { - set x \U10000 set y [encoding convertto cesu-8 \U10000] binary scan $y H* z - list [string length $x] [string length $y] $z -} {2 6 eda080edb080} + list [string length $y] $z +} {6 eda080edb080} test encoding-16.1 {Utf16ToUtfProc} -body { set val [encoding convertfrom utf-16 NN] -- cgit v0.12 From 2e7b2436f1c5ea439341ba19e85964ae9444f76c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 8 Apr 2021 11:14:56 +0000 Subject: Add "tcl-8" encoding --- generic/tclEncoding.c | 3 +++ tests/encoding.test | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 0632839..29aeefd 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -566,6 +566,9 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(0); type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF|TCL_ENCODING_MODIFIED); + type.encodingName = "tcl-8"; + Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUcs2Proc; diff --git a/tests/encoding.test b/tests/encoding.test index 071ac27..9924886 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -813,7 +813,7 @@ test encoding-28.0 {all encodings load} -body { llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 91 : 90}] +} -result [expr {[info exists ::tcl_precision] ? 92 : 91}] runtests -- cgit v0.12 From aa4f7bcd47ef35e83a3a5a6c3f16398bf3758e49 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 9 Apr 2021 15:19:51 +0000 Subject: Deprecate/remove "string bytelength" --- doc/string.n | 12 ++++++------ generic/tclCmdMZ.c | 4 ++++ library/init.tcl | 4 ++-- tests/info.test | 4 ++-- tests/regexp.test | 3 ++- tests/regexpComp.test | 4 +++- tests/string.test | 19 ++++++++++--------- 7 files changed, 29 insertions(+), 21 deletions(-) diff --git a/doc/string.n b/doc/string.n index 7cd53ca..f1a0592 100644 --- a/doc/string.n +++ b/doc/string.n @@ -404,7 +404,7 @@ Formally, the \fBstring bytelength\fR operation returns the content of the \fIlength\fR field of the \fBTcl_Obj\fR structure, after calling \fBTcl_GetString\fR to ensure that the \fIbytes\fR field is populated. This is highly unlikely to be useful to Tcl scripts, as Tcl's internal -encoding is not strict UTF\-8, but rather a modified CESU\-8 with a +encoding is not strict UTF\-8, but rather a modified WTF\-8 with a denormalized NUL (identical to that used in a number of places by Java's serialization mechanism) to enable basic processing with non-Unicode-aware C functions. As this representation should only @@ -413,13 +413,13 @@ store the representation is of very low value (except to C extension code, which has direct access for the purpose of memory management, etc.) .PP -\fICompatibility note:\fR it is likely that this subcommand will be -withdrawn in a future version of Tcl. It is better to use the -\fBencoding convertto\fR command to convert a string to a known -encoding and then apply \fBstring length\fR to that. +\fICompatibility note:\fR This subcommand is deprecated and will +be removed in Tcl 9.0. It is better to use the \fBencoding convertto\fR +command to convert a string to a known encoding (e.g. "wtf-8" or "tcl-8") +and then apply \fBstring length\fR to that. .PP .CS -\fBstring length\fR [encoding convertto utf-8 $theString] +\fBstring length\fR [encoding convertto wtf-8 $theString] .CE .RE .TP diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 61d1010..6f71198 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2835,6 +2835,7 @@ StringCatCmd( * *---------------------------------------------------------------------- */ +#if TCL_MAJOR_VERSION < 9 && !defined(TCL_NO_DEPRECATED) static int StringBytesCmd( TCL_UNUSED(ClientData), @@ -2853,6 +2854,7 @@ StringBytesCmd( Tcl_SetObjResult(interp, Tcl_NewWideIntObj(length)); return TCL_OK; } +#endif /* *---------------------------------------------------------------------- @@ -3309,7 +3311,9 @@ TclInitStringCmd( Tcl_Interp *interp) /* Current interpreter. */ { static const EnsembleImplMap stringImplMap[] = { +#if TCL_MAJOR_VERSION < 9 && !defined(TCL_NO_DEPRECATED) {"bytelength", StringBytesCmd, TclCompileBasic1ArgCmd, NULL, NULL, 0}, +#endif {"cat", StringCatCmd, TclCompileStringCatCmd, NULL, NULL, 0}, {"compare", StringCmpCmd, TclCompileStringCmpCmd, NULL, NULL, 0}, {"equal", StringEqualCmd, TclCompileStringEqualCmd, NULL, NULL, 0}, diff --git a/library/init.tcl b/library/init.tcl index c9bfff6..749eed9 100644 --- a/library/init.tcl +++ b/library/init.tcl @@ -214,9 +214,9 @@ proc unknown args { set errInfo [dict get $opts -errorinfo] set errCode [dict get $opts -errorcode] set cinfo $args - if {[string bytelength $cinfo] > 150} { + if {[string length [encoding convertto wtf-8 $cinfo]] > 150} { set cinfo [string range $cinfo 0 150] - while {[string bytelength $cinfo] > 150} { + while {[string length [encoding convertto wtf-8 $cinfo]] > 150} { set cinfo [string range $cinfo 0 end-1] } append cinfo ... diff --git a/tests/info.test b/tests/info.test index d9a4f54..ced4435 100644 --- a/tests/info.test +++ b/tests/info.test @@ -22,7 +22,7 @@ if {{::tcltest} ni [namespace children]} { ::tcltest::loadTestedCommands catch [list package require -exact tcl::test [info patchlevel]] testConstraint zlib [llength [info commands zlib]] - +testConstraint nodep [info exists tcl_precision] # Set up namespaces needed to test operation of "info args", "info body", # "info default", and "info procs" with imported procedures. @@ -101,7 +101,7 @@ test info-2.5 {info body option, returning bytecompiled bodies} -body { # Fix for problem tested for in info-2.5 caused problems when # procedure body had no string rep (i.e. was not yet bytecode) # causing an empty string to be returned [Bug #545644] -test info-2.6 {info body option, returning list bodies} { +test info-2.6 {info body option, returning list bodies} nodep { proc foo args [list subst bar] list [string bytelength [info body foo]] \ [foo; string bytelength [info body foo]] diff --git a/tests/regexp.test b/tests/regexp.test index e788b7f..6bed21e 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -19,6 +19,7 @@ if {"::tcltest" ni [namespace children]} { unset -nocomplain foo testConstraint exec [llength [info commands exec]] +testConstraint nodep [info exists tcl_precision] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] @@ -765,7 +766,7 @@ test regexp-19.2 {regsub null replacement} { string equal $result $expected } 1 -test regexp-20.1 {regsub shared object shimmering} -body { +test regexp-20.1 {regsub shared object shimmering} -constraints nodep -body { # Bug #461322 set a abcdefghijklmnopqurstuvwxyz set b $a diff --git a/tests/regexpComp.test b/tests/regexpComp.test index 76e708d..1587c72 100644 --- a/tests/regexpComp.test +++ b/tests/regexpComp.test @@ -16,6 +16,8 @@ if {"::tcltest" ni [namespace children]} { namespace import -force ::tcltest::* } +testConstraint nodep [info exists tcl_precision] + # Procedure to evaluate a script within a proc, to test compilation # functionality @@ -791,7 +793,7 @@ test regexpComp-19.1 {regsub null replacement} { } } "\0a\0hel\0a\0lo\0a\0 14" -test regexpComp-20.1 {regsub shared object shimmering} { +test regexpComp-20.1 {regsub shared object shimmering} nodep { evalInProc { # Bug #461322 set a abcdefghijklmnopqurstuvwxyz diff --git a/tests/string.test b/tests/string.test index 72aeb43..99c1517 100644 --- a/tests/string.test +++ b/tests/string.test @@ -33,6 +33,7 @@ testConstraint testindexobj [expr {[info commands testindexobj] ne {}}] testConstraint testevalex [expr {[info commands testevalex] ne {}}] testConstraint utf16 [expr {[string length \U010000] == 2}] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint nodep [info exists tcl_precision] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] @@ -72,9 +73,9 @@ if {$noComp} { } -test string-1.1.$noComp {error conditions} { +test string-1.1.$noComp {error conditions} -body { list [catch {run {string gorp a b}} msg] $msg -} {1 {unknown or ambiguous subcommand "gorp": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -match glob -result {1 {unknown or ambiguous subcommand "gorp": must be *cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-1.2.$noComp {error conditions} { list [catch {run {string}} msg] $msg } {1 {wrong # args: should be "string subcommand ?arg ...?"}} @@ -1035,16 +1036,16 @@ test string-7.16.$noComp {string last, start index} { run {string last Üa ÜadÜad end-1} } 3 -test string-8.1.$noComp {string bytelength} { +test string-8.1.$noComp {string bytelength} nodep { list [catch {run {string bytelength}} msg] $msg } {1 {wrong # args: should be "string bytelength string"}} -test string-8.2.$noComp {string bytelength} { +test string-8.2.$noComp {string bytelength} nodep { list [catch {run {string bytelength a b}} msg] $msg } {1 {wrong # args: should be "string bytelength string"}} -test string-8.3.$noComp {string bytelength} { +test string-8.3.$noComp {string bytelength} nodep { run {string bytelength "\xC7"} } 2 -test string-8.4.$noComp {string bytelength} { +test string-8.4.$noComp {string bytelength} nodep { run {string b ""} } 0 @@ -1827,9 +1828,9 @@ test string-19.3.$noComp {string trimleft, unicode default} { test string-20.1.$noComp {string trimright errors} { list [catch {run {string trimright}} msg] $msg } {1 {wrong # args: should be "string trimright string ?chars?"}} -test string-20.2.$noComp {string trimright errors} { +test string-20.2.$noComp {string trimright errors} -body { list [catch {run {string trimg a}} msg] $msg -} {1 {unknown or ambiguous subcommand "trimg": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -match glob -result {1 {unknown or ambiguous subcommand "trimg": must be *cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-20.3.$noComp {string trimright} { run {string trimright " XYZ "} } { XYZ} @@ -1949,7 +1950,7 @@ test string-21.25.$noComp {string trimright, unicode} { test string-22.1.$noComp {string wordstart} -body { list [catch {run {string word a}} msg] $msg -} -result {1 {unknown or ambiguous subcommand "word": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -match glob -result {1 {unknown or ambiguous subcommand "word": must be *cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-22.2.$noComp {string wordstart} -body { list [catch {run {string wordstart a}} msg] $msg } -result {1 {wrong # args: should be "string wordstart string index"}} -- cgit v0.12 From 3c9c7e062138b5f21935974d667eec0ae10c346c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 16 Apr 2021 20:34:33 +0000 Subject: Remove wtf-8/wtf-16/tcl-8 encodings --- doc/string.n | 4 +- generic/tclEncoding.c | 40 ++++++-------------- library/init.tcl | 4 +- tests/encoding.test | 100 +++++++++----------------------------------------- 4 files changed, 32 insertions(+), 116 deletions(-) diff --git a/doc/string.n b/doc/string.n index f1a0592..f3d7616 100644 --- a/doc/string.n +++ b/doc/string.n @@ -415,11 +415,11 @@ etc.) .PP \fICompatibility note:\fR This subcommand is deprecated and will be removed in Tcl 9.0. It is better to use the \fBencoding convertto\fR -command to convert a string to a known encoding (e.g. "wtf-8" or "tcl-8") +command to convert a string to a known encoding (e.g. "utf-8" or "cesu-8") and then apply \fBstring length\fR to that. .PP .CS -\fBstring length\fR [encoding convertto wtf-8 $theString] +\fBstring length\fR [encoding convertto utf-8 $theString] .CE .RE .TP diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 29aeefd..21c254e 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -511,11 +511,10 @@ FillEncodingFileMap(void) */ /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ -/* Since TCL_ENCODING_MODIFIED is only used for utf-8/wtf-8/cesu-8 and - * TCL_ENCODING_LE is only used for utf-16/wtf-16/ucs-2. re-use the same value */ +/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and + * TCL_ENCODING_LE is only used for utf-16/ucs-2. re-use the same value */ #define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ -#define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */ #define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ void @@ -560,15 +559,9 @@ TclInitEncodingSubsystem(void) type.nullSize = 1; type.clientData = INT2PTR(TCL_ENCODING_UTF); Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF); - type.encodingName = "wtf-8"; - Tcl_CreateEncoding(&type); type.clientData = INT2PTR(0); type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF|TCL_ENCODING_MODIFIED); - type.encodingName = "tcl-8"; - Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUcs2Proc; @@ -591,21 +584,12 @@ TclInitEncodingSubsystem(void) type.encodingName = "utf-16le"; type.clientData = INT2PTR(TCL_ENCODING_LE); Tcl_CreateEncoding(&type); - type.encodingName = "wtf-16le"; - type.clientData = INT2PTR(TCL_ENCODING_LE + TCL_ENCODING_WTF); - Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); - type.encodingName = "wtf-16be"; - type.clientData = INT2PTR(TCL_ENCODING_WTF); - Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); - type.encodingName = "wtf-16"; - type.clientData = INT2PTR(isLe.c + TCL_ENCODING_WTF); - Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED type.encodingName = "unicode"; @@ -2315,15 +2299,13 @@ UtfToUtfProc( len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { - if (!(flags & TCL_ENCODING_WTF)) { - if (flags & TCL_ENCODING_STOPONERROR) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; - } - if (!(flags & TCL_ENCODING_MODIFIED)) { - ch = 0xFFFD; - } + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; } cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); @@ -2334,7 +2316,7 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; - } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { + } else if (!Tcl_UniCharIsUnicode(ch)) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; @@ -2530,7 +2512,7 @@ UtfToUtf16Proc( break; } len = TclUtfToUCS4(src, &ch); - if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { + if (!Tcl_UniCharIsUnicode(ch)) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; diff --git a/library/init.tcl b/library/init.tcl index 749eed9..e30296e 100644 --- a/library/init.tcl +++ b/library/init.tcl @@ -214,9 +214,9 @@ proc unknown args { set errInfo [dict get $opts -errorinfo] set errCode [dict get $opts -errorcode] set cinfo $args - if {[string length [encoding convertto wtf-8 $cinfo]] > 150} { + if {[string length [encoding convertto utf-8 $cinfo]] > 150} { set cinfo [string range $cinfo 0 150] - while {[string length [encoding convertto wtf-8 $cinfo]] > 150} { + while {[string length [encoding convertto utf-8 $cinfo]] > 150} { set cinfo [string range $cinfo 0 end-1] } append cinfo ... diff --git a/tests/encoding.test b/tests/encoding.test index 9924886..82a2d6b 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -338,138 +338,78 @@ test encoding-15.5 {UtfToUtfProc emoji character input} { set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] list [string length $x] $y } "4 😂" -test encoding-15.6 {UtfToUtfProc emoji character output} { - set x \uDE02\uD83D\uDE02\uD83D - set y [encoding convertto wtf-8 \uDE02\uD83D\uDE02\uD83D] - binary scan $y H* z - list [string length $y] $z -} {10 edb882f09f9882eda0bd} -test encoding-15.7 {UtfToUtfProc emoji character output} { - set x \uDE02\uD83D\uD83D - set y [encoding convertto wtf-8 \uDE02\uD83D\uD83D] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {3 9 edb882eda0bdeda0bd} -test encoding-15.8 {UtfToUtfProc emoji character output} { - set x \uDE02\uD83Dé - set y [encoding convertto wtf-8 \uDE02\uD83Dé] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {3 8 edb882eda0bdc3a9} -test encoding-15.9 {UtfToUtfProc emoji character output} { - set x \uDE02\uD83DX - set y [encoding convertto wtf-8 \uDE02\uD83DX] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {3 7 edb882eda0bd58} -test encoding-15.10 {UtfToUtfProc high surrogate character output} { - set x \uDE02é - set y [encoding convertto wtf-8 \uDE02é] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {2 5 edb882c3a9} -test encoding-15.11 {UtfToUtfProc low surrogate character output} { - set x \uDA02é - set y [encoding convertto wtf-8 \uDA02é] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {2 5 eda882c3a9} -test encoding-15.12 {UtfToUtfProc high surrogate character output} { - set x \uDE02Y - set y [encoding convertto wtf-8 \uDE02Y] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {2 4 edb88259} -test encoding-15.13 {UtfToUtfProc low surrogate character output} { - set x \uDA02Y - set y [encoding convertto wtf-8 \uDA02Y] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {2 4 eda88259} -test encoding-15.14 {UtfToUtfProc high surrogate character output} { - set x \uDE02 - set y [encoding convertto wtf-8 \uDE02] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {1 3 edb882} -test encoding-15.15 {UtfToUtfProc low surrogate character output} { - set x \uDA02 - set y [encoding convertto wtf-8 \uDA02] - binary scan $y H* z - list [string length $x] [string length $y] $z -} {1 3 eda882} -test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { +test encoding-15.6 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { set x \xF0\xA0\xA1\xC2 set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2] list [string length $x] $y } "4 \xF0\xA0\xA1\xC2" -test encoding-15.17 {UtfToUtfProc emoji character output} { +test encoding-15.7 {UtfToUtfProc emoji character output} { set x 😂 set y [encoding convertto utf-8 😂] binary scan $y H* z list [string length $y] $z } {4 f09f9882} -test encoding-15.18 {UtfToUtfProc emoji character output} { +test encoding-15.8 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uDE02\uD83D set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z } {10 efbfbdf09f9882efbfbd} -test encoding-15.19 {UtfToUtfProc emoji character output} { +test encoding-15.9 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uD83D set y [encoding convertto utf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z } {3 9 efbfbdefbfbdefbfbd} -test encoding-15.20 {UtfToUtfProc emoji character output} { +test encoding-15.10 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\xE9 set y [encoding convertto utf-8 \uDE02\uD83D\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {3 8 efbfbdefbfbdc3a9} -test encoding-15.21 {UtfToUtfProc emoji character output} { +test encoding-15.11 {UtfToUtfProc emoji character output} { set x \uDE02\uD83DX set y [encoding convertto utf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z } {3 7 efbfbdefbfbd58} -test encoding-15.22 {UtfToUtfProc high surrogate character output} { +test encoding-15.12 {UtfToUtfProc high surrogate character output} { set x \uDE02\xE9 set y [encoding convertto utf-8 \uDE02\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 efbfbdc3a9} -test encoding-15.23 {UtfToUtfProc low surrogate character output} { +test encoding-15.13 {UtfToUtfProc low surrogate character output} { set x \uDA02\xE9 set y [encoding convertto utf-8 \uDA02\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 efbfbdc3a9} -test encoding-15.24 {UtfToUtfProc high surrogate character output} { +test encoding-15.14 {UtfToUtfProc high surrogate character output} { set x \uDE02Y set y [encoding convertto utf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 efbfbd59} -test encoding-15.25 {UtfToUtfProc low surrogate character output} { +test encoding-15.15 {UtfToUtfProc low surrogate character output} { set x \uDA02Y set y [encoding convertto utf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 efbfbd59} -test encoding-15.26 {UtfToUtfProc high surrogate character output} { +test encoding-15.16 {UtfToUtfProc high surrogate character output} { set x \uDE02 set y [encoding convertto utf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 efbfbd} -test encoding-15.27 {UtfToUtfProc low surrogate character output} { +test encoding-15.17 {UtfToUtfProc low surrogate character output} { set x \uDA02 set y [encoding convertto utf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 efbfbd} -test encoding-15.28 {UtfToUtfProc CESU-8 6-byte sequence} { +test encoding-15.18 {UtfToUtfProc CESU-8 6-byte sequence} { set y [encoding convertto cesu-8 \U10000] binary scan $y H* z list [string length $y] $z @@ -499,19 +439,13 @@ test encoding-16.4 {Ucs2ToUtfProc} -body { test encoding-17.1 {UtfToUtf16Proc} -body { encoding convertto utf-16 "\U460DC" } -result "\xD8\xD8\xDC\xDC" -test encoding-17.2 {UtfToUtf16Proc} -body { - encoding convertto wtf-16 "\uDCDC" -} -result "\xDC\xDC" -test encoding-17.3 {UtfToUtf16Proc} -body { - encoding convertto wtf-16 "\uD8D8" -} -result "\xD8\xD8" -test encoding-17.4 {UtfToUcs2Proc} -body { +test encoding-17.2 {UtfToUcs2Proc} -body { encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"] } -result "\uFFFD" -test encoding-17.5 {UtfToUtf16Proc} -body { +test encoding-17.3 {UtfToUtf16Proc} -body { encoding convertto utf-16be "\uDCDC" } -result "\xFF\xFD" -test encoding-17.6 {UtfToUtf16Proc} -body { +test encoding-17.4 {UtfToUtf16Proc} -body { encoding convertto utf-16le "\uD8D8" } -result "\xFD\xFF" @@ -813,7 +747,7 @@ test encoding-28.0 {all encodings load} -body { llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 92 : 91}] +} -result [expr {[info exists ::tcl_precision] ? 87 : 86}] runtests -- cgit v0.12 From 82d0339a6592bd7855cc08df656c252dda1352c5 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 16 Apr 2021 20:45:56 +0000 Subject: renumber testcases --- tests/encoding.test | 54 ++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/encoding.test b/tests/encoding.test index 82a2d6b..e1c55d7 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -338,77 +338,77 @@ test encoding-15.5 {UtfToUtfProc emoji character input} { set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] list [string length $x] $y } "4 😂" -test encoding-15.6 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { - set x \xF0\xA0\xA1\xC2 - set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2] - list [string length $x] $y -} "4 \xF0\xA0\xA1\xC2" -test encoding-15.7 {UtfToUtfProc emoji character output} { - set x 😂 - set y [encoding convertto utf-8 😂] - binary scan $y H* z - list [string length $y] $z -} {4 f09f9882} -test encoding-15.8 {UtfToUtfProc emoji character output} { +test encoding-15.6 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uDE02\uD83D set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z } {10 efbfbdf09f9882efbfbd} -test encoding-15.9 {UtfToUtfProc emoji character output} { +test encoding-15.7 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uD83D set y [encoding convertto utf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z } {3 9 efbfbdefbfbdefbfbd} -test encoding-15.10 {UtfToUtfProc emoji character output} { - set x \uDE02\uD83D\xE9 - set y [encoding convertto utf-8 \uDE02\uD83D\xE9] +test encoding-15.8 {UtfToUtfProc emoji character output} { + set x \uDE02\uD83Dé + set y [encoding convertto utf-8 \uDE02\uD83Dé] binary scan $y H* z list [string length $x] [string length $y] $z } {3 8 efbfbdefbfbdc3a9} -test encoding-15.11 {UtfToUtfProc emoji character output} { +test encoding-15.9 {UtfToUtfProc emoji character output} { set x \uDE02\uD83DX set y [encoding convertto utf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z } {3 7 efbfbdefbfbd58} -test encoding-15.12 {UtfToUtfProc high surrogate character output} { - set x \uDE02\xE9 - set y [encoding convertto utf-8 \uDE02\xE9] +test encoding-15.10 {UtfToUtfProc high surrogate character output} { + set x \uDE02é + set y [encoding convertto utf-8 \uDE02é] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 efbfbdc3a9} -test encoding-15.13 {UtfToUtfProc low surrogate character output} { - set x \uDA02\xE9 - set y [encoding convertto utf-8 \uDA02\xE9] +test encoding-15.11 {UtfToUtfProc low surrogate character output} { + set x \uDA02é + set y [encoding convertto utf-8 \uDA02é] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 efbfbdc3a9} -test encoding-15.14 {UtfToUtfProc high surrogate character output} { +test encoding-15.12 {UtfToUtfProc high surrogate character output} { set x \uDE02Y set y [encoding convertto utf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 efbfbd59} -test encoding-15.15 {UtfToUtfProc low surrogate character output} { +test encoding-15.13 {UtfToUtfProc low surrogate character output} { set x \uDA02Y set y [encoding convertto utf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 efbfbd59} -test encoding-15.16 {UtfToUtfProc high surrogate character output} { +test encoding-15.14 {UtfToUtfProc high surrogate character output} { set x \uDE02 set y [encoding convertto utf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 efbfbd} -test encoding-15.17 {UtfToUtfProc low surrogate character output} { +test encoding-15.15 {UtfToUtfProc low surrogate character output} { set x \uDA02 set y [encoding convertto utf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 efbfbd} +test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { + set x \xF0\xA0\xA1\xC2 + set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2] + list [string length $x] $y +} "4 \xF0\xA0\xA1\xC2" +test encoding-15.17 {UtfToUtfProc emoji character output} { + set x 😂 + set y [encoding convertto utf-8 😂] + binary scan $y H* z + list [string length $y] $z +} {4 f09f9882} test encoding-15.18 {UtfToUtfProc CESU-8 6-byte sequence} { set y [encoding convertto cesu-8 \U10000] binary scan $y H* z -- cgit v0.12 From 3bfe6dba24b7c5f8f9f8669cb99e2bfb80b0e5da Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 19 Apr 2021 07:24:18 +0000 Subject: More testcases (cesu-8) --- tests/encoding.test | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/encoding.test b/tests/encoding.test index e1c55d7..0e80c09 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -414,6 +414,21 @@ test encoding-15.18 {UtfToUtfProc CESU-8 6-byte sequence} { binary scan $y H* z list [string length $y] $z } {6 eda080edb080} +test encoding-15.19 {UtfToUtfProc CESU-8 upper surrogate} { + set y [encoding convertto cesu-8 \uD800] + binary scan $y H* z + list [string length $y] $z +} {3 eda080} +test encoding-15.20 {UtfToUtfProc CESU-8 lower surrogate} { + set y [encoding convertto cesu-8 \uDC00] + binary scan $y H* z + list [string length $y] $z +} {3 edb080} +test encoding-15.21 {UtfToUtfProc CESU-8 noncharacter} { + set y [encoding convertto cesu-8 \uFFFF] + binary scan $y H* z + list [string length $y] $z +} {3 efbfbf} test encoding-16.1 {Utf16ToUtfProc} -body { set val [encoding convertfrom utf-16 NN] -- cgit v0.12