diff options
-rw-r--r-- | doc/UniCharIsAlpha.3 | 7 | ||||
-rw-r--r-- | generic/tcl.decls | 4 | ||||
-rw-r--r-- | generic/tclCmdMZ.c | 11 | ||||
-rw-r--r-- | generic/tclCompCmdsSZ.c | 24 | ||||
-rw-r--r-- | generic/tclCompile.h | 3 | ||||
-rw-r--r-- | generic/tclDecls.h | 13 | ||||
-rw-r--r-- | generic/tclEncoding.c | 19 | ||||
-rw-r--r-- | generic/tclStubInit.c | 3 | ||||
-rw-r--r-- | generic/tclUtf.c | 30 | ||||
-rw-r--r-- | tests/encoding.test | 82 | ||||
-rw-r--r-- | tests/string.test | 26 |
11 files changed, 182 insertions, 40 deletions
diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3 index 61490ed..20828e4 100644 --- a/doc/UniCharIsAlpha.3 +++ b/doc/UniCharIsAlpha.3 @@ -8,7 +8,7 @@ .so man.macros .BS .SH NAME -Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters +Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsUnicode, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters .SH SYNOPSIS .nf \fB#include <tcl.h>\fR @@ -44,6 +44,9 @@ int \fBTcl_UniCharIsUpper\fR(\fIch\fR) .sp int +\fBTcl_UniCharIsUnicode\fR(\fIch\fR) +.sp +int \fBTcl_UniCharIsWordChar\fR(\fIch\fR) .SH ARGUMENTS .AS int ch @@ -81,6 +84,8 @@ with the various routines. .PP \fBTcl_UniCharIsUpper\fR tests if the character is an uppercase Unicode character. .PP +\fBTcl_UniCharIsUpper\fR tests if the character is a Unicode character. +.PP \fBTcl_UniCharIsWordChar\fR tests if the character is alphanumeric or a connector punctuation mark. diff --git a/generic/tcl.decls b/generic/tcl.decls index 51ece1a..03d43ce 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2412,8 +2412,8 @@ declare 652 { declare 653 { unsigned char *TclGetByteArrayFromObj(Tcl_Obj *objPtr, size_t *lengthPtr) } -declare 656 { - void TclUnusedStubEntry(void) +declare 657 { + int Tcl_UniCharIsUnicode(int ch) } # ----- BASELINE -- FOR -- 8.7.0 ----- # diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d020a93..61d1010 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1533,16 +1533,16 @@ StringIsCmd( "boolean", "dict", "digit", "double", "entier", "false", "graph", "integer", "list", "lower", "print", "punct", - "space", "true", "upper", "wideinteger", - "wordchar", "xdigit", NULL + "space", "true", "upper", "unicode", + "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL, STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, - STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, - STR_IS_WORD, STR_IS_XDIGIT + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT }; static const char *const isOptions[] = { "-strict", "-failindex", NULL @@ -1872,6 +1872,9 @@ StringIsCmd( case STR_IS_UPPER: chcomp = Tcl_UniCharIsUpper; break; + case STR_IS_UNICODE: + chcomp = Tcl_UniCharIsUnicode; + break; case STR_IS_WORD: chcomp = Tcl_UniCharIsWordChar; break; diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c index 0bac52b..a160625 100644 --- a/generic/tclCompCmdsSZ.c +++ b/generic/tclCompCmdsSZ.c @@ -505,19 +505,19 @@ TclCompileStringIsCmd( Tcl_Token *tokenPtr = TokenAfter(parsePtr->tokenPtr); static const char *const isClasses[] = { "alnum", "alpha", "ascii", "control", - "boolean", "dict", "digit", "double", "entier", - "false", "graph", "integer", "list", - "lower", "print", "punct", "space", - "true", "upper", "wideinteger", "wordchar", - "xdigit", NULL + "boolean", "dict", "digit", "double", + "entier", "false", "graph", "integer", + "list", "lower", "print", "punct", + "space", "true", "upper", "unicode", + "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL, - STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, - STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, - STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, STR_IS_SPACE, - STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, STR_IS_WORD, - STR_IS_XDIGIT + STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, + STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, + STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT }; int t, range, allowEmpty = 0, end; InstStringClassType strClassType; @@ -609,6 +609,9 @@ TclCompileStringIsCmd( case STR_IS_UPPER: strClassType = STR_CLASS_UPPER; goto compileStrClass; + case STR_IS_UNICODE: + strClassType = STR_CLASS_UNICODE; + goto compileStrClass; case STR_IS_WORD: strClassType = STR_CLASS_WORD; goto compileStrClass; @@ -1415,6 +1418,7 @@ StringClassDesc const tclStringClassTable[] = { {"upper", Tcl_UniCharIsUpper}, {"word", Tcl_UniCharIsWordChar}, {"xdigit", UniCharIsHexDigit}, + {"unicode", Tcl_UniCharIsUnicode}, {"", NULL} }; diff --git a/generic/tclCompile.h b/generic/tclCompile.h index 21a27f7..6a5faaf 100644 --- a/generic/tclCompile.h +++ b/generic/tclCompile.h @@ -922,8 +922,9 @@ typedef enum InstStringClassType { STR_CLASS_UPPER, /* Unicode upper-case alphabet characters. */ STR_CLASS_WORD, /* Unicode word (alphabetic, digit, connector * punctuation) characters. */ - STR_CLASS_XDIGIT /* Characters that can be used as digits in + STR_CLASS_XDIGIT, /* Characters that can be used as digits in * hexadecimal numbers ([0-9A-Fa-f]). */ + STR_CLASS_UNICODE /* Unicode characters. */ } InstStringClassType; typedef struct StringClassDesc { diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 4b91817..bb31355 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1933,8 +1933,9 @@ EXTERN unsigned char * TclGetByteArrayFromObj(Tcl_Obj *objPtr, size_t *lengthPtr); /* Slot 654 is reserved */ /* Slot 655 is reserved */ -/* 656 */ -EXTERN void TclUnusedStubEntry(void); +/* Slot 656 is reserved */ +/* 657 */ +EXTERN int Tcl_UniCharIsUnicode(int ch); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2626,7 +2627,8 @@ typedef struct TclStubs { unsigned char * (*tclGetByteArrayFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 653 */ void (*reserved654)(void); void (*reserved655)(void); - void (*tclUnusedStubEntry) (void); /* 656 */ + void (*reserved656)(void); + int (*tcl_UniCharIsUnicode) (int ch); /* 657 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -3965,8 +3967,9 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tclGetByteArrayFromObj) /* 653 */ /* Slot 654 is reserved */ /* Slot 655 is reserved */ -#define TclUnusedStubEntry \ - (tclStubsPtr->tclUnusedStubEntry) /* 656 */ +/* Slot 656 is reserved */ +#define Tcl_UniCharIsUnicode \ + (tclStubsPtr->tcl_UniCharIsUnicode) /* 657 */ #endif /* defined(USE_TCL_STUBS) */ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index c4ef159..d994f10 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -517,6 +517,9 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ +#define FLAG_LE 1 +#define FLAG_WTF 2 + void TclInitEncodingSubsystem(void) { @@ -559,13 +562,16 @@ TclInitEncodingSubsystem(void) type.nullSize = 1; type.clientData = NULL; Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(FLAG_WTF); + type.encodingName = "wtf-8"; + Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUcs2Proc; type.freeProc = NULL; type.nullSize = 2; type.encodingName = "ucs-2le"; - type.clientData = INT2PTR(1); + type.clientData = INT2PTR(FLAG_LE); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2be"; type.clientData = INT2PTR(0); @@ -579,7 +585,7 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "utf-16le"; - type.clientData = INT2PTR(1); + type.clientData = INT2PTR(FLAG_LE); Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; type.clientData = INT2PTR(0); @@ -2271,7 +2277,7 @@ UtfExtToUtfIntProc( static int UtfToUtfProc( - TCL_UNUSED(ClientData), + void *clientData, /* flags */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2302,6 +2308,7 @@ UtfToUtfProc( const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; + int encflags = PTR2INT(clientData); int *chPtr = (int *) statePtr; if (flags & TCL_ENCODING_START) { @@ -2370,6 +2377,9 @@ UtfToUtfProc( int low = *chPtr; size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) { + if ((pureNullMode == 1) && !(encflags & FLAG_WTF)) { + *chPtr = 0xFFFD; + } *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); *dst++ = (char) ((*chPtr | 0x80) & 0xBF); @@ -2378,6 +2388,9 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(*chPtr, dst); *chPtr = low; + } else if ((pureNullMode == 1) && !(encflags & FLAG_WTF) + && !Tcl_UniCharIsUnicode(*chPtr)) { + *chPtr = 0xFFFD; } dst += Tcl_UniCharToUtf(*chPtr, dst); } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index fa5562d..cd1ce81 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -1905,7 +1905,8 @@ const TclStubs tclStubs = { TclGetByteArrayFromObj, /* 653 */ 0, /* 654 */ 0, /* 655 */ - TclUnusedStubEntry, /* 656 */ + 0, /* 656 */ + Tcl_UniCharIsUnicode, /* 657 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e096c06..2687a1d 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2187,6 +2187,36 @@ Tcl_UniCharIsUpper( /* *---------------------------------------------------------------------- * + * Tcl_UniCharIsUnicode -- + * + * Test if a character is a Unicode character. + * + * Results: + * Returns non-zero if character belongs to the Unicode set. + * + * Excluded are: + * 1) All characters > U+10FFFF + * 2) Surrogates U+D800 - U+DFFF + * 3) Last 2 characters of each plane, so U+??FFFE and U+??FFFF + * 4) The characters in the range U+FDD0 - U+FDEF + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsUnicode( + int ch) /* Unicode character to test. */ +{ + return ((unsigned int)ch <= 0x10FFFF) && ((ch & 0xFFF800) != 0xD800) + && ((ch & 0xFFFE) != 0xFFFE) && ((unsigned int)(ch - 0xFDD0) >= 32); +} + +/* + *---------------------------------------------------------------------- + * * Tcl_UniCharIsWordChar -- * * Test if a character is alphanumeric or a connector punctuation mark. diff --git a/tests/encoding.test b/tests/encoding.test index b1150c6..76b830d 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -340,61 +340,61 @@ test encoding-15.5 {UtfToUtfProc emoji character input} { } "4 😂" test encoding-15.6 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uDE02\uD83D - set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D] + set y [encoding convertto wtf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z } {10 edb882f09f9882eda0bd} test encoding-15.7 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uD83D - set y [encoding convertto utf-8 \uDE02\uD83D\uD83D] + set y [encoding convertto wtf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z } {3 9 edb882eda0bdeda0bd} test encoding-15.8 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\xE9 - set y [encoding convertto utf-8 \uDE02\uD83D\xE9] + set y [encoding convertto wtf-8 \uDE02\uD83D\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {3 8 edb882eda0bdc3a9} test encoding-15.9 {UtfToUtfProc emoji character output} { set x \uDE02\uD83DX - set y [encoding convertto utf-8 \uDE02\uD83DX] + set y [encoding convertto wtf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z } {3 7 edb882eda0bd58} test encoding-15.10 {UtfToUtfProc high surrogate character output} { set x \uDE02\xE9 - set y [encoding convertto utf-8 \uDE02\xE9] + set y [encoding convertto wtf-8 \uDE02\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 edb882c3a9} test encoding-15.11 {UtfToUtfProc low surrogate character output} { set x \uDA02\xE9 - set y [encoding convertto utf-8 \uDA02\xE9] + set y [encoding convertto wtf-8 \uDA02\xE9] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 eda882c3a9} test encoding-15.12 {UtfToUtfProc high surrogate character output} { set x \uDE02Y - set y [encoding convertto utf-8 \uDE02Y] + set y [encoding convertto wtf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 edb88259} test encoding-15.13 {UtfToUtfProc low surrogate character output} { set x \uDA02Y - set y [encoding convertto utf-8 \uDA02Y] + set y [encoding convertto wtf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 eda88259} test encoding-15.14 {UtfToUtfProc high surrogate character output} { set x \uDE02 - set y [encoding convertto utf-8 \uDE02] + set y [encoding convertto wtf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 edb882} test encoding-15.15 {UtfToUtfProc low surrogate character output} { set x \uDA02 - set y [encoding convertto utf-8 \uDA02] + set y [encoding convertto wtf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 eda882} @@ -409,6 +409,66 @@ test encoding-15.17 {UtfToUtfProc emoji character output} { binary scan $y H* z list [string length $y] $z } {4 f09f9882} +test encoding-15.18 {UtfToUtfProc emoji character output} { + set x \uDE02\uD83D\uDE02\uD83D + set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D] + binary scan $y H* z + list [string length $y] $z +} {10 efbfbdf09f9882efbfbd} +test encoding-15.19 {UtfToUtfProc emoji character output} { + set x \uDE02\uD83D\uD83D + set y [encoding convertto utf-8 \uDE02\uD83D\uD83D] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {3 9 efbfbdefbfbdefbfbd} +test encoding-15.20 {UtfToUtfProc emoji character output} { + set x \uDE02\uD83D\xE9 + set y [encoding convertto utf-8 \uDE02\uD83D\xE9] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {3 8 efbfbdefbfbdc3a9} +test encoding-15.21 {UtfToUtfProc emoji character output} { + set x \uDE02\uD83DX + set y [encoding convertto utf-8 \uDE02\uD83DX] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {3 7 efbfbdefbfbd58} +test encoding-15.22 {UtfToUtfProc high surrogate character output} { + set x \uDE02\xE9 + set y [encoding convertto utf-8 \uDE02\xE9] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 5 efbfbdc3a9} +test encoding-15.23 {UtfToUtfProc low surrogate character output} { + set x \uDA02\xE9 + set y [encoding convertto utf-8 \uDA02\xE9] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 5 efbfbdc3a9} +test encoding-15.24 {UtfToUtfProc high surrogate character output} { + set x \uDE02Y + set y [encoding convertto utf-8 \uDE02Y] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 4 efbfbd59} +test encoding-15.25 {UtfToUtfProc low surrogate character output} { + set x \uDA02Y + set y [encoding convertto utf-8 \uDA02Y] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {2 4 efbfbd59} +test encoding-15.26 {UtfToUtfProc high surrogate character output} { + set x \uDE02 + set y [encoding convertto utf-8 \uDE02] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {1 3 efbfbd} +test encoding-15.27 {UtfToUtfProc low surrogate character output} { + set x \uDA02 + set y [encoding convertto utf-8 \uDA02] + binary scan $y H* z + list [string length $x] [string length $y] $z +} {1 3 efbfbd} test encoding-16.1 {Utf16ToUtfProc} -body { set val [encoding convertfrom utf-16 NN] @@ -742,7 +802,7 @@ test encoding-28.0 {all encodings load} -body { llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 86 : 85}] +} -result [expr {[info exists ::tcl_precision] ? 87 : 86}] runtests diff --git a/tests/string.test b/tests/string.test index 0eaa3da..b3b278c 100644 --- a/tests/string.test +++ b/tests/string.test @@ -525,10 +525,10 @@ test string-6.4.$noComp {string is, too many args} { } {1 {wrong # args: should be "string is class ?-strict? ?-failindex var? str"}} test string-6.5.$noComp {string is, class check} { list [catch {run {string is bogus str}} msg] $msg -} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}} +} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}} test string-6.6.$noComp {string is, ambiguous class} { list [catch {run {string is al str}} msg] $msg -} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}} +} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}} test string-6.7.$noComp {string is alpha, all ok} { run {string is alpha -strict -failindex var abc} } 1 @@ -961,6 +961,28 @@ test string-6.130.1.$noComp {string is entier, false on bad octal} { test string-6.131.$noComp {string is entier, false on bad hex} { list [run {string is entier -fail var 0X12345611234123456123456562345612345612345612345612345612345612345612345612345612345345XYZ}] $var } {0 88} +test string-6.132.$noComp {string is unicode} { + run {string is unicode \U10FFFD\uD7FF\uE000\uFDCF\uFDF0} +} 1 +test string-6.133.$noComp {string is unicode, upper surrogate} { + run {string is unicode \uD800} +} 0 +test string-6.134.$noComp {string is unicode, lower surrogate} { + run {string is unicode \uDFFF} +} 0 +test string-6.135.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFFFE} +} 0 +test string-6.136.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFFFF} +} 0 +test string-6.137.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFDD0} +} 0 +test string-6.138.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFDEF} +} 0 + test string-7.1.$noComp {string last, not enough args} { list [catch {run {string last a}} msg] $msg |