diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-03-30 14:04:40 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-03-30 14:04:40 (GMT) |
commit | 923c7e81f475fdeabbb4276eb3ff6d31dfcf17f2 (patch) | |
tree | 5e2b6a1ba08eb5538726ec81b021e4fee09422f5 /generic | |
parent | 78a0992b4431f976641f3d08f63c13fab742e1b9 (diff) | |
parent | 269722c41046a3adc7c2d5fed46b28063ed714b0 (diff) | |
download | tcl-923c7e81f475fdeabbb4276eb3ff6d31dfcf17f2.zip tcl-923c7e81f475fdeabbb4276eb3ff6d31dfcf17f2.tar.gz tcl-923c7e81f475fdeabbb4276eb3ff6d31dfcf17f2.tar.bz2 |
Merge tip-597. More testcases
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tcl.decls | 5 | ||||
-rw-r--r-- | generic/tclCmdMZ.c | 11 | ||||
-rw-r--r-- | generic/tclCompCmdsSZ.c | 24 | ||||
-rw-r--r-- | generic/tclCompile.h | 3 | ||||
-rw-r--r-- | generic/tclDecls.h | 8 | ||||
-rw-r--r-- | generic/tclEncoding.c | 83 | ||||
-rw-r--r-- | generic/tclStubInit.c | 2 | ||||
-rw-r--r-- | generic/tclUtf.c | 30 |
8 files changed, 133 insertions, 33 deletions
diff --git a/generic/tcl.decls b/generic/tcl.decls index 0dfa415..768896b 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2423,7 +2423,9 @@ declare 655 { declare 656 { const char *Tcl_UtfPrev(const char *src, const char *start) } - +declare 657 { + int Tcl_UniCharIsUnicode(int ch) +} declare 658 { size_t Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr) @@ -2433,7 +2435,6 @@ declare 659 { const char *src, int srcLen, int flags, Tcl_DString *dsPtr) } - # ----- BASELINE -- FOR -- 8.7.0 ----- # ############################################################################## diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d020a93..61d1010 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1533,16 +1533,16 @@ StringIsCmd( "boolean", "dict", "digit", "double", "entier", "false", "graph", "integer", "list", "lower", "print", "punct", - "space", "true", "upper", "wideinteger", - "wordchar", "xdigit", NULL + "space", "true", "upper", "unicode", + "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL, STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, - STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, - STR_IS_WORD, STR_IS_XDIGIT + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT }; static const char *const isOptions[] = { "-strict", "-failindex", NULL @@ -1872,6 +1872,9 @@ StringIsCmd( case STR_IS_UPPER: chcomp = Tcl_UniCharIsUpper; break; + case STR_IS_UNICODE: + chcomp = Tcl_UniCharIsUnicode; + break; case STR_IS_WORD: chcomp = Tcl_UniCharIsWordChar; break; diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c index 0bac52b..a160625 100644 --- a/generic/tclCompCmdsSZ.c +++ b/generic/tclCompCmdsSZ.c @@ -505,19 +505,19 @@ TclCompileStringIsCmd( Tcl_Token *tokenPtr = TokenAfter(parsePtr->tokenPtr); static const char *const isClasses[] = { "alnum", "alpha", "ascii", "control", - "boolean", "dict", "digit", "double", "entier", - "false", "graph", "integer", "list", - "lower", "print", "punct", "space", - "true", "upper", "wideinteger", "wordchar", - "xdigit", NULL + "boolean", "dict", "digit", "double", + "entier", "false", "graph", "integer", + "list", "lower", "print", "punct", + "space", "true", "upper", "unicode", + "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL, - STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, - STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, - STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, STR_IS_SPACE, - STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, STR_IS_WORD, - STR_IS_XDIGIT + STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, + STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, + STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT }; int t, range, allowEmpty = 0, end; InstStringClassType strClassType; @@ -609,6 +609,9 @@ TclCompileStringIsCmd( case STR_IS_UPPER: strClassType = STR_CLASS_UPPER; goto compileStrClass; + case STR_IS_UNICODE: + strClassType = STR_CLASS_UNICODE; + goto compileStrClass; case STR_IS_WORD: strClassType = STR_CLASS_WORD; goto compileStrClass; @@ -1415,6 +1418,7 @@ StringClassDesc const tclStringClassTable[] = { {"upper", Tcl_UniCharIsUpper}, {"word", Tcl_UniCharIsWordChar}, {"xdigit", UniCharIsHexDigit}, + {"unicode", Tcl_UniCharIsUnicode}, {"", NULL} }; diff --git a/generic/tclCompile.h b/generic/tclCompile.h index 21a27f7..6a5faaf 100644 --- a/generic/tclCompile.h +++ b/generic/tclCompile.h @@ -922,8 +922,9 @@ typedef enum InstStringClassType { STR_CLASS_UPPER, /* Unicode upper-case alphabet characters. */ STR_CLASS_WORD, /* Unicode word (alphabetic, digit, connector * punctuation) characters. */ - STR_CLASS_XDIGIT /* Characters that can be used as digits in + STR_CLASS_XDIGIT, /* Characters that can be used as digits in * hexadecimal numbers ([0-9A-Fa-f]). */ + STR_CLASS_UNICODE /* Unicode characters. */ } InstStringClassType; typedef struct StringClassDesc { diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 6ee645d..8a61eb8 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1937,7 +1937,8 @@ EXTERN int Tcl_UtfCharComplete(const char *src, int length); EXTERN const char * Tcl_UtfNext(const char *src); /* 656 */ EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); -/* Slot 657 is reserved */ +/* 657 */ +EXTERN int Tcl_UniCharIsUnicode(int ch); /* 658 */ EXTERN size_t Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding, const char *src, int srcLen, int flags, @@ -2638,7 +2639,7 @@ typedef struct TclStubs { int (*tcl_UtfCharComplete) (const char *src, int length); /* 654 */ const char * (*tcl_UtfNext) (const char *src); /* 655 */ const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 656 */ - void (*reserved657)(void); + int (*tcl_UniCharIsUnicode) (int ch); /* 657 */ size_t (*tcl_ExternalToUtfDStringEx) (Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr); /* 658 */ size_t (*tcl_UtfToExternalDStringEx) (Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr); /* 659 */ } TclStubs; @@ -3983,7 +3984,8 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UtfNext) /* 655 */ #define Tcl_UtfPrev \ (tclStubsPtr->tcl_UtfPrev) /* 656 */ -/* Slot 657 is reserved */ +#define Tcl_UniCharIsUnicode \ + (tclStubsPtr->tcl_UniCharIsUnicode) /* 657 */ #define Tcl_ExternalToUtfDStringEx \ (tclStubsPtr->tcl_ExternalToUtfDStringEx) /* 658 */ #define Tcl_UtfToExternalDStringEx \ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 0bce51b..e68aa25 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -512,6 +512,8 @@ FillEncodingFileMap(void) /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ #define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */ +#define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */ +#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ void TclInitEncodingSubsystem(void) @@ -553,7 +555,13 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = NULL; + type.clientData = INT2PTR(TCL_ENCODING_UTF); + Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF); + type.encodingName = "wtf-8"; + Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(0); + type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; @@ -577,12 +585,21 @@ TclInitEncodingSubsystem(void) type.encodingName = "utf-16le"; type.clientData = INT2PTR(TCL_ENCODING_LE); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16le"; + type.clientData = INT2PTR(TCL_ENCODING_LE + TCL_ENCODING_WTF); + Tcl_CreateEncoding(&type); type.encodingName = "utf-16be"; type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16be"; + type.clientData = INT2PTR(TCL_ENCODING_WTF); + Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); + type.encodingName = "wtf-16"; + type.clientData = INT2PTR(isLe.c + TCL_ENCODING_WTF); + Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED type.encodingName = "unicode"; @@ -1153,7 +1170,7 @@ Tcl_ExternalToUtfDStringEx( srcLen = encodingPtr->lengthProc(src); } - flags |= TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; while (1) { result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, @@ -1268,12 +1285,12 @@ Tcl_ExternalToUtf( dstLen--; } - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; do { Tcl_EncodingState savedState = *statePtr; result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, - flags , statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, + flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); if (*dstCharsPtr <= maxChars) { break; @@ -2200,7 +2217,7 @@ BinaryProc( static int UtfToUtfProc( - TCL_UNUSED(ClientData), + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2239,7 +2256,8 @@ UtfToUtfProc( } dstStart = dst; - dstEnd = dst + dstLen - TCL_UTF_MAX; + flags |= PTR2INT(clientData); + dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { @@ -2286,21 +2304,44 @@ UtfToUtfProc( src += 1; dst += Tcl_UniCharToUtf(ch, dst); } else { + int low; + const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) { result = TCL_CONVERT_SYNTAX; break; } src += len; - if ((ch | 0x7FF) == 0xDFFF) { + if (!(flags & TCL_ENCODING_UTF)) { + if (ch > 0xFFFF) { + /* CESU-8 6-byte sequence for chars > U+FFFF */ + ch -= 0x10000; + *dst++ = 0xED; + *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); + *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); + ch = (ch & 0x0CFF) | 0xDC00; + } + goto cesu8; + } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ - int low = ch; + low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { + if (!(flags & TCL_ENCODING_WTF)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; + } + } + cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); *dst++ = (char) ((ch | 0x80) & 0xBF); @@ -2309,6 +2350,15 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; + } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; + } } dst += Tcl_UniCharToUtf(ch, dst); } @@ -2444,7 +2494,7 @@ Utf16ToUtfProc( static int UtfToUtf16Proc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2467,7 +2517,7 @@ UtfToUtf16Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - int ch; + int ch, len; srcStart = src; srcEnd = src + srcLen; @@ -2478,6 +2528,7 @@ UtfToUtf16Proc( dstStart = dst; dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + flags |= PTR2INT(clientData); result = TCL_OK; for (numChars = 0; src < srcEnd; numChars++) { @@ -2494,8 +2545,16 @@ UtfToUtf16Proc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUCS4(src, &ch); - if (clientData) { + len = TclUtfToUCS4(src, &ch); + if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } + ch = 0xFFFD; + } + src += len; + if (flags & TCL_ENCODING_LE) { if (ch <= 0xFFFF) { *dst++ = (ch & 0xFF); *dst++ = (ch >> 8); diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 54ab4b6..ba71f3e 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -1932,7 +1932,7 @@ const TclStubs tclStubs = { Tcl_UtfCharComplete, /* 654 */ Tcl_UtfNext, /* 655 */ Tcl_UtfPrev, /* 656 */ - 0, /* 657 */ + Tcl_UniCharIsUnicode, /* 657 */ Tcl_ExternalToUtfDStringEx, /* 658 */ Tcl_UtfToExternalDStringEx, /* 659 */ }; diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 28f725a..fcdf80a 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2182,6 +2182,36 @@ Tcl_UniCharIsUpper( /* *---------------------------------------------------------------------- * + * Tcl_UniCharIsUnicode -- + * + * Test if a character is a Unicode character. + * + * Results: + * Returns non-zero if character belongs to the Unicode set. + * + * Excluded are: + * 1) All characters > U+10FFFF + * 2) Surrogates U+D800 - U+DFFF + * 3) Last 2 characters of each plane, so U+??FFFE and U+??FFFF + * 4) The characters in the range U+FDD0 - U+FDEF + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsUnicode( + int ch) /* Unicode character to test. */ +{ + return ((unsigned int)ch <= 0x10FFFF) && ((ch & 0xFFF800) != 0xD800) + && ((ch & 0xFFFE) != 0xFFFE) && ((unsigned int)(ch - 0xFDD0) >= 32); +} + +/* + *---------------------------------------------------------------------- + * * Tcl_UniCharIsWordChar -- * * Test if a character is alphanumeric or a connector punctuation mark. |