summaryrefslogtreecommitdiffstats
path: root/generic
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2021-03-30 14:04:40 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2021-03-30 14:04:40 (GMT)
commit923c7e81f475fdeabbb4276eb3ff6d31dfcf17f2 (patch)
tree5e2b6a1ba08eb5538726ec81b021e4fee09422f5 /generic
parent78a0992b4431f976641f3d08f63c13fab742e1b9 (diff)
parent269722c41046a3adc7c2d5fed46b28063ed714b0 (diff)
downloadtcl-923c7e81f475fdeabbb4276eb3ff6d31dfcf17f2.zip
tcl-923c7e81f475fdeabbb4276eb3ff6d31dfcf17f2.tar.gz
tcl-923c7e81f475fdeabbb4276eb3ff6d31dfcf17f2.tar.bz2
Merge tip-597. More testcases
Diffstat (limited to 'generic')
-rw-r--r--generic/tcl.decls5
-rw-r--r--generic/tclCmdMZ.c11
-rw-r--r--generic/tclCompCmdsSZ.c24
-rw-r--r--generic/tclCompile.h3
-rw-r--r--generic/tclDecls.h8
-rw-r--r--generic/tclEncoding.c83
-rw-r--r--generic/tclStubInit.c2
-rw-r--r--generic/tclUtf.c30
8 files changed, 133 insertions, 33 deletions
diff --git a/generic/tcl.decls b/generic/tcl.decls
index 0dfa415..768896b 100644
--- a/generic/tcl.decls
+++ b/generic/tcl.decls
@@ -2423,7 +2423,9 @@ declare 655 {
declare 656 {
const char *Tcl_UtfPrev(const char *src, const char *start)
}
-
+declare 657 {
+ int Tcl_UniCharIsUnicode(int ch)
+}
declare 658 {
size_t Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding,
const char *src, int srcLen, int flags, Tcl_DString *dsPtr)
@@ -2433,7 +2435,6 @@ declare 659 {
const char *src, int srcLen, int flags, Tcl_DString *dsPtr)
}
-
# ----- BASELINE -- FOR -- 8.7.0 ----- #
##############################################################################
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index d020a93..61d1010 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1533,16 +1533,16 @@ StringIsCmd(
"boolean", "dict", "digit", "double",
"entier", "false", "graph", "integer",
"list", "lower", "print", "punct",
- "space", "true", "upper", "wideinteger",
- "wordchar", "xdigit", NULL
+ "space", "true", "upper", "unicode",
+ "wideinteger", "wordchar", "xdigit", NULL
};
enum isClassesEnum {
STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL,
STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE,
STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT,
STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT,
- STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE,
- STR_IS_WORD, STR_IS_XDIGIT
+ STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE,
+ STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT
};
static const char *const isOptions[] = {
"-strict", "-failindex", NULL
@@ -1872,6 +1872,9 @@ StringIsCmd(
case STR_IS_UPPER:
chcomp = Tcl_UniCharIsUpper;
break;
+ case STR_IS_UNICODE:
+ chcomp = Tcl_UniCharIsUnicode;
+ break;
case STR_IS_WORD:
chcomp = Tcl_UniCharIsWordChar;
break;
diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c
index 0bac52b..a160625 100644
--- a/generic/tclCompCmdsSZ.c
+++ b/generic/tclCompCmdsSZ.c
@@ -505,19 +505,19 @@ TclCompileStringIsCmd(
Tcl_Token *tokenPtr = TokenAfter(parsePtr->tokenPtr);
static const char *const isClasses[] = {
"alnum", "alpha", "ascii", "control",
- "boolean", "dict", "digit", "double", "entier",
- "false", "graph", "integer", "list",
- "lower", "print", "punct", "space",
- "true", "upper", "wideinteger", "wordchar",
- "xdigit", NULL
+ "boolean", "dict", "digit", "double",
+ "entier", "false", "graph", "integer",
+ "list", "lower", "print", "punct",
+ "space", "true", "upper", "unicode",
+ "wideinteger", "wordchar", "xdigit", NULL
};
enum isClassesEnum {
STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL,
- STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER,
- STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST,
- STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, STR_IS_SPACE,
- STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, STR_IS_WORD,
- STR_IS_XDIGIT
+ STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE,
+ STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT,
+ STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT,
+ STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE,
+ STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT
};
int t, range, allowEmpty = 0, end;
InstStringClassType strClassType;
@@ -609,6 +609,9 @@ TclCompileStringIsCmd(
case STR_IS_UPPER:
strClassType = STR_CLASS_UPPER;
goto compileStrClass;
+ case STR_IS_UNICODE:
+ strClassType = STR_CLASS_UNICODE;
+ goto compileStrClass;
case STR_IS_WORD:
strClassType = STR_CLASS_WORD;
goto compileStrClass;
@@ -1415,6 +1418,7 @@ StringClassDesc const tclStringClassTable[] = {
{"upper", Tcl_UniCharIsUpper},
{"word", Tcl_UniCharIsWordChar},
{"xdigit", UniCharIsHexDigit},
+ {"unicode", Tcl_UniCharIsUnicode},
{"", NULL}
};
diff --git a/generic/tclCompile.h b/generic/tclCompile.h
index 21a27f7..6a5faaf 100644
--- a/generic/tclCompile.h
+++ b/generic/tclCompile.h
@@ -922,8 +922,9 @@ typedef enum InstStringClassType {
STR_CLASS_UPPER, /* Unicode upper-case alphabet characters. */
STR_CLASS_WORD, /* Unicode word (alphabetic, digit, connector
* punctuation) characters. */
- STR_CLASS_XDIGIT /* Characters that can be used as digits in
+ STR_CLASS_XDIGIT, /* Characters that can be used as digits in
* hexadecimal numbers ([0-9A-Fa-f]). */
+ STR_CLASS_UNICODE /* Unicode characters. */
} InstStringClassType;
typedef struct StringClassDesc {
diff --git a/generic/tclDecls.h b/generic/tclDecls.h
index 6ee645d..8a61eb8 100644
--- a/generic/tclDecls.h
+++ b/generic/tclDecls.h
@@ -1937,7 +1937,8 @@ EXTERN int Tcl_UtfCharComplete(const char *src, int length);
EXTERN const char * Tcl_UtfNext(const char *src);
/* 656 */
EXTERN const char * Tcl_UtfPrev(const char *src, const char *start);
-/* Slot 657 is reserved */
+/* 657 */
+EXTERN int Tcl_UniCharIsUnicode(int ch);
/* 658 */
EXTERN size_t Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding,
const char *src, int srcLen, int flags,
@@ -2638,7 +2639,7 @@ typedef struct TclStubs {
int (*tcl_UtfCharComplete) (const char *src, int length); /* 654 */
const char * (*tcl_UtfNext) (const char *src); /* 655 */
const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 656 */
- void (*reserved657)(void);
+ int (*tcl_UniCharIsUnicode) (int ch); /* 657 */
size_t (*tcl_ExternalToUtfDStringEx) (Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr); /* 658 */
size_t (*tcl_UtfToExternalDStringEx) (Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr); /* 659 */
} TclStubs;
@@ -3983,7 +3984,8 @@ extern const TclStubs *tclStubsPtr;
(tclStubsPtr->tcl_UtfNext) /* 655 */
#define Tcl_UtfPrev \
(tclStubsPtr->tcl_UtfPrev) /* 656 */
-/* Slot 657 is reserved */
+#define Tcl_UniCharIsUnicode \
+ (tclStubsPtr->tcl_UniCharIsUnicode) /* 657 */
#define Tcl_ExternalToUtfDStringEx \
(tclStubsPtr->tcl_ExternalToUtfDStringEx) /* 658 */
#define Tcl_UtfToExternalDStringEx \
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 0bce51b..e68aa25 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -512,6 +512,8 @@ FillEncodingFileMap(void)
/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
#define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */
+#define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */
+#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
void
TclInitEncodingSubsystem(void)
@@ -553,7 +555,13 @@ TclInitEncodingSubsystem(void)
type.fromUtfProc = UtfToUtfProc;
type.freeProc = NULL;
type.nullSize = 1;
- type.clientData = NULL;
+ type.clientData = INT2PTR(TCL_ENCODING_UTF);
+ Tcl_CreateEncoding(&type);
+ type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF);
+ type.encodingName = "wtf-8";
+ Tcl_CreateEncoding(&type);
+ type.clientData = INT2PTR(0);
+ type.encodingName = "cesu-8";
Tcl_CreateEncoding(&type);
type.toUtfProc = Utf16ToUtfProc;
@@ -577,12 +585,21 @@ TclInitEncodingSubsystem(void)
type.encodingName = "utf-16le";
type.clientData = INT2PTR(TCL_ENCODING_LE);
Tcl_CreateEncoding(&type);
+ type.encodingName = "wtf-16le";
+ type.clientData = INT2PTR(TCL_ENCODING_LE + TCL_ENCODING_WTF);
+ Tcl_CreateEncoding(&type);
type.encodingName = "utf-16be";
type.clientData = INT2PTR(0);
Tcl_CreateEncoding(&type);
+ type.encodingName = "wtf-16be";
+ type.clientData = INT2PTR(TCL_ENCODING_WTF);
+ Tcl_CreateEncoding(&type);
type.encodingName = "utf-16";
type.clientData = INT2PTR(isLe.c);
Tcl_CreateEncoding(&type);
+ type.encodingName = "wtf-16";
+ type.clientData = INT2PTR(isLe.c + TCL_ENCODING_WTF);
+ Tcl_CreateEncoding(&type);
#ifndef TCL_NO_DEPRECATED
type.encodingName = "unicode";
@@ -1153,7 +1170,7 @@ Tcl_ExternalToUtfDStringEx(
srcLen = encodingPtr->lengthProc(src);
}
- flags |= TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED;
+ flags |= TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
while (1) {
result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
@@ -1268,12 +1285,12 @@ Tcl_ExternalToUtf(
dstLen--;
}
- flags |= TCL_ENCODING_MODIFIED;
+ flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
do {
Tcl_EncodingState savedState = *statePtr;
result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
- flags , statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
+ flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
dstCharsPtr);
if (*dstCharsPtr <= maxChars) {
break;
@@ -2200,7 +2217,7 @@ BinaryProc(
static int
UtfToUtfProc(
- TCL_UNUSED(ClientData),
+ ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2239,7 +2256,8 @@ UtfToUtfProc(
}
dstStart = dst;
- dstEnd = dst + dstLen - TCL_UTF_MAX;
+ flags |= PTR2INT(clientData);
+ dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
@@ -2286,21 +2304,44 @@ UtfToUtfProc(
src += 1;
dst += Tcl_UniCharToUtf(ch, dst);
} else {
+ int low;
+ const char *saveSrc = src;
size_t len = TclUtfToUCS4(src, &ch);
if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) {
result = TCL_CONVERT_SYNTAX;
break;
}
src += len;
- if ((ch | 0x7FF) == 0xDFFF) {
+ if (!(flags & TCL_ENCODING_UTF)) {
+ if (ch > 0xFFFF) {
+ /* CESU-8 6-byte sequence for chars > U+FFFF */
+ ch -= 0x10000;
+ *dst++ = 0xED;
+ *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
+ *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
+ ch = (ch & 0x0CFF) | 0xDC00;
+ }
+ goto cesu8;
+ } else if ((ch | 0x7FF) == 0xDFFF) {
/*
* A surrogate character is detected, handle especially.
*/
- int low = ch;
+ low = ch;
len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
+ if (!(flags & TCL_ENCODING_WTF)) {
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ if (!(flags & TCL_ENCODING_MODIFIED)) {
+ ch = 0xFFFD;
+ }
+ }
+ cesu8:
*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
*dst++ = (char) ((ch | 0x80) & 0xBF);
@@ -2309,6 +2350,15 @@ UtfToUtfProc(
src += len;
dst += Tcl_UniCharToUtf(ch, dst);
ch = low;
+ } else if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ if (!(flags & TCL_ENCODING_MODIFIED)) {
+ ch = 0xFFFD;
+ }
}
dst += Tcl_UniCharToUtf(ch, dst);
}
@@ -2444,7 +2494,7 @@ Utf16ToUtfProc(
static int
UtfToUtf16Proc(
- ClientData clientData, /* != NULL means LE, == NUL means BE */
+ ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* Conversion control flags. */
@@ -2467,7 +2517,7 @@ UtfToUtf16Proc(
{
const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
int result, numChars;
- int ch;
+ int ch, len;
srcStart = src;
srcEnd = src + srcLen;
@@ -2478,6 +2528,7 @@ UtfToUtf16Proc(
dstStart = dst;
dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
+ flags |= PTR2INT(clientData);
result = TCL_OK;
for (numChars = 0; src < srcEnd; numChars++) {
@@ -2494,8 +2545,16 @@ UtfToUtf16Proc(
result = TCL_CONVERT_NOSPACE;
break;
}
- src += TclUtfToUCS4(src, &ch);
- if (clientData) {
+ len = TclUtfToUCS4(src, &ch);
+ if (!(flags & TCL_ENCODING_WTF) && !Tcl_UniCharIsUnicode(ch)) {
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ }
+ ch = 0xFFFD;
+ }
+ src += len;
+ if (flags & TCL_ENCODING_LE) {
if (ch <= 0xFFFF) {
*dst++ = (ch & 0xFF);
*dst++ = (ch >> 8);
diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c
index 54ab4b6..ba71f3e 100644
--- a/generic/tclStubInit.c
+++ b/generic/tclStubInit.c
@@ -1932,7 +1932,7 @@ const TclStubs tclStubs = {
Tcl_UtfCharComplete, /* 654 */
Tcl_UtfNext, /* 655 */
Tcl_UtfPrev, /* 656 */
- 0, /* 657 */
+ Tcl_UniCharIsUnicode, /* 657 */
Tcl_ExternalToUtfDStringEx, /* 658 */
Tcl_UtfToExternalDStringEx, /* 659 */
};
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 28f725a..fcdf80a 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2182,6 +2182,36 @@ Tcl_UniCharIsUpper(
/*
*----------------------------------------------------------------------
*
+ * Tcl_UniCharIsUnicode --
+ *
+ * Test if a character is a Unicode character.
+ *
+ * Results:
+ * Returns non-zero if character belongs to the Unicode set.
+ *
+ * Excluded are:
+ * 1) All characters > U+10FFFF
+ * 2) Surrogates U+D800 - U+DFFF
+ * 3) Last 2 characters of each plane, so U+??FFFE and U+??FFFF
+ * 4) The characters in the range U+FDD0 - U+FDEF
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsUnicode(
+ int ch) /* Unicode character to test. */
+{
+ return ((unsigned int)ch <= 0x10FFFF) && ((ch & 0xFFF800) != 0xD800)
+ && ((ch & 0xFFFE) != 0xFFFE) && ((unsigned int)(ch - 0xFDD0) >= 32);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
* Tcl_UniCharIsWordChar --
*
* Test if a character is alphanumeric or a connector punctuation mark.