From bbcdfd9e87e6ffd0c35950b0d6777239785d892a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 6 Feb 2024 13:01:11 +0000 Subject: Proposed fix for [80d4c1c7e5]: Isolated surrogates are not replaced when encoding to / from utf-8 --- generic/tclEncoding.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index d780299..63ebdf4 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2485,7 +2485,6 @@ UtfToUtfProc( flags |= PTR2INT(clientData); dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); - profile = ENCODING_PROFILE_GET(flags); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { @@ -2596,6 +2595,8 @@ UtfToUtfProc( result = (flags & ENCODING_INPUT) ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN; src = saveSrc; break; + } else if (PROFILE_REPLACE(profile) && SURROGATE(ch)) { + ch = UNICODE_REPLACE_CHAR; } dst += Tcl_UniCharToUtf(ch, dst); } -- cgit v0.12 From ed502a9c1fb2a4504b8de6a475fcf19e92bd6607 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Wed, 7 Feb 2024 03:06:05 +0000 Subject: Add test cases for [80d4c1c7e5] --- tests/encoding.test | 6 ++++-- tests/encodingVectors.tcl | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/encoding.test b/tests/encoding.test index 8bc096c..c27fcd3 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -881,8 +881,8 @@ test encoding-24.37 {Parse invalid utf-8 with -profile tcl8} -body { test encoding-24.38.1 {Try to generate invalid utf-8} -body { encoding convertto -profile tcl8 utf-8 \uD800 } -result \xED\xA0\x80 -test encoding-24.38.2 {Try to generate invalid utf-8} -body { - encoding convertto -profile strict utf-8 \uD800 +test encoding-24.38.2 {Try to generate invalid utf-8 - default profile} -body { + encoding convertto utf-8 \uD800 } -returnCodes 1 -result {unexpected character at index 0: 'U+00D800'} test encoding-24.39 {Try to generate invalid utf-8 with -profile strict} -body { encoding convertto -profile strict utf-8 \uD800 @@ -1176,6 +1176,8 @@ test encoding-bug-66ffafd309-2-replace {Bug [66ffafd309] - invalid DBCS} -body { encoding convertfrom -profile replace jis0208 \x78\x79 } -result \uFFFD\uFFFD + + # cleanup namespace delete ::tcl::test::encoding ::tcltest::cleanupTests diff --git a/tests/encodingVectors.tcl b/tests/encodingVectors.tcl index 8bd6b87..6583473 100644 --- a/tests/encodingVectors.tcl +++ b/tests/encodingVectors.tcl @@ -641,10 +641,12 @@ lappend encUnencodableStrings {*}{ iso8859-1 \u0141 tcl8 3f -1 {} unencodable iso8859-1 \u0141 strict {} 0 {} unencodable - utf-8 \uD800 tcl8 eda080 -1 {} High-surrogate - utf-8 \uD800 strict {} 0 {} High-surrogate + utf-8 \uD800 tcl8 eda080 -1 {} Low-surrogate + utf-8 \uD800 replace efbfbd -1 {} Low-surrogate + utf-8 \uD800 strict {} 0 {} Low-surrogate utf-8 \uDC00 tcl8 edb080 -1 {} High-surrogate utf-8 \uDC00 strict {} 0 {} High-surrogate + utf-8 \uDC00 replace efbfbd -1 {} High-surrogate } -- cgit v0.12 From f531ea653b3043bcd9a542083ee9c6e07e286cfc Mon Sep 17 00:00:00 2001 From: pooryorick Date: Wed, 7 Feb 2024 14:47:07 +0000 Subject: Implementation of TIP 652. --- doc/UniCharIsAlpha.3 | 8 +------- generic/tcl.decls | 8 +++++--- generic/tclCmdMZ.c | 7 ++----- generic/tclCompCmdsSZ.c | 8 ++------ generic/tclCompile.h | 1 - generic/tclStubInit.c | 2 +- generic/tclUtf.c | 30 ------------------------------ tests/string.test | 25 ++----------------------- 8 files changed, 13 insertions(+), 76 deletions(-) diff --git a/doc/UniCharIsAlpha.3 b/doc/UniCharIsAlpha.3 index 4ae4612..2b8c0d7 100644 --- a/doc/UniCharIsAlpha.3 +++ b/doc/UniCharIsAlpha.3 @@ -8,7 +8,7 @@ .so man.macros .BS .SH NAME -Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsUnicode, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters +Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters .SH SYNOPSIS .nf \fB#include \fR @@ -44,9 +44,6 @@ int \fBTcl_UniCharIsUpper\fR(\fIch\fR) .sp int -\fBTcl_UniCharIsUnicode\fR(\fIch\fR) -.sp -int \fBTcl_UniCharIsWordChar\fR(\fIch\fR) .fi .SH ARGUMENTS @@ -91,9 +88,6 @@ character. \fBTcl_UniCharIsUpper\fR tests if the character is an uppercase Unicode character. .PP -\fBTcl_UniCharIsUnicode\fR tests if the character is a Unicode character, -not being a surrogate or noncharacter. -.PP \fBTcl_UniCharIsWordChar\fR tests if the character is alphanumeric or a connector punctuation mark. diff --git a/generic/tcl.decls b/generic/tcl.decls index 8e047d0..b8e1e1f 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2499,9 +2499,11 @@ declare 655 { declare 656 { const char *Tcl_UtfPrev(const char *src, const char *start) } -declare 657 { - int Tcl_UniCharIsUnicode(int ch) -} +# Removed by TIP #652 +# +#declare 657 { +# int Tcl_UniCharIsUnicode(int ch) +#} # TIP 656 declare 658 { diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 38e04cb..2660ff1 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1534,7 +1534,7 @@ StringIsCmd( "boolean", "dict", "digit", "double", "entier", "false", "graph", "integer", "list", "lower", "print", "punct", - "space", "true", "upper", "unicode", + "space", "true", "upper", "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { @@ -1542,7 +1542,7 @@ StringIsCmd( STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, - STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT } index; static const char *const isOptions[] = { @@ -1871,9 +1871,6 @@ StringIsCmd( case STR_IS_UPPER: chcomp = Tcl_UniCharIsUpper; break; - case STR_IS_UNICODE: - chcomp = Tcl_UniCharIsUnicode; - break; case STR_IS_WORD: chcomp = Tcl_UniCharIsWordChar; break; diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c index 0a21226..d79b7b9 100644 --- a/generic/tclCompCmdsSZ.c +++ b/generic/tclCompCmdsSZ.c @@ -508,7 +508,7 @@ TclCompileStringIsCmd( "boolean", "dict", "digit", "double", "entier", "false", "graph", "integer", "list", "lower", "print", "punct", - "space", "true", "upper", "unicode", + "space", "true", "upper", "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { @@ -516,7 +516,7 @@ TclCompileStringIsCmd( STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, - STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT } t; int range, allowEmpty = 0, end; @@ -609,9 +609,6 @@ TclCompileStringIsCmd( case STR_IS_UPPER: strClassType = STR_CLASS_UPPER; goto compileStrClass; - case STR_IS_UNICODE: - strClassType = STR_CLASS_UNICODE; - goto compileStrClass; case STR_IS_WORD: strClassType = STR_CLASS_WORD; goto compileStrClass; @@ -1423,7 +1420,6 @@ StringClassDesc const tclStringClassTable[] = { {"upper", Tcl_UniCharIsUpper}, {"word", Tcl_UniCharIsWordChar}, {"xdigit", UniCharIsHexDigit}, - {"unicode", Tcl_UniCharIsUnicode}, {"", NULL} }; diff --git a/generic/tclCompile.h b/generic/tclCompile.h index 2ea2565..5bbbb8f 100644 --- a/generic/tclCompile.h +++ b/generic/tclCompile.h @@ -918,7 +918,6 @@ typedef enum InstStringClassType { * punctuation) characters. */ STR_CLASS_XDIGIT, /* Characters that can be used as digits in * hexadecimal numbers ([0-9A-Fa-f]). */ - STR_CLASS_UNICODE /* Unicode characters. */ } InstStringClassType; typedef struct StringClassDesc { diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 34e8c27..9072796 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -1452,7 +1452,7 @@ const TclStubs tclStubs = { Tcl_UtfCharComplete, /* 654 */ Tcl_UtfNext, /* 655 */ Tcl_UtfPrev, /* 656 */ - Tcl_UniCharIsUnicode, /* 657 */ + 0, /* 657 */ Tcl_ExternalToUtfDStringEx, /* 658 */ Tcl_UtfToExternalDStringEx, /* 659 */ Tcl_AsyncMarkFromSignal, /* 660 */ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index ca4a166..9888772 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2283,36 +2283,6 @@ Tcl_UniCharIsUpper( /* *---------------------------------------------------------------------- * - * Tcl_UniCharIsUnicode -- - * - * Test if a character is a Unicode character. - * - * Results: - * Returns non-zero if character belongs to the Unicode set. - * - * Excluded are: - * 1) All characters > U+10FFFF - * 2) Surrogates U+D800 - U+DFFF - * 3) Last 2 characters of each plane, so U+??FFFE and U+??FFFF - * 4) The characters in the range U+FDD0 - U+FDEF - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsUnicode( - int ch) /* Unicode character to test. */ -{ - return ((unsigned int)ch <= 0x10FFFF) && ((ch & 0xFFF800) != 0xD800) - && ((ch & 0xFFFE) != 0xFFFE) && ((unsigned int)(ch - 0xFDD0) >= 32); -} - -/* - *---------------------------------------------------------------------- - * * Tcl_UniCharIsWordChar -- * * Test if a character is alphanumeric or a connector punctuation mark. diff --git a/tests/string.test b/tests/string.test index a232f1e..26cd8a7 100644 --- a/tests/string.test +++ b/tests/string.test @@ -537,10 +537,10 @@ test string-6.4.$noComp {string is, too many args} { } {1 {wrong # args: should be "string is class ?-strict? ?-failindex var? str"}} test string-6.5.$noComp {string is, class check} { list [catch {run {string is bogus str}} msg] $msg -} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}} +} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}} test string-6.6.$noComp {string is, ambiguous class} { list [catch {run {string is al str}} msg] $msg -} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}} +} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}} test string-6.7.$noComp {string is alpha, all ok} { run {string is alpha -strict -failindex var abc} } 1 @@ -973,27 +973,6 @@ test string-6.130.1.$noComp {string is entier, false on bad octal} { test string-6.131.$noComp {string is entier, false on bad hex} { list [run {string is entier -fail var 0X12345611234123456123456562345612345612345612345612345612345612345612345612345612345345XYZ}] $var } {0 88} -test string-6.132.$noComp {string is unicode} { - run {string is unicode \U10FFFD\uD7FF\uE000\uFDCF\uFDF0} -} 1 -test string-6.133.$noComp {string is unicode, upper surrogate} { - run {string is unicode \uD800} -} 0 -test string-6.134.$noComp {string is unicode, lower surrogate} { - run {string is unicode \uDFFF} -} 0 -test string-6.135.$noComp {string is unicode, noncharacter} { - run {string is unicode \uFFFE} -} 0 -test string-6.136.$noComp {string is unicode, noncharacter} { - run {string is unicode \uFFFF} -} 0 -test string-6.137.$noComp {string is unicode, noncharacter} { - run {string is unicode \uFDD0} -} 0 -test string-6.138.$noComp {string is unicode, noncharacter} { - run {string is unicode \uFDEF} -} 0 test string-6.139.$noComp {string is integer, bug [76ad7aeba3]} { run {string is integer 18446744073709551615} } 1 -- cgit v0.12 From 181274a8a0e892445b3150b57fd22c904bbe0ce2 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 9 Feb 2024 14:17:42 +0000 Subject: This should be removed too --- doc/string.n | 7 ------- 1 file changed, 7 deletions(-) diff --git a/doc/string.n b/doc/string.n index 3b9af03..f07a591 100644 --- a/doc/string.n +++ b/doc/string.n @@ -181,13 +181,6 @@ zero width no-break space (U+feff) (=BOM). .IP \fBtrue\fR 12 Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is true. -.IP \fBunicode\fR 12 -Any Unicode character, except surrogates and noncharacters. -.RS -.PP -\fIWarning: this option is under discussion and may be renamed or replaced -by another solution within the Tcl 9.0 series.\fR -.RE .IP \fBupper\fR 12 Any upper case alphabet character in the Unicode character set. .IP \fBwideinteger\fR 12 -- cgit v0.12