From 4a8b2cf10f8ebcebaa9d3546f3399d3d9a8aa00e Mon Sep 17 00:00:00 2001 From: pooryorick Date: Wed, 28 Dec 2022 12:07:18 +0000 Subject: A better fix for [b8f575aa23], as it maintains the expectation that synchronous [read] results in an error when invalid data is encountered. someone other than pooryorick: Pushed this check-in back on to a review branch. It needs more baking/review. As is, it makes two tests fail, and it introduces a new element "-result" to the return options dictionary. --- generic/tclIO.c | 13 +++++++++++-- generic/tclIOCmd.c | 10 +++++++++- tests/io.test | 29 ++++++++++++++++------------- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index e6e3560..63b9a7d 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -6041,7 +6041,7 @@ DoReadChars( assert(statePtr->inputEncodingFlags & TCL_ENCODING_END); assert(!GotFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR)); - /* TODO: We don't need this call? */ + /* TODO: UpdateInterest isn't needed here? */ UpdateInterest(chanPtr); return 0; } @@ -6055,7 +6055,7 @@ DoReadChars( } ResetFlag(statePtr, CHANNEL_BLOCKED|CHANNEL_EOF); statePtr->inputEncodingFlags &= ~TCL_ENCODING_END; - /* TODO: We don't need this call? */ + /* TODO: UpdateInterest isn't needed here? */ UpdateInterest(chanPtr); return 0; } @@ -6084,6 +6084,15 @@ DoReadChars( } else { copiedNow = ReadChars(statePtr, objPtr, toRead, &factor); } + if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR) && !GotFlag(statePtr, CHANNEL_NONBLOCKING)) { + /* Channel is Synchronous. Return an error so that [read] and + * friends can return an error + */ + TclChannelRelease((Tcl_Channel)chanPtr); + UpdateInterest(chanPtr); + Tcl_SetErrno(EILSEQ); + return -1; + } /* * If the current buffer is empty recycle it. diff --git a/generic/tclIOCmd.c b/generic/tclIOCmd.c index e8a534f..8794365 100644 --- a/generic/tclIOCmd.c +++ b/generic/tclIOCmd.c @@ -381,7 +381,7 @@ Tcl_ReadObjCmd( int toRead; /* How many bytes to read? 
*/ int charactersRead; /* How many characters were read? */ int mode; /* Mode in which channel is opened. */ - Tcl_Obj *resultPtr, *chanObjPtr; + Tcl_Obj *resultPtr, *returnOptsPtr, *chanObjPtr; if ((objc != 2) && (objc != 3)) { Interp *iPtr; @@ -470,8 +470,16 @@ Tcl_ReadObjCmd( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } + returnOptsPtr = Tcl_NewDictObj(); + Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-code", -1) + , Tcl_NewStringObj("error", -1)); + Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-level", -1) + , Tcl_NewIntObj(0)); + Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) + , resultPtr); TclChannelRelease(chan); Tcl_DecrRefCount(resultPtr); + Tcl_SetReturnOptions(interp, returnOptsPtr); return TCL_ERROR; } diff --git a/tests/io.test b/tests/io.test index d10e1e4..451a790 100644 --- a/tests/io.test +++ b/tests/io.test @@ -9056,12 +9056,12 @@ test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -s seek $f 0 fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 } -body { - set d [read $f] + set status [catch {read $f} cres copts] + set d [dict get $copts -result] binary scan $d H* hd - lappend hd [catch {read $f} msg] - close $f - lappend hd $msg + lappend hd $status $cres } -cleanup { + close $f removeFile io-75.6 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} @@ -9075,11 +9075,12 @@ test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { seek $f 0 fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 } -body { - set d [read $f] + set status [catch {read $f} cres copts] + set d [dict get $copts -result] binary scan $d H* hd lappend hd [eof $f] - lappend hd [catch {read $f} msg] - lappend hd $msg + lappend hd $status + lappend hd $cres fconfigure $f -encoding iso8859-1 lappend hd [read $f];# We changed encoding, so now we can read the 
\xA1 close $f @@ -9157,10 +9158,11 @@ test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { seek $f 0 fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -strictencoding 1 } -body { - set d [read $f] + set status [catch {read $f} cres copts] + set d [dict get $copts -result] binary scan $d H* hd - lappend hd [catch {set d [read $f]} msg] - lappend hd $msg + lappend hd $status + lappend hd $cres } -cleanup { close $f removeFile io-75.11 @@ -9192,11 +9194,12 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} - seek $f 0 fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 } -body { - set d [read $f] + set status [catch {read $f} cres copts] + set d [dict get $copts -result] binary scan $d H* hd - lappend hd [catch {read $f} msg] + lappend hd $status close $f - lappend hd $msg + lappend hd $cres } -cleanup { removeFile io-75.13 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} -- cgit v0.12 From f2cc84c99a732dbde0a6845d0809443e43276d17 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Wed, 28 Dec 2022 22:46:37 +0000 Subject: Update fix so that the two failing tests, iocmd-23.8 and iortrans-4.7 now pass. 
--- generic/tclIOCmd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/generic/tclIOCmd.c b/generic/tclIOCmd.c index 8794365..e5ba298 100644 --- a/generic/tclIOCmd.c +++ b/generic/tclIOCmd.c @@ -471,10 +471,6 @@ Tcl_ReadObjCmd( TclGetString(chanObjPtr), Tcl_PosixError(interp))); } returnOptsPtr = Tcl_NewDictObj(); - Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-code", -1) - , Tcl_NewStringObj("error", -1)); - Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-level", -1) - , Tcl_NewIntObj(0)); Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) , resultPtr); TclChannelRelease(chan); -- cgit v0.12 From 3b45005127de0885251471d5591ecb58c5b3e286 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Thu, 29 Dec 2022 22:59:10 +0000 Subject: Arrange new code in DoReadChars to ensure that final steps are always taken. --- generic/tclIO.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index 63b9a7d..9ae8fb5 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -6024,8 +6024,9 @@ DoReadChars( } if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR)) { - /* TODO: We don't need this call? */ + /* TODO: UpdateInterest not needed here? */ UpdateInterest(chanPtr); + Tcl_SetErrno(EILSEQ); return -1; } @@ -6041,7 +6042,7 @@ DoReadChars( assert(statePtr->inputEncodingFlags & TCL_ENCODING_END); assert(!GotFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR)); - /* TODO: UpdateInterest isn't needed here? */ + /* TODO: UpdateInterest not needed here? */ UpdateInterest(chanPtr); return 0; } @@ -6055,7 +6056,7 @@ DoReadChars( } ResetFlag(statePtr, CHANNEL_BLOCKED|CHANNEL_EOF); statePtr->inputEncodingFlags &= ~TCL_ENCODING_END; - /* TODO: UpdateInterest isn't needed here? */ + /* TODO: UpdateInterest not needed here? 
*/ UpdateInterest(chanPtr); return 0; } @@ -6084,18 +6085,9 @@ DoReadChars( } else { copiedNow = ReadChars(statePtr, objPtr, toRead, &factor); } - if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR) && !GotFlag(statePtr, CHANNEL_NONBLOCKING)) { - /* Channel is Synchronous. Return an error so that [read] and - * friends can return an error - */ - TclChannelRelease((Tcl_Channel)chanPtr); - UpdateInterest(chanPtr); - Tcl_SetErrno(EILSEQ); - return -1; - } /* - * If the current buffer is empty recycle it. + * Recycle current buffer if empty. */ bufPtr = statePtr->inQueueHead; @@ -6108,6 +6100,15 @@ DoReadChars( statePtr->inQueueTail = NULL; } } + + if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR) && !GotFlag(statePtr, CHANNEL_NONBLOCKING)) { + /* Channel is synchronous. Return an error so that callers + * like [read] can return an error. + */ + Tcl_SetErrno(EILSEQ); + copied = -1; + goto finish; + } } if (copiedNow < 0) { @@ -6136,6 +6137,7 @@ DoReadChars( } } +finish: /* * Failure to fill a channel buffer may have left channel reporting a * "blocked" state, but so long as we fulfilled the request here, the -- cgit v0.12 From a801c2d4741015dbb5875938248eff1701e1ff29 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Fri, 30 Dec 2022 20:27:47 +0000 Subject: Fix DoReadChars() to correctly discard encoding errors after eofchar has been seen, and add new test, io-75.8.invalid. 
--- generic/tclEncoding.c | 7 ++++++- generic/tclIO.c | 16 ++++++++++++++-- tests/io.test | 36 ++++++++++++++++++++++++++++++++++-- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index d10d9ca..37b3073 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2386,7 +2386,12 @@ UtfToUtfProc( *dst++ = *src++; } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) { + && (UCHAR(src[1]) == 0x80) + && ( + !(flags & TCL_ENCODING_MODIFIED) + || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) + )) + { /* * If in input mode, and -strict is specified: This is an error. */ diff --git a/generic/tclIO.c b/generic/tclIO.c index 9ae8fb5..3b47de5 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -6101,7 +6101,16 @@ DoReadChars( } } - if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR) && !GotFlag(statePtr, CHANNEL_NONBLOCKING)) { + /* + * If CHANNEL_ENCODING_ERROR and CHANNEL_STICKY_EOF are both set, + * then CHANNEL_ENCODING_ERROR was caused by data that occurred + * after the EOF character was encountered, so it doesn't count as + * a real error. + */ + + if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR) + && !GotFlag(statePtr, CHANNEL_STICKY_EOF) + && !GotFlag(statePtr, CHANNEL_NONBLOCKING)) { /* Channel is synchronous. Return an error so that callers * like [read] can return an error. */ @@ -6816,11 +6825,14 @@ TranslateInputEOL( * EOF character was seen in EOL translated range. Leave current file * position pointing at the EOF character, but don't store the EOF * character in the output string. + * + * If CHANNEL_ENCODING_ERROR is set, it can only be because of data + * encountered after the EOF character, so it is nonsense. Unset it. 
*/ SetFlag(statePtr, CHANNEL_EOF | CHANNEL_STICKY_EOF); statePtr->inputEncodingFlags |= TCL_ENCODING_END; - ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR); + ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR|CHANNEL_ENCODING_ERROR); } } diff --git a/tests/io.test b/tests/io.test index 451a790..aece338 100644 --- a/tests/io.test +++ b/tests/io.test @@ -9089,11 +9089,15 @@ test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { removeFile io-75.7 } -match glob -result {41 0 1 {error reading "*": illegal byte sequence} ¡} -test io-75.8 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { +test io-75.8.incomplete { + incomplete uft-8 char after eof char is not an error (-strictencoding 1) +} -setup { + set hd {} set fn [makeFile {} io-75.8] set f [open $fn w+] fconfigure $f -encoding binary - # \x81 is invalid in utf-8, but since \x1A comes first, -eofchar takes precedence. + # \x81 is invalid and also incomplete utf-8 data, but because the eof + # character \x1A appears first, it's not an error. puts -nonewline $f A\x1A\x81 flush $f seek $f 0 @@ -9102,6 +9106,7 @@ test io-75.8 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { set d [read $f] binary scan $d H* hd lappend hd [eof $f] + # there should be no error on additional reads lappend hd [read $f] close $f set hd @@ -9109,6 +9114,33 @@ test io-75.8 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { removeFile io-75.8 } -result {41 1 {}} + +test io-75.8.invalid {invalid utf-8 after eof char is not an error (-strictencoding 1)} -setup { + set res {} + set fn [makeFile {} io-75.8] + set f [open $fn w+] + fconfigure $f -encoding binary + # \xc0\x80 is invalid utf-8 data, but because the eof character \x1A + # appears first, it's not an error. 
+ puts -nonewline $f A\x1a\xc0\x80 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 +} -body { + set d [read $f] + foreach char [split $d {}] { + lappend res [format %x [scan $char %c]] + } + lappend res [eof $f] + # there should be no error on additional reads + lappend res [read $f] + close $f + set res +} -cleanup { + removeFile io-75.8 +} -result {41 1 {}} + + test io-75.9 {unrepresentable character write passes and is replaced by ?} -setup { set fn [makeFile {} io-75.9] set f [open $fn w+] -- cgit v0.12 From 63e04b3c2dc7ecaf014a93f2116b5913a256e875 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Fri, 30 Dec 2022 21:05:56 +0000 Subject: New test, io-12.9.strict, for issue report [1bedc53c8cb878f0]. --- tests/io.test | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/io.test b/tests/io.test index aece338..6fb3587 100644 --- a/tests/io.test +++ b/tests/io.test @@ -1555,11 +1555,29 @@ test io-12.9 {ReadChars: multibyte chars split} -body { set f [open $path(test1)] fconfigure $f -encoding utf-8 -buffersize 10 set in [read $f] + read $f close $f scan [string index $in end] %c } -cleanup { catch {close $f} } -result 194 +test io-12.9.strict {ReadChars: multibyte chars split} -body { + set res {} + set f [open $path(test1) w] + fconfigure $f -translation binary + puts -nonewline $f [string repeat a 9]\xC2 + close $f + set f [open $path(test1)] + fconfigure $f -encoding utf-8 -strictencoding 1 -buffersize 10 + set status [catch {read $f} cres copts] + set in [dict get $copts -result] + lappend res $in + lappend res $status $cres + set res +} -cleanup { + close $f + catch {close $f} +} -match glob -result {aaaaaaaaa 1 {error reading "*": illegal byte sequence}} test io-12.10 {ReadChars: multibyte chars split} -body { set f [open $path(test1) w] fconfigure $f -translation binary -- cgit v0.12 From 3919b0a0b4e371b574d16adaa1c73df6da8007ce Mon Sep 17 00:00:00 2001 From: 
pooryorick Date: Fri, 30 Dec 2022 21:53:47 +0000 Subject: Add test for [gets] in non-strict mode after an encoding error. --- tests/io.test | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/io.test b/tests/io.test index 6fb3587..2fa06ea 100644 --- a/tests/io.test +++ b/tests/io.test @@ -9254,6 +9254,28 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} - removeFile io-75.13 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} +test io-75.14 {invalid utf-8 encoding [gets] coninues in non-strict mode after error} -setup { + set fn [makeFile {} io-75.14] + set f [open $fn w+] + fconfigure $f -encoding binary + # \xc0 is invalid in utf-8 + puts -nonewline $f a\nb\xc0\nc\n + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar {} -translation lf -strictencoding 1 +} -body { + lappend res [gets $f] + set status [catch {gets $f} cres copts] + lappend res $status $cres + chan configure $f -strictencoding 0 + lappend res [gets $f] + lappend res [gets $f] + close $f + return $res +} -cleanup { + removeFile io-75.14 +} -match glob -result {a 1 {error reading "*": illegal byte sequence} bÀ c} + # ### ### ### ######### ######### ######### -- cgit v0.12 From 985ea00b16865c0dccc99eb9b006f97e8e59edb0 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Mon, 2 Jan 2023 23:12:02 +0000 Subject: Merge py-b8f575aa23: Fix for [154ed7ce56], Tcl 9: [gets] on -strictencoding 1 configured channel. 
--- generic/tclIO.c | 28 ++++++++++++++++++++++++++-- generic/tclIOCmd.c | 7 +++++-- tests/io.test | 32 ++++++++++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index 3b47de5..81af96e 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -4656,7 +4656,8 @@ Tcl_GetsObj( /* State info for channel */ ChannelBuffer *bufPtr; int inEofChar, skip, copiedTotal, oldFlags, oldRemoved; - int oldLength; + int reportError = 0; + size_t oldLength; Tcl_Encoding encoding; char *dst, *dstEnd, *eol, *eof; Tcl_EncodingState oldState; @@ -4664,6 +4665,7 @@ Tcl_GetsObj( if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR)) { UpdateInterest(chanPtr); Tcl_SetErrno(EILSEQ); + ResetFlag(statePtr, CHANNEL_ENCODING_ERROR); return TCL_INDEX_NONE; } @@ -4938,6 +4940,19 @@ Tcl_GetsObj( goto done; } goto gotEOL; + } else if (gs.bytesWrote == 0 + && GotFlag(statePtr, CHANNEL_ENCODING_ERROR)) { + /* Set eol to the position that caused the encoding error, and then + * coninue to gotEOL, which stores the data that was decoded + * without error to objPtr. This allows the caller to do something + * useful with the data decoded so far, and also results in the + * position of the file being the first byte that was not + * succesfully decoded, allowing further processing at exactly that + * point, if desired. 
+ */ + eol = dstEnd; + reportError = 1; + goto gotEOL; } dst = dstEnd; } @@ -4981,7 +4996,16 @@ Tcl_GetsObj( Tcl_SetObjLength(objPtr, eol - objPtr->bytes); CommonGetsCleanup(chanPtr); ResetFlag(statePtr, CHANNEL_BLOCKED); - copiedTotal = gs.totalChars + gs.charsWrote - skip; + if (reportError) { + ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR|CHANNEL_ENCODING_ERROR); + /* reset CHANNEL_ENCODING_ERROR to afford a chance to reconfigure + * the channel and try again + */ + Tcl_SetErrno(EILSEQ); + copiedTotal = -1; + } else { + copiedTotal = gs.totalChars + gs.charsWrote - skip; + } goto done; /* diff --git a/generic/tclIOCmd.c b/generic/tclIOCmd.c index e5ba298..bc52b8e 100644 --- a/generic/tclIOCmd.c +++ b/generic/tclIOCmd.c @@ -295,7 +295,7 @@ Tcl_GetsObjCmd( Tcl_Channel chan; /* The channel to read from. */ int lineLen; /* Length of line just read. */ int mode; /* Mode in which channel is opened. */ - Tcl_Obj *linePtr, *chanObjPtr; + Tcl_Obj *linePtr, *chanObjPtr, *returnOptsPtr; int code = TCL_OK; if ((objc != 2) && (objc != 3)) { @@ -318,7 +318,6 @@ Tcl_GetsObjCmd( lineLen = Tcl_GetsObj(chan, linePtr); if (lineLen < 0) { if (!Tcl_Eof(chan) && !Tcl_InputBlocked(chan)) { - Tcl_DecrRefCount(linePtr); /* * TIP #219. 
@@ -332,7 +331,11 @@ Tcl_GetsObjCmd( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } + returnOptsPtr = Tcl_NewDictObj(); + Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) + , linePtr); code = TCL_ERROR; + Tcl_SetReturnOptions(interp, returnOptsPtr); goto done; } lineLen = TCL_INDEX_NONE; diff --git a/tests/io.test b/tests/io.test index 2fa06ea..854759e 100644 --- a/tests/io.test +++ b/tests/io.test @@ -9255,6 +9255,7 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} - } -match glob -result {41 1 {error reading "*": illegal byte sequence}} test io-75.14 {invalid utf-8 encoding [gets] coninues in non-strict mode after error} -setup { + set res {} set fn [makeFile {} io-75.14] set f [open $fn w+] fconfigure $f -encoding binary @@ -9271,13 +9272,40 @@ test io-75.14 {invalid utf-8 encoding [gets] coninues in non-strict mode after e lappend res [gets $f] lappend res [gets $f] close $f - return $res + return $res } -cleanup { removeFile io-75.14 } -match glob -result {a 1 {error reading "*": illegal byte sequence} bÀ c} -# ### ### ### ######### ######### ######### +test io-75.15 {invalid utf-8 encoding strict gets should not hang} -setup { + set res {} + set fn [makeFile {} io-75.15] + set chan [open $fn w+] + fconfigure $chan -encoding binary + # This is not valid UTF-8 + puts $chan hello\nAB\xc0\x40CD\nEFG + close $chan +} -body { + #Now try to read it with [gets] + set chan [open $fn] + fconfigure $chan -encoding utf-8 -strictencoding 1 + lappend res [gets $chan] + set status [catch {gets $chan} cres copts] + lappend res $status $cres + set status [catch {gets $chan} cres copts] + lappend res $status $cres + lappend res [dict get $copts -result] + chan configur $chan -encoding binary + foreach char [split [read $chan 2] {}] { + lappend res [format %x [scan $char %c]] + } + return $res +} -cleanup { + close $chan + removeFile io-75.15 +} -match glob -result {hello 1 {error reading "*": 
illegal byte sequence}\ + 1 {error reading "*": illegal byte sequence} AB c0 40} test io-76.0 {channel modes} -setup { -- cgit v0.12 From 805fa175fc88005a9955a6202f05d17b91b70c19 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Sun, 8 Jan 2023 10:07:46 +0000 Subject: For [read] and [gets] encoding errors, use "-result read" in return options dictionary instead of just "-result". --- generic/tclIOCmd.c | 14 ++++-- tests/io.test | 141 ++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 106 insertions(+), 49 deletions(-) diff --git a/generic/tclIOCmd.c b/generic/tclIOCmd.c index bc52b8e..2eeb04c 100644 --- a/generic/tclIOCmd.c +++ b/generic/tclIOCmd.c @@ -295,7 +295,7 @@ Tcl_GetsObjCmd( Tcl_Channel chan; /* The channel to read from. */ int lineLen; /* Length of line just read. */ int mode; /* Mode in which channel is opened. */ - Tcl_Obj *linePtr, *chanObjPtr, *returnOptsPtr; + Tcl_Obj *linePtr, *chanObjPtr, *resultDictPtr, *returnOptsPtr; int code = TCL_OK; if ((objc != 2) && (objc != 3)) { @@ -331,9 +331,12 @@ Tcl_GetsObjCmd( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } + resultDictPtr = Tcl_NewDictObj(); + Tcl_DictObjPut(NULL, resultDictPtr, Tcl_NewStringObj("read", -1) + , linePtr); returnOptsPtr = Tcl_NewDictObj(); Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) - , linePtr); + , resultDictPtr); code = TCL_ERROR; Tcl_SetReturnOptions(interp, returnOptsPtr); goto done; @@ -384,7 +387,7 @@ Tcl_ReadObjCmd( int toRead; /* How many bytes to read? */ int charactersRead; /* How many characters were read? */ int mode; /* Mode in which channel is opened. 
*/ - Tcl_Obj *resultPtr, *returnOptsPtr, *chanObjPtr; + Tcl_Obj *resultPtr, *resultDictPtr, *returnOptsPtr, *chanObjPtr; if ((objc != 2) && (objc != 3)) { Interp *iPtr; @@ -473,9 +476,12 @@ Tcl_ReadObjCmd( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } + resultDictPtr = Tcl_NewDictObj(); + Tcl_DictObjPut(NULL, resultDictPtr, Tcl_NewStringObj("read", -1) + , resultPtr); returnOptsPtr = Tcl_NewDictObj(); Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) - , resultPtr); + , resultDictPtr); TclChannelRelease(chan); Tcl_DecrRefCount(resultPtr); Tcl_SetReturnOptions(interp, returnOptsPtr); diff --git a/tests/io.test b/tests/io.test index 854759e..3f00561 100644 --- a/tests/io.test +++ b/tests/io.test @@ -1547,37 +1547,43 @@ test io-12.8 {ReadChars: multibyte chars split} { close $f scan [string index $in end] %c } 160 -test io-12.9 {ReadChars: multibyte chars split} -body { - set f [open $path(test1) w] - fconfigure $f -translation binary - puts -nonewline $f [string repeat a 9]\xC2 - close $f - set f [open $path(test1)] - fconfigure $f -encoding utf-8 -buffersize 10 - set in [read $f] - read $f - close $f - scan [string index $in end] %c -} -cleanup { - catch {close $f} -} -result 194 -test io-12.9.strict {ReadChars: multibyte chars split} -body { - set res {} - set f [open $path(test1) w] - fconfigure $f -translation binary - puts -nonewline $f [string repeat a 9]\xC2 - close $f - set f [open $path(test1)] - fconfigure $f -encoding utf-8 -strictencoding 1 -buffersize 10 - set status [catch {read $f} cres copts] - set in [dict get $copts -result] - lappend res $in - lappend res $status $cres - set res -} -cleanup { - close $f - catch {close $f} -} -match glob -result {aaaaaaaaa 1 {error reading "*": illegal byte sequence}} + + +apply [list {} { + set template { + test io-12.9.@variant@ {ReadChars: multibyte chars split, default (strict)} -body { + set res {} + set f [open $path(test1) w] + fconfigure $f -translation 
binary + puts -nonewline $f [string repeat a 9]\xC2 + close $f + set f [open $path(test1)] + fconfigure $f -encoding utf-8 @strict@ -buffersize 10 + set status [catch {read $f} cres copts] + set in [dict get $copts -result] + lappend res $in + lappend res $status $cres + set status [catch {read $f} cres copts] + set in [dict get $copts -result] + lappend res $in + lappend res $status $cres + set res + } -cleanup { + catch {close $f} + } -match glob -result {{read aaaaaaaaa} 1\ + {error reading "*": illegal byte sequence}\ + {read {}} 1 {error reading "*": illegal byte sequence}} + } + + # strict encoding may be the default in Tcl 9, but in 8 it is not + foreach variant {encodingstrict} strict {{-strictencoding 1}} { + set script [string map [ + list @variant@ $variant @strict@ $strict] $template] + uplevel 1 $script + } +} [namespace current]] + + test io-12.10 {ReadChars: multibyte chars split} -body { set f [open $path(test1) w] fconfigure $f -translation binary @@ -9075,7 +9081,7 @@ test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -s fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 } -body { set status [catch {read $f} cres copts] - set d [dict get $copts -result] + set d [dict get $copts -result read] binary scan $d H* hd lappend hd $status $cres } -cleanup { @@ -9094,7 +9100,7 @@ test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 } -body { set status [catch {read $f} cres copts] - set d [dict get $copts -result] + set d [dict get $copts -result read] binary scan $d H* hd lappend hd [eof $f] lappend hd $status @@ -9173,9 +9179,7 @@ test io-75.9 {unrepresentable character write passes and is replaced by ?} -setu removeFile io-75.9 } -match glob -result [list {A} {error writing "*": illegal byte sequence}] -# Incomplete sequence test. 
-# This error may IMHO only be detected with the close. -# But the read already returns the incomplete sequence. + test io-75.10 {incomplete multibyte encoding read is ignored} -setup { set fn [makeFile {} io-75.10] set f [open $fn w+] @@ -9183,7 +9187,7 @@ test io-75.10 {incomplete multibyte encoding read is ignored} -setup { puts -nonewline $f A\xC0 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none + fconfigure $f -encoding utf-8 -strictencoding 0 -buffering none } -body { set d [read $f] close $f @@ -9192,8 +9196,32 @@ test io-75.10 {incomplete multibyte encoding read is ignored} -setup { } -cleanup { removeFile io-75.10 } -result 41c0 -# The current result returns the orphan byte as byte. -# This may be expected due to special utf-8 handling. + + +test io-75.10_strict {incomplete multibyte encoding read is an error} -setup { + set res {} + set fn [makeFile {} io-75.10] + set f [open $fn w+] + fconfigure $f -encoding binary + puts -nonewline $f A\xC0 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -strictencoding 1 -buffering none +} -body { + set status [catch {read $f} cres copts] + set d [dict get $copts -result read] + binary scan $d H* hd + lappend res $hd $cres + chan configure $f -encoding iso8859-1 + set d [read $f] + binary scan $d H* hd + lappend res $hd + close $f + return $res +} -cleanup { + removeFile io-75.10 +} -match glob -result {41 {error reading "*": illegal byte sequence} c0} + # As utf-8 has a special treatment in multi-byte decoding, also test another # one. 
@@ -9206,10 +9234,11 @@ test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { puts -nonewline $f A\x81\xFFA flush $f seek $f 0 - fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -strictencoding 1 + fconfigure $f -encoding shiftjis -buffering none -eofchar "" \ + -translation lf -strictencoding 1 } -body { set status [catch {read $f} cres copts] - set d [dict get $copts -result] + set d [dict get $copts -result read] binary scan $d H* hd lappend hd $status lappend hd $cres @@ -9218,14 +9247,36 @@ test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { removeFile io-75.11 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} -test io-75.12 {invalid utf-8 encoding read is ignored} -setup { + +test io-75.12 {invalid utf-8 encoding read is an error} -setup { + set res {} + set fn [makeFile {} io-75.12] + set f [open $fn w+] + fconfigure $f -encoding binary + puts -nonewline $f A\x81 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar {} -translation lf \ + -strictencoding 1 +} -body { + set status [catch {read $f} cres copts] + set d [dict get $copts -result read] + close $f + binary scan $d H* hd + lappend res $hd $status $cres + return $res +} -cleanup { + removeFile io-75.12 +} -match glob -result {41 1 {error reading "*": illegal byte sequence}} +test io-75.12_ignore {invalid utf-8 encoding read is ignored} -setup { set fn [makeFile {} io-75.12] set f [open $fn w+] fconfigure $f -encoding binary puts -nonewline $f A\x81 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf + fconfigure $f -encoding utf-8 -buffering none -eofchar {} \ + -translation lf -strictencoding 0 } -body { set d [read $f] close $f @@ -9245,7 +9296,7 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 } -body { set 
status [catch {read $f} cres copts] - set d [dict get $copts -result] + set d [dict get $copts -result read] binary scan $d H* hd lappend hd $status close $f @@ -9305,7 +9356,7 @@ test io-75.15 {invalid utf-8 encoding strict gets should not hang} -setup { close $chan removeFile io-75.15 } -match glob -result {hello 1 {error reading "*": illegal byte sequence}\ - 1 {error reading "*": illegal byte sequence} AB c0 40} + 1 {error reading "*": illegal byte sequence} {read AB} c0 40} test io-76.0 {channel modes} -setup { -- cgit v0.12 From 637e7224c9b4c5bde7709455dc262bdf476f9b4d Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 2 Feb 2023 11:52:31 +0000 Subject: Replace encoding -strict etc. with -profile --- generic/tclCmdAH.c | 325 +++++++++++++++++++++++++++++--------------------- generic/tclEncoding.c | 34 ++++++ generic/tclInt.h | 20 ++++ tests/encoding.test | 132 ++++++++++---------- 4 files changed, 310 insertions(+), 201 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 4f743cc..818159d 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -527,6 +527,137 @@ TclInitEncodingCmd( } /* + *------------------------------------------------------------------------ + * + * EncodingConvertParseOptions -- + * + * Common routine for parsing arguments passed to encoding convertfrom + * and encoding convertto. + * + * Results: + * TCL_OK or TCL_ERROR. + * + * Side effects: + * On success, + * - *encPtr is set to the encoding. Must be freed with Tcl_FreeEncoding + * if non-NULL + * - *dataObjPtr is set to the Tcl_Obj containing the data to encode or + * decode + * - *flagsPtr is set to encoding error handling flags + * - *failVarPtr is set to -failindex option value or NULL + * On error, all of the above are uninitialized. + * + *------------------------------------------------------------------------ + */ +static int +EncodingConvertParseOptions ( + Tcl_Interp *interp, /* For error messages. 
May be NULL */ + int objc, /* Number of arguments */ + Tcl_Obj *const objv[], /* Argument objects as passed to command. */ + int isEncoder, /* 1 -> convertto, 0 -> convertfrom */ + Tcl_Encoding *encPtr, /* Where to store the encoding */ + Tcl_Obj **dataObjPtr, /* Where to store ptr to Tcl_Obj containing data */ + int *flagsPtr, /* Bit mask of encoding option flags */ + Tcl_Obj **failVarPtr /* Where to store -failindex option value */ +) +{ + static const char *const options[] = {"-profile", "-failindex", NULL}; + enum convertfromOptions { PROFILE, FAILINDEX } optIndex; + enum TclEncodingProfile profile; + Tcl_Encoding encoding; + Tcl_Obj *dataObj; + Tcl_Obj *failVarObj; +#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) + int flags = TCL_ENCODING_STOPONERROR; +#else + int flags = TCL_ENCODING_NOCOMPLAIN; +#endif + + /* + * Possible combinations: + * 1) data -> objc = 2 + * 2) ?options? encoding data -> objc >= 3 + * It is intentional that specifying option forces encoding to be + * specified. Less prone to user error. This should have always been + * the case even in 8.6 imho where there were no options (ie (1) + * should never have been allowed) + */ + + if (objc == 1) { +numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ + Tcl_WrongNumArgs( + interp, + 1, + objv, + "??-profile profile? ?-failindex var? ?encoding?? 
data"); + return TCL_ERROR; + } + + failVarObj = NULL; + if (objc == 2) { + encoding = Tcl_GetEncoding(interp, NULL); + dataObj = objv[1]; + } else { + int argIndex; + for (argIndex = 1; argIndex < (objc-2); ++argIndex) { + if (Tcl_GetIndexFromObj( + interp, objv[argIndex], options, "option", 0, &optIndex) + != TCL_OK) { + return TCL_ERROR; + } + if (++argIndex == (objc - 2)) { + goto numArgsError; + } + switch (optIndex) { + case PROFILE: + if (TclEncodingProfileParseName( + interp, objv[argIndex], &profile) + != TCL_OK) { + return TCL_ERROR; + } + switch (profile) { + case TCL_ENCODING_PROFILE_TCL8: + flags = TCL_ENCODING_NOCOMPLAIN; + break; + case TCL_ENCODING_PROFILE_STRICT: + flags = TCL_ENCODING_STRICT; + break; + case TCL_ENCODING_PROFILE_DEFAULT: /* FALLTHRU */ + default: + break; + } + break; + case FAILINDEX: + failVarObj = objv[argIndex]; + break; + } + } + /* Get encoding after opts so no need to free it on option error */ + if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) + != TCL_OK) { + return TCL_ERROR; + } + dataObj = objv[objc - 1]; + } + + /* -failindex forces checking*/ + if (failVarObj != NULL && flags == TCL_ENCODING_NOCOMPLAIN) { + /* + * Historical, but I really don't like this mixing of defines + * from two different bit mask domains - ENCODING_FAILINDEX + */ + flags = isEncoder ? 
TCL_ENCODING_STOPONERROR : ENCODING_FAILINDEX; + } + + *encPtr = encoding; + *dataObjPtr = dataObj; + *flagsPtr = flags; + *failVarPtr = failVarObj; + + return TCL_OK; +} + +/* *---------------------------------------------------------------------- * * EncodingConvertfromObjCmd -- @@ -559,78 +690,73 @@ EncodingConvertfromObjCmd( #endif int result; Tcl_Obj *failVarObj = NULL; + static const char *const options[] = {"-profile", "-failindex", NULL}; + enum convertfromOptions { PROFILE, FAILINDEX } optIndex; + enum TclEncodingProfile profile; + /* - * Decode parameters: * Possible combinations: * 1) data -> objc = 2 - * 2) encoding data -> objc = 3 - * 3) -nocomplain data -> objc = 3 - * 4) -nocomplain encoding data -> objc = 4 - * 5) -strict data -> objc = 3 - * 6) -strict encoding data -> objc = 4 - * 7) -failindex val data -> objc = 4 - * 8) -failindex val encoding data -> objc = 5 + * 2) ?options? encoding data -> objc >= 3 + * It is intentional that specifying option forces encoding to be + * specified. Less prone to user error. This should have always been + * the case even in 8.6 imho where there were no options (ie (1) + * should never have been allowed) */ - if (objc == 2) { + if (objc == 1) { +numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ + Tcl_WrongNumArgs( + interp, + 1, + objv, + "??-profile profile? ?-failindex var? ?encoding?? 
data"); + return TCL_ERROR; + } + else if (objc == 2) { encoding = Tcl_GetEncoding(interp, NULL); data = objv[1]; - } else if (objc > 2 && objc < 7) { - int objcUnprocessed = objc; - data = objv[objc - 1]; - bytesPtr = Tcl_GetString(objv[1]); - if (bytesPtr[0] == '-' && bytesPtr[1] == 'n' - && !strncmp(bytesPtr, "-nocomplain", strlen(bytesPtr))) { - flags = TCL_ENCODING_NOCOMPLAIN; - objcUnprocessed--; - } else if (bytesPtr[0] == '-' && bytesPtr[1] == 's' - && !strncmp(bytesPtr, "-strict", strlen(bytesPtr))) { - flags = TCL_ENCODING_STRICT; - objcUnprocessed--; - bytesPtr = Tcl_GetString(objv[2]); - if (bytesPtr[0] == '-' && bytesPtr[1] == 'f' - && !strncmp(bytesPtr, "-failindex", strlen(bytesPtr))) { - /* at least two additional arguments needed */ - if (objc < 6) { - goto encConvFromError; - } - failVarObj = objv[3]; - objcUnprocessed -= 2; - } - } else if (bytesPtr[0] == '-' && bytesPtr[1] == 'f' - && !strncmp(bytesPtr, "-failindex", strlen(bytesPtr))) { - /* at least two additional arguments needed */ - if (objc < 4) { - goto encConvFromError; + } else { + int argIndex; + for (argIndex = 1; argIndex < (objc-2); ++argIndex) { + if (Tcl_GetIndexFromObj( + interp, objv[argIndex], options, "option", 0, &optIndex) + != TCL_OK) { + return TCL_ERROR; } - failVarObj = objv[2]; - flags = ENCODING_FAILINDEX; - objcUnprocessed -= 2; - bytesPtr = Tcl_GetString(objv[3]); - if (bytesPtr[0] == '-' && bytesPtr[1] == 's' - && !strncmp(bytesPtr, "-strict", strlen(bytesPtr))) { - flags = TCL_ENCODING_STRICT; - objcUnprocessed --; - } - } - switch (objcUnprocessed) { - case 3: - if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) != TCL_OK) { + if (++argIndex == (objc - 2)) { + goto numArgsError; + } + switch (optIndex) { + case PROFILE: + if (TclEncodingProfileParseName( + interp, objv[argIndex], &profile) + != TCL_OK) { return TCL_ERROR; } + switch (profile) { + case TCL_ENCODING_PROFILE_TCL8: + flags = TCL_ENCODING_NOCOMPLAIN; + break; + case 
TCL_ENCODING_PROFILE_STRICT: + flags = TCL_ENCODING_STRICT; + break; + case TCL_ENCODING_PROFILE_DEFAULT: /* FALLTHRU */ + default: + break; + } break; - case 2: - encoding = Tcl_GetEncoding(interp, NULL); + case FAILINDEX: + failVarObj = objv[argIndex]; break; - default: - goto encConvFromError; + } } - } else { - encConvFromError: - Tcl_WrongNumArgs(interp, 1, objv, "?-strict? ?-failindex var? ?encoding? data"); - ((Interp *) interp)->flags |= INTERP_ALTERNATE_WRONG_ARGS; - Tcl_WrongNumArgs(interp, 1, objv, "-nocomplain ?encoding? data"); - return TCL_ERROR; + /* Get encoding after opts so no need to free it on option error */ + if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) + != TCL_OK) { + return TCL_ERROR; + } + data = objv[objc - 1]; } /* @@ -711,83 +837,12 @@ EncodingConverttoObjCmd( int length; /* Length of the string being converted */ const char *stringPtr; /* Pointer to the first byte of the string */ int result; -#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) - int flags = TCL_ENCODING_STOPONERROR; -#else - int flags = TCL_ENCODING_NOCOMPLAIN; -#endif - Tcl_Obj *failVarObj = NULL; - - /* - * Decode parameters: - * Possible combinations: - * 1) data -> objc = 2 - * 2) encoding data -> objc = 3 - * 3) -nocomplain data -> objc = 3 - * 4) -nocomplain encoding data -> objc = 4 - * 5) -failindex val data -> objc = 4 - * 6) -failindex val encoding data -> objc = 5 - */ - - if (objc == 2) { - encoding = Tcl_GetEncoding(interp, NULL); - data = objv[1]; - } else if (objc > 2 && objc < 7) { - int objcUnprocessed = objc; - data = objv[objc - 1]; - stringPtr = Tcl_GetString(objv[1]); - if (stringPtr[0] == '-' && stringPtr[1] == 'n' - && !strncmp(stringPtr, "-nocomplain", strlen(stringPtr))) { - flags = TCL_ENCODING_NOCOMPLAIN; - objcUnprocessed--; - } else if (stringPtr[0] == '-' && stringPtr[1] == 's' - && !strncmp(stringPtr, "-strict", strlen(stringPtr))) { - flags = TCL_ENCODING_STRICT; - objcUnprocessed--; - stringPtr = 
Tcl_GetString(objv[2]); - if (stringPtr[0] == '-' && stringPtr[1] == 'f' - && !strncmp(stringPtr, "-failindex", strlen(stringPtr))) { - /* at least two additional arguments needed */ - if (objc < 6) { - goto encConvToError; - } - failVarObj = objv[3]; - objcUnprocessed -= 2; - } - } else if (stringPtr[0] == '-' && stringPtr[1] == 'f' - && !strncmp(stringPtr, "-failindex", strlen(stringPtr))) { - /* at least two additional arguments needed */ - if (objc < 4) { - goto encConvToError; - } - failVarObj = objv[2]; - flags = TCL_ENCODING_STOPONERROR; - objcUnprocessed -= 2; - stringPtr = Tcl_GetString(objv[3]); - if (stringPtr[0] == '-' && stringPtr[1] == 's' - && !strncmp(stringPtr, "-strict", strlen(stringPtr))) { - flags = TCL_ENCODING_STRICT; - objcUnprocessed --; - } - } - switch (objcUnprocessed) { - case 3: - if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) != TCL_OK) { - return TCL_ERROR; - } - break; - case 2: - encoding = Tcl_GetEncoding(interp, NULL); - break; - default: - goto encConvToError; - } - } else { - encConvToError: - Tcl_WrongNumArgs(interp, 1, objv, "?-strict? ?-failindex var? ?encoding? data"); - ((Interp *) interp)->flags |= INTERP_ALTERNATE_WRONG_ARGS; - Tcl_WrongNumArgs(interp, 1, objv, "-nocomplain ?encoding? data"); + int flags; + Tcl_Obj *failVarObj; + if (EncodingConvertParseOptions( + interp, objc, objv, 1, &encoding, &data, &flags, &failVarObj) + != TCL_OK) { return TCL_ERROR; } diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 288b07c..bdd091f 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -4085,6 +4085,40 @@ InitializeEncodingSearchPath( } /* + *------------------------------------------------------------------------ + * + * TclEncodingProfileParseName -- + * + * Maps an encoding profile name to its enum value. + * + * Results: + * TCL_OK on success or TCL_ERROR on failure. 
+ * + * Side effects: + * Returns the profile enum value in *profilePtr + * + *------------------------------------------------------------------------ + */ +int +TclEncodingProfileParseName( + Tcl_Interp *interp, /* For error messages. May be NULL */ + Tcl_Obj *profileName, /* Name of profile */ + enum TclEncodingProfile *profilePtr) /* Output */ +{ + /* NOTE: Order must match enum TclEncodingProfile !!! */ + static const char *const profileNames[] = {"", "tcl8", "strict"}; + int idx; + + if (Tcl_GetIndexFromObj( + interp, profileName, profileNames, "profile", 0, &idx) + != TCL_OK) { + return TCL_ERROR; + } + *profilePtr = (enum TclEncodingProfile)idx; + return TCL_OK; +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 diff --git a/generic/tclInt.h b/generic/tclInt.h index 31c7fcb..db8ee9f 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2879,7 +2879,25 @@ MODULE_SCOPE int tclFindExecutableSearchDone; MODULE_SCOPE char *tclMemDumpFileName; MODULE_SCOPE TclPlatformType tclPlatform; +/* + * Declarations related to internal encoding functions. + */ + +/* + * Enum for encoding profiles that control encoding treatment of + * invalid bytes. NOTE: Order must match that of encodingProfileNames in + * TclEncodingProfileParseName() !!! 
+ */ +enum TclEncodingProfile { + TCL_ENCODING_PROFILE_DEFAULT, + TCL_ENCODING_PROFILE_TCL8, + TCL_ENCODING_PROFILE_STRICT, +}; MODULE_SCOPE Tcl_Encoding tclIdentityEncoding; +MODULE_SCOPE int +TclEncodingProfileParseName(Tcl_Interp *interp, + Tcl_Obj *profileName, + enum TclEncodingProfile *profilePtr); /* * TIP #233 (Virtualized Time) @@ -4787,6 +4805,8 @@ MODULE_SCOPE Tcl_LibraryInitProc TclThread_Init; MODULE_SCOPE Tcl_LibraryInitProc Procbodytest_Init; MODULE_SCOPE Tcl_LibraryInitProc Procbodytest_SafeInit; + + /* *---------------------------------------------------------------- * Macro used by the Tcl core to check whether a pattern has any characters diff --git a/tests/encoding.test b/tests/encoding.test index ae6c78a..813cd84 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -299,7 +299,7 @@ test encoding-11.11 {encoding: extended Unicode UTF-32} { test encoding-12.1 {LoadTableEncoding: normal encoding} { set x [encoding convertto iso8859-3 Ġ] - append x [encoding convertto -nocomplain iso8859-3 Õ] + append x [encoding convertto -profile tcl8 iso8859-3 Õ] append x [encoding convertfrom iso8859-3 Õ] } "Õ?Ġ" test encoding-12.2 {LoadTableEncoding: single-byte encoding} { @@ -348,67 +348,67 @@ test encoding-15.5 {UtfToUtfProc emoji character input} { } "4 😂" test encoding-15.6 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uDE02\uD83D - set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uDE02\uD83D] + set y [encoding convertto -profile tcl8 utf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z } {10 edb882f09f9882eda0bd} test encoding-15.7 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uD83D - set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uD83D] + set y [encoding convertto -profile tcl8 utf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z } {3 9 edb882eda0bdeda0bd} test encoding-15.8 {UtfToUtfProc emoji character output} { set x \uDE02\uD83Dé - 
set y [encoding convertto -nocomplain utf-8 \uDE02\uD83Dé] + set y [encoding convertto -profile tcl8 utf-8 \uDE02\uD83Dé] binary scan $y H* z list [string length $x] [string length $y] $z } {3 8 edb882eda0bdc3a9} test encoding-15.9 {UtfToUtfProc emoji character output} { set x \uDE02\uD83DX - set y [encoding convertto -nocomplain utf-8 \uDE02\uD83DX] + set y [encoding convertto -profile tcl8 utf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z } {3 7 edb882eda0bd58} test encoding-15.10 {UtfToUtfProc high surrogate character output} { set x \uDE02é - set y [encoding convertto -nocomplain utf-8 \uDE02é] + set y [encoding convertto -profile tcl8 utf-8 \uDE02é] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 edb882c3a9} test encoding-15.11 {UtfToUtfProc low surrogate character output} { set x \uDA02é - set y [encoding convertto -nocomplain utf-8 \uDA02é] + set y [encoding convertto -profile tcl8 utf-8 \uDA02é] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 eda882c3a9} test encoding-15.12 {UtfToUtfProc high surrogate character output} { set x \uDE02Y - set y [encoding convertto -nocomplain utf-8 \uDE02Y] + set y [encoding convertto -profile tcl8 utf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 edb88259} test encoding-15.13 {UtfToUtfProc low surrogate character output} { set x \uDA02Y - set y [encoding convertto -nocomplain utf-8 \uDA02Y] + set y [encoding convertto -profile tcl8 utf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 eda88259} test encoding-15.14 {UtfToUtfProc high surrogate character output} { set x \uDE02 - set y [encoding convertto -nocomplain utf-8 \uDE02] + set y [encoding convertto -profile tcl8 utf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 edb882} test encoding-15.15 {UtfToUtfProc low surrogate character output} { set x \uDA02 - set y [encoding 
convertto -nocomplain utf-8 \uDA02] + set y [encoding convertto -profile tcl8 utf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 eda882} test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { set x \xF0\xA0\xA1\xC2 - set y [encoding convertfrom -nocomplain utf-8 \xF0\xA0\xA1\xC2] + set y [encoding convertfrom -profile tcl8 utf-8 \xF0\xA0\xA1\xC2] list [string length $x] $y } "4 \xF0\xA0\xA1\xC2" test encoding-15.17 {UtfToUtfProc emoji character output} { @@ -513,10 +513,10 @@ test encoding-17.2 {UtfToUcs2Proc} -body { encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"] } -result "\uFFFD" test encoding-17.3 {UtfToUtf16Proc} -body { - encoding convertto -nocomplain utf-16be "\uDCDC" + encoding convertto -profile tcl8 utf-16be "\uDCDC" } -result "\xDC\xDC" test encoding-17.4 {UtfToUtf16Proc} -body { - encoding convertto -nocomplain utf-16le "\uD8D8" + encoding convertto -profile tcl8 utf-16le "\uD8D8" } -result "\xD8\xD8" test encoding-17.5 {UtfToUtf16Proc} -body { encoding convertto utf-32le "\U460DC" @@ -525,35 +525,35 @@ test encoding-17.6 {UtfToUtf16Proc} -body { encoding convertto utf-32be "\U460DC" } -result "\x00\x04\x60\xDC" test encoding-17.7 {UtfToUtf16Proc} -body { - encoding convertto -strict utf-16be "\uDCDC" + encoding convertto -profile strict utf-16be "\uDCDC" } -returnCodes error -result {unexpected character at index 0: 'U+00DCDC'} test encoding-17.8 {UtfToUtf16Proc} -body { - encoding convertto -strict utf-16le "\uD8D8" + encoding convertto -profile strict utf-16le "\uD8D8" } -returnCodes error -result {unexpected character at index 0: 'U+00D8D8'} test encoding-17.9 {Utf32ToUtfProc} -body { - encoding convertfrom -strict utf-32 "\xFF\xFF\xFF\xFF" + encoding convertfrom -profile strict utf-32 "\xFF\xFF\xFF\xFF" } -returnCodes error -result {unexpected byte sequence starting at index 0: '\xFF'} test encoding-17.10 {Utf32ToUtfProc} -body { - encoding convertfrom -nocomplain utf-32 
"\xFF\xFF\xFF\xFF" + encoding convertfrom -profile tcl8 utf-32 "\xFF\xFF\xFF\xFF" } -result \uFFFD test encoding-18.1 {TableToUtfProc on invalid input} -constraints deprecated -body { list [catch {encoding convertto jis0208 \\} res] $res } -result {0 !)} -test encoding-18.2 {TableToUtfProc on invalid input with -strict} -body { - list [catch {encoding convertto -strict jis0208 \\} res] $res +test encoding-18.2 {TableToUtfProc on invalid input with -profile strict} -body { + list [catch {encoding convertto -profile strict jis0208 \\} res] $res } -result {1 {unexpected character at index 0: 'U+00005C'}} -test encoding-18.3 {TableToUtfProc on invalid input with -strict -failindex} -body { - list [catch {encoding convertto -strict -failindex pos jis0208 \\} res] $res $pos +test encoding-18.3 {TableToUtfProc on invalid input with -profile strict -failindex} -body { + list [catch {encoding convertto -profile strict -failindex pos jis0208 \\} res] $res $pos } -result {0 {} 0} -test encoding-18.4 {TableToUtfProc on invalid input with -failindex -strict} -body { - list [catch {encoding convertto -failindex pos -strict jis0208 \\} res] $res $pos +test encoding-18.4 {TableToUtfProc on invalid input with -failindex -profile strict} -body { + list [catch {encoding convertto -failindex pos -profile strict jis0208 \\} res] $res $pos } -result {0 {} 0} test encoding-18.5 {TableToUtfProc on invalid input with -failindex} -body { list [catch {encoding convertto -failindex pos jis0208 \\} res] $res $pos } -result {0 {} 0} -test encoding-18.6 {TableToUtfProc on invalid input with -nocomplain} -body { - list [catch {encoding convertto -nocomplain jis0208 \\} res] $res +test encoding-18.6 {TableToUtfProc on invalid input with -profile tcl8} -body { + list [catch {encoding convertto -profile tcl8 jis0208 \\} res] $res } -result {0 !)} test encoding-19.1 {TableFromUtfProc} { @@ -669,25 +669,25 @@ test encoding-24.4 {Parse valid or invalid utf-8} { string length [encoding convertfrom utf-8 
"\xC0\x80"] } 1 test encoding-24.5 {Parse valid or invalid utf-8} { - string length [encoding convertfrom -nocomplain utf-8 "\xC0\x81"] + string length [encoding convertfrom -profile tcl8 utf-8 "\xC0\x81"] } 2 test encoding-24.6 {Parse valid or invalid utf-8} { - string length [encoding convertfrom -nocomplain utf-8 "\xC1\xBF"] + string length [encoding convertfrom -profile tcl8 utf-8 "\xC1\xBF"] } 2 test encoding-24.7 {Parse valid or invalid utf-8} { string length [encoding convertfrom utf-8 "\xC2\x80"] } 1 test encoding-24.8 {Parse valid or invalid utf-8} { - string length [encoding convertfrom -nocomplain utf-8 "\xE0\x80\x80"] + string length [encoding convertfrom -profile tcl8 utf-8 "\xE0\x80\x80"] } 3 test encoding-24.9 {Parse valid or invalid utf-8} { - string length [encoding convertfrom -nocomplain utf-8 "\xE0\x9F\xBF"] + string length [encoding convertfrom -profile tcl8 utf-8 "\xE0\x9F\xBF"] } 3 test encoding-24.10 {Parse valid or invalid utf-8} { string length [encoding convertfrom utf-8 "\xE0\xA0\x80"] } 1 test encoding-24.11 {Parse valid or invalid utf-8} { - string length [encoding convertfrom -nocomplain utf-8 "\xEF\xBF\xBF"] + string length [encoding convertfrom -profile tcl8 utf-8 "\xEF\xBF\xBF"] } 1 test encoding-24.12 {Parse valid or invalid utf-8} -constraints deprecated -body { encoding convertfrom utf-8 "\xC0\x81" @@ -713,68 +713,68 @@ test encoding-24.18 {Parse valid or invalid utf-8} -constraints testbytestring - test encoding-24.19 {Parse valid or invalid utf-8} -constraints deprecated -body { encoding convertto utf-8 "ZX\uD800" } -result ZX\xED\xA0\x80 -test encoding-24.20 {Parse with -nocomplain but without providing encoding} { - string length [encoding convertfrom -nocomplain "\x20"] -} 1 -test encoding-24.21 {Parse with -nocomplain but without providing encoding} { - string length [encoding convertto -nocomplain "\x20"] -} 1 +test encoding-24.20 {Parse with -profile tcl8 but without providing encoding} -body { + encoding convertfrom 
-profile tcl8 "\x20" +} -result {wrong # args: should be "::tcl::encoding::convertfrom ??-profile profile? ?-failindex var? ?encoding?? data"} -returnCodes error +test encoding-24.21 {Parse with -profile tcl8 but without providing encoding} -body { + string length [encoding convertto -profile tcl8 "\x20"] +} -result {::tcl::encoding::convertto ??-profile profile? ?-failindex var? ?encoding?? data} -returnCodes error test encoding-24.22 {Syntax error, two encodings} -body { encoding convertfrom iso8859-1 utf-8 "ZX\uD800" -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-strict? ?-failindex var? ?encoding? data" or "::tcl::encoding::convertfrom -nocomplain ?encoding? data"} +} -returnCodes 1 -result {::tcl::encoding::convertto ??-profile profile? ?-failindex var? ?encoding?? data} test encoding-24.23 {Syntax error, two encodings} -body { encoding convertto iso8859-1 utf-8 "ZX\uD800" -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-strict? ?-failindex var? ?encoding? data" or "::tcl::encoding::convertto -nocomplain ?encoding? data"} -test encoding-24.24 {Parse invalid utf-8 with -strict} -body { - encoding convertfrom -strict utf-8 "\xC0\x80\x00\x00" +} -returnCodes 1 -result {::tcl::encoding::convertto ??-profile profile? ?-failindex var? ?encoding?? 
data} +test encoding-24.24 {Parse invalid utf-8 with -profile strict} -body { + encoding convertfrom -profile strict utf-8 "\xC0\x80\x00\x00" } -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xC0'} -test encoding-24.25 {Parse invalid utf-8 with -strict} -body { - encoding convertfrom -strict utf-8 "\x40\x80\x00\x00" +test encoding-24.25 {Parse invalid utf-8 with -profile strict} -body { + encoding convertfrom -profile strict utf-8 "\x40\x80\x00\x00" } -returnCodes 1 -result {unexpected byte sequence starting at index 1: '\x80'} -test encoding-24.26 {Parse valid utf-8 with -strict} -body { - encoding convertfrom -strict utf-8 "\xF1\x80\x80\x80" +test encoding-24.26 {Parse valid utf-8 with -profile strict} -body { + encoding convertfrom -profile strict utf-8 "\xF1\x80\x80\x80" } -result \U40000 -test encoding-24.27 {Parse invalid utf-8 with -strict} -body { - encoding convertfrom -strict utf-8 "\xF0\x80\x80\x80" +test encoding-24.27 {Parse invalid utf-8 with -profile strict} -body { + encoding convertfrom -profile strict utf-8 "\xF0\x80\x80\x80" } -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xF0'} -test encoding-24.28 {Parse invalid utf-8 with -strict} -body { - encoding convertfrom -strict utf-8 "\xFF\x00\x00" +test encoding-24.28 {Parse invalid utf-8 with -profile strict} -body { + encoding convertfrom -profile strict utf-8 "\xFF\x00\x00" } -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xFF'} test encoding-24.29 {Parse invalid utf-8} -body { encoding convertfrom utf-8 \xEF\xBF\xBF } -result \uFFFF -test encoding-24.30 {Parse noncharacter with -strict} -body { - encoding convertfrom -strict utf-8 \xEF\xBF\xBF +test encoding-24.30 {Parse noncharacter with -profile strict} -body { + encoding convertfrom -profile strict utf-8 \xEF\xBF\xBF } -result \uFFFF -test encoding-24.31 {Parse invalid utf-8 with -nocomplain} -body { - encoding convertfrom -nocomplain utf-8 \xEF\xBF\xBF +test 
encoding-24.31 {Parse invalid utf-8 with -profile tcl8} -body { + encoding convertfrom -profile tcl8 utf-8 \xEF\xBF\xBF } -result \uFFFF test encoding-24.32 {Try to generate invalid utf-8} -body { encoding convertto utf-8 \uFFFF } -result \xEF\xBF\xBF -test encoding-24.33 {Try to generate noncharacter with -strict} -body { - encoding convertto -strict utf-8 \uFFFF +test encoding-24.33 {Try to generate noncharacter with -profile strict} -body { + encoding convertto -profile strict utf-8 \uFFFF } -result \xEF\xBF\xBF -test encoding-24.34 {Try to generate invalid utf-8 with -nocomplain} -body { - encoding convertto -nocomplain utf-8 \uFFFF +test encoding-24.34 {Try to generate invalid utf-8 with -profile tcl8} -body { + encoding convertto -profile tcl8 utf-8 \uFFFF } -result \xEF\xBF\xBF test encoding-24.35 {Parse invalid utf-8} -constraints deprecated -body { encoding convertfrom utf-8 \xED\xA0\x80 } -result \uD800 -test encoding-24.36 {Parse invalid utf-8 with -strict} -body { - encoding convertfrom -strict utf-8 \xED\xA0\x80 +test encoding-24.36 {Parse invalid utf-8 with -profile strict} -body { + encoding convertfrom -profile strict utf-8 \xED\xA0\x80 } -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xED'} -test encoding-24.37 {Parse invalid utf-8 with -nocomplain} -body { - encoding convertfrom -nocomplain utf-8 \xED\xA0\x80 +test encoding-24.37 {Parse invalid utf-8 with -profile tcl8} -body { + encoding convertfrom -profile tcl8 utf-8 \xED\xA0\x80 } -result \uD800 test encoding-24.38 {Try to generate invalid utf-8} -constraints deprecated -body { encoding convertto utf-8 \uD800 } -result \xED\xA0\x80 -test encoding-24.39 {Try to generate invalid utf-8 with -strict} -body { - encoding convertto -strict utf-8 \uD800 +test encoding-24.39 {Try to generate invalid utf-8 with -profile strict} -body { + encoding convertto -profile strict utf-8 \uD800 } -returnCodes 1 -result {unexpected character at index 0: 'U+00D800'} -test encoding-24.40 {Try 
to generate invalid utf-8 with -nocomplain} -body { - encoding convertto -nocomplain utf-8 \uD800 +test encoding-24.40 {Try to generate invalid utf-8 with -profile tcl8} -body { + encoding convertto -profile tcl8 utf-8 \uD800 } -result \xED\xA0\x80 file delete [file join [temporaryDirectory] iso2022.txt] @@ -931,7 +931,7 @@ test encoding-28.0 {all encodings load} -body { set string hello foreach name [encoding names] { incr count - encoding convertto -nocomplain $name $string + encoding convertto -profile tcl8 $name $string # discard the cached internal representation of Tcl_Encoding # Unfortunately, without this, encoding 2-1 fails. -- cgit v0.12 From 26e89b4b3c03b100a2a461c034c1930a23a4273b Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 2 Feb 2023 12:23:37 +0000 Subject: Use common option parsing for ConvertfromObjCmd. Fix test error messages. --- generic/tclCmdAH.c | 76 ++++------------------------------------------------- tests/encoding.test | 6 ++--- 2 files changed, 8 insertions(+), 74 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 818159d..67f76a6 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -683,81 +683,15 @@ EncodingConvertfromObjCmd( Tcl_Encoding encoding; /* Encoding to use */ int length; /* Length of the byte array being converted */ const char *bytesPtr; /* Pointer to the first byte of the array */ -#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) - int flags = TCL_ENCODING_STOPONERROR; -#else - int flags = TCL_ENCODING_NOCOMPLAIN; -#endif + int flags; int result; - Tcl_Obj *failVarObj = NULL; - static const char *const options[] = {"-profile", "-failindex", NULL}; - enum convertfromOptions { PROFILE, FAILINDEX } optIndex; - enum TclEncodingProfile profile; - - /* - * Possible combinations: - * 1) data -> objc = 2 - * 2) ?options? encoding data -> objc >= 3 - * It is intentional that specifying option forces encoding to be - * specified. Less prone to user error. 
This should have always been - * the case even in 8.6 imho where there were no options (ie (1) - * should never have been allowed) - */ + Tcl_Obj *failVarObj; - if (objc == 1) { -numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ - Tcl_WrongNumArgs( - interp, - 1, - objv, - "??-profile profile? ?-failindex var? ?encoding?? data"); + if (EncodingConvertParseOptions( + interp, objc, objv, 1, &encoding, &data, &flags, &failVarObj) + != TCL_OK) { return TCL_ERROR; } - else if (objc == 2) { - encoding = Tcl_GetEncoding(interp, NULL); - data = objv[1]; - } else { - int argIndex; - for (argIndex = 1; argIndex < (objc-2); ++argIndex) { - if (Tcl_GetIndexFromObj( - interp, objv[argIndex], options, "option", 0, &optIndex) - != TCL_OK) { - return TCL_ERROR; - } - if (++argIndex == (objc - 2)) { - goto numArgsError; - } - switch (optIndex) { - case PROFILE: - if (TclEncodingProfileParseName( - interp, objv[argIndex], &profile) - != TCL_OK) { - return TCL_ERROR; - } - switch (profile) { - case TCL_ENCODING_PROFILE_TCL8: - flags = TCL_ENCODING_NOCOMPLAIN; - break; - case TCL_ENCODING_PROFILE_STRICT: - flags = TCL_ENCODING_STRICT; - break; - case TCL_ENCODING_PROFILE_DEFAULT: /* FALLTHRU */ - default: - break; - } - break; - case FAILINDEX: - failVarObj = objv[argIndex]; - break; - } - } - /* Get encoding after opts so no need to free it on option error */ - if (Tcl_GetEncodingFromObj(interp, objv[objc - 2], &encoding) - != TCL_OK) { - return TCL_ERROR; - } - data = objv[objc - 1]; - } /* * Convert the string into a byte array in 'ds' diff --git a/tests/encoding.test b/tests/encoding.test index 813cd84..e4a2acb 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -718,13 +718,13 @@ test encoding-24.20 {Parse with -profile tcl8 but without providing encoding} -b } -result {wrong # args: should be "::tcl::encoding::convertfrom ??-profile profile? ?-failindex var? ?encoding?? 
data"} -returnCodes error test encoding-24.21 {Parse with -profile tcl8 but without providing encoding} -body { string length [encoding convertto -profile tcl8 "\x20"] -} -result {::tcl::encoding::convertto ??-profile profile? ?-failindex var? ?encoding?? data} -returnCodes error +} -result {wrong # args: should be "::tcl::encoding::convertto ??-profile profile? ?-failindex var? ?encoding?? data"} -returnCodes error test encoding-24.22 {Syntax error, two encodings} -body { encoding convertfrom iso8859-1 utf-8 "ZX\uD800" -} -returnCodes 1 -result {::tcl::encoding::convertto ??-profile profile? ?-failindex var? ?encoding?? data} +} -result {bad option "iso8859-1": must be -profile or -failindex} -returnCodes error test encoding-24.23 {Syntax error, two encodings} -body { encoding convertto iso8859-1 utf-8 "ZX\uD800" -} -returnCodes 1 -result {::tcl::encoding::convertto ??-profile profile? ?-failindex var? ?encoding?? data} +} -result {bad option "iso8859-1": must be -profile or -failindex} -returnCodes error test encoding-24.24 {Parse invalid utf-8 with -profile strict} -body { encoding convertfrom -profile strict utf-8 "\xC0\x80\x00\x00" } -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xC0'} -- cgit v0.12 From e31133e3b0149b9bc29c9c6f06e76ccc6994df7e Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 2 Feb 2023 15:37:21 +0000 Subject: Change encoding error options to fconfigure to encoding profiles --- generic/tclCmdAH.c | 2 +- generic/tclEncoding.c | 23 +++++++++++------ generic/tclIO.c | 69 ++++++++++++++++----------------------------------- generic/tclInt.h | 2 +- 4 files changed, 39 insertions(+), 57 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 67f76a6..9165fda 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -611,7 +611,7 @@ numArgsError: /* ONLY jump here if nothing needs to be freed!!! 
*/ switch (optIndex) { case PROFILE: if (TclEncodingProfileParseName( - interp, objv[argIndex], &profile) + interp, Tcl_GetString(objv[argIndex]), &profile) != TCL_OK) { return TCL_ERROR; } diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index bdd091f..55ace3c 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -4102,20 +4102,29 @@ InitializeEncodingSearchPath( int TclEncodingProfileParseName( Tcl_Interp *interp, /* For error messages. May be NULL */ - Tcl_Obj *profileName, /* Name of profile */ + const char *profileName, /* Name of profile */ enum TclEncodingProfile *profilePtr) /* Output */ { /* NOTE: Order must match enum TclEncodingProfile !!! */ static const char *const profileNames[] = {"", "tcl8", "strict"}; int idx; - if (Tcl_GetIndexFromObj( - interp, profileName, profileNames, "profile", 0, &idx) - != TCL_OK) { - return TCL_ERROR; + for (idx = 0; idx < sizeof(profileNames) / sizeof(profileNames[0]); ++idx) { + if (!strcmp(profileName, profileNames[idx])) { + *profilePtr = (enum TclEncodingProfile)idx; + return TCL_OK; + } } - *profilePtr = (enum TclEncodingProfile)idx; - return TCL_OK; + if (interp) { + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "bad profile \"%s\". 
Must be \"\", \"tcl8\" or \"strict\".", + profileName)); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILE", profileName, NULL); + } + return TCL_ERROR; } /* diff --git a/generic/tclIO.c b/generic/tclIO.c index fed469c..47740ef 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -7862,7 +7862,7 @@ Tcl_BadChannelOption( { if (interp != NULL) { const char *genericopt = - "blocking buffering buffersize encoding eofchar nocomplainencoding strictencoding translation"; + "blocking buffering buffersize encoding encodingprofile eofchar translation"; const char **argv; int argc, i; Tcl_DString ds; @@ -8060,27 +8060,17 @@ Tcl_GetChannelOption( return TCL_OK; } } - if (len == 0 || HaveOpt(1, "-nocomplainencoding")) { + if (len == 0 || HaveOpt(1, "-encodingprofile")) { if (len == 0) { - Tcl_DStringAppendElement(dsPtr, "-nocomplainencoding"); + Tcl_DStringAppendElement(dsPtr, "-encodingprofile"); } -#ifdef TCL_NO_DEPRECATED - Tcl_DStringAppendElement(dsPtr, - (flags & CHANNEL_ENCODING_NOCOMPLAIN) ? "1" : "0"); -#else - Tcl_DStringAppendElement(dsPtr, - (flags & CHANNEL_ENCODING_STRICT) ? "0" : "1"); -#endif - if (len > 0) { - return TCL_OK; - } - } - if (len == 0 || HaveOpt(1, "-strictencoding")) { - if (len == 0) { - Tcl_DStringAppendElement(dsPtr, "-strictencoding"); + if (flags & CHANNEL_ENCODING_STRICT) { + Tcl_DStringAppendElement(dsPtr, "strict"); + } else if (flags & CHANNEL_ENCODING_NOCOMPLAIN) { + Tcl_DStringAppendElement(dsPtr, "tcl8"); + } else { + Tcl_DStringAppendElement(dsPtr, ""); } - Tcl_DStringAppendElement(dsPtr, - (flags & CHANNEL_ENCODING_STRICT) ? 
"1" : "0"); if (len > 0) { return TCL_OK; } @@ -8341,42 +8331,25 @@ Tcl_SetChannelOption( ResetFlag(statePtr, CHANNEL_EOF|CHANNEL_STICKY_EOF|CHANNEL_BLOCKED); statePtr->inputEncodingFlags &= ~TCL_ENCODING_END; return TCL_OK; - } else if (HaveOpt(1, "-nocomplainencoding")) { - int newMode; - - if (Tcl_GetBoolean(interp, newValue, &newMode) == TCL_ERROR) { + } else if (HaveOpt(1, "-encodingprofile")) { + enum TclEncodingProfile profile; + if (TclEncodingProfileParseName(interp, newValue, &profile) != TCL_OK) { return TCL_ERROR; } - if (newMode) { + switch (profile) { + case TCL_ENCODING_PROFILE_TCL8: ResetFlag(statePtr, CHANNEL_ENCODING_STRICT); SetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); - } else { -#ifdef TCL_NO_DEPRECATED - ResetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); -#else - if (GotFlag(statePtr, CHANNEL_ENCODING_STRICT) != CHANNEL_ENCODING_STRICT) { - if (interp) { - Tcl_SetObjResult(interp, Tcl_NewStringObj( - "bad value for -nocomplainencoding: only true allowed", - TCL_INDEX_NONE)); - } - return TCL_ERROR; - } -#endif - } - ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); - return TCL_OK; - } else if (HaveOpt(1, "-strictencoding")) { - int newMode; - - if (Tcl_GetBoolean(interp, newValue, &newMode) == TCL_ERROR) { - return TCL_ERROR; - } - if (newMode) { + break; + case TCL_ENCODING_PROFILE_STRICT: ResetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); SetFlag(statePtr, CHANNEL_ENCODING_STRICT); - } else { + break; + case TCL_ENCODING_PROFILE_DEFAULT: /* FALLTHRU */ + default: + ResetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); ResetFlag(statePtr, CHANNEL_ENCODING_STRICT); + break; } ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); return TCL_OK; diff --git a/generic/tclInt.h b/generic/tclInt.h index db8ee9f..82728d3 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2896,7 +2896,7 @@ enum TclEncodingProfile { MODULE_SCOPE Tcl_Encoding tclIdentityEncoding; MODULE_SCOPE int TclEncodingProfileParseName(Tcl_Interp 
*interp, - Tcl_Obj *profileName, + const char *profileName, enum TclEncodingProfile *profilePtr); /* -- cgit v0.12 From 100d8ce724b2ed4d9f15a045bc2e48119b53465f Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 2 Feb 2023 16:43:12 +0000 Subject: Update tests to use -encodingprofile --- generic/tclIO.c | 30 +++++++++++++++--------------- tests/chanio.test | 6 +++--- tests/io.test | 44 ++++++++++++++++++++++---------------------- tests/ioCmd.test | 26 ++++++++++++++------------ tests/winConsole.test | 14 +++++++------- tests/zlib.test | 4 ++-- 6 files changed, 63 insertions(+), 61 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index 47740ef..b76234b 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -8017,6 +8017,21 @@ Tcl_GetChannelOption( return TCL_OK; } } + if (len == 0 || HaveOpt(1, "-encodingprofile")) { + if (len == 0) { + Tcl_DStringAppendElement(dsPtr, "-encodingprofile"); + } + if (flags & CHANNEL_ENCODING_STRICT) { + Tcl_DStringAppendElement(dsPtr, "strict"); + } else if (flags & CHANNEL_ENCODING_NOCOMPLAIN) { + Tcl_DStringAppendElement(dsPtr, "tcl8"); + } else { + Tcl_DStringAppendElement(dsPtr, ""); + } + if (len > 0) { + return TCL_OK; + } + } if (len == 0 || HaveOpt(2, "-eofchar")) { if (len == 0) { Tcl_DStringAppendElement(dsPtr, "-eofchar"); @@ -8060,21 +8075,6 @@ Tcl_GetChannelOption( return TCL_OK; } } - if (len == 0 || HaveOpt(1, "-encodingprofile")) { - if (len == 0) { - Tcl_DStringAppendElement(dsPtr, "-encodingprofile"); - } - if (flags & CHANNEL_ENCODING_STRICT) { - Tcl_DStringAppendElement(dsPtr, "strict"); - } else if (flags & CHANNEL_ENCODING_NOCOMPLAIN) { - Tcl_DStringAppendElement(dsPtr, "tcl8"); - } else { - Tcl_DStringAppendElement(dsPtr, ""); - } - if (len > 0) { - return TCL_OK; - } - } if (len == 0 || HaveOpt(1, "-translation")) { if (len == 0) { Tcl_DStringAppendElement(dsPtr, "-translation"); diff --git a/tests/chanio.test b/tests/chanio.test index fb94051..7c9857d 100644 --- a/tests/chanio.test +++ 
b/tests/chanio.test @@ -252,7 +252,7 @@ test chan-io-3.3 {WriteChars: compatibility with WriteBytes: flush on line} -bod test chan-io-3.4 {WriteChars: loop over stage buffer} -body { # stage buffer maps to more than can be queued at once. set f [open $path(test1) w] - chan configure $f -encoding jis0208 -buffersize 16 -nocomplainencoding 1 + chan configure $f -encoding jis0208 -buffersize 16 -profile tcl8 chan puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] chan close $f @@ -265,7 +265,7 @@ test chan-io-3.5 {WriteChars: saved != 0} -body { # be moved to beginning of next channel buffer to preserve requested # buffersize. set f [open $path(test1) w] - chan configure $f -encoding jis0208 -buffersize 17 -nocomplainencoding 1 + chan configure $f -encoding jis0208 -buffersize 17 -profile tcl8 chan puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] chan close $f @@ -298,7 +298,7 @@ test chan-io-3.7 {WriteChars: (bufPtr->nextAdded > bufPtr->length)} -body { # on flush. The truncated bytes are moved to the beginning of the next # channel buffer. set f [open $path(test1) w] - chan configure $f -encoding jis0208 -buffersize 17 -nocomplainencoding 1 + chan configure $f -encoding jis0208 -buffersize 17 -profile tcl8 chan puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] chan close $f diff --git a/tests/io.test b/tests/io.test index 2708906..efc6374 100644 --- a/tests/io.test +++ b/tests/io.test @@ -272,7 +272,7 @@ test io-3.4 {WriteChars: loop over stage buffer} -body { # stage buffer maps to more than can be queued at once. 
set f [open $path(test1) w] - fconfigure $f -encoding jis0208 -buffersize 16 -nocomplainencoding 1 + fconfigure $f -encoding jis0208 -buffersize 16 -encodingprofile tcl8 puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] close $f @@ -286,7 +286,7 @@ test io-3.5 {WriteChars: saved != 0} -body { # requested buffersize. set f [open $path(test1) w] - fconfigure $f -encoding jis0208 -buffersize 17 -nocomplainencoding 1 + fconfigure $f -encoding jis0208 -buffersize 17 -encodingprofile tcl8 puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] close $f @@ -319,7 +319,7 @@ test io-3.7 {WriteChars: (bufPtr->nextAdded > bufPtr->length)} -body { # of the next channel buffer. set f [open $path(test1) w] - fconfigure $f -encoding jis0208 -buffersize 17 -nocomplainencoding 1 + fconfigure $f -encoding jis0208 -buffersize 17 -encodingprofile tcl8 puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] close $f @@ -8964,7 +8964,7 @@ test io-75.1 {multibyte encoding error read results in raw bytes} -setup { puts -nonewline $f A\xC0\x40 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -nocomplainencoding 1 -buffering none + fconfigure $f -encoding utf-8 -encodingprofile tcl8 -buffering none } -body { set d [read $f] binary scan $d H* hd @@ -8974,10 +8974,10 @@ test io-75.1 {multibyte encoding error read results in raw bytes} -setup { removeFile io-75.1 } -result 41c040 -test io-75.2 {unrepresentable character write passes and is replaced by ? (-nocomplainencoding 1)} -setup { +test io-75.2 {unrepresentable character write passes and is replaced by ? 
(-encodingprofile tcl8)} -setup { set fn [makeFile {} io-75.2] set f [open $fn w+] - fconfigure $f -encoding iso8859-1 -nocomplainencoding 1 + fconfigure $f -encoding iso8859-1 -encodingprofile tcl8 } -body { puts -nonewline $f A\u2022 flush $f @@ -8991,14 +8991,14 @@ test io-75.2 {unrepresentable character write passes and is replaced by ? (-noco # Incomplete sequence test. # This error may IMHO only be detected with the close. # But the read already returns the incomplete sequence. -test io-75.3 {incomplete multibyte encoding read is ignored (-nocomplainencoding 1)} -setup { +test io-75.3 {incomplete multibyte encoding read is ignored (-encodingprofile tcl8)} -setup { set fn [makeFile {} io-75.3] set f [open $fn w+] fconfigure $f -encoding binary puts -nonewline $f "A\xC0" flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -nocomplainencoding 1 + fconfigure $f -encoding utf-8 -buffering none -encodingprofile tcl8 } -body { set d [read $f] close $f @@ -9010,7 +9010,7 @@ test io-75.3 {incomplete multibyte encoding read is ignored (-nocomplainencoding # As utf-8 has a special treatment in multi-byte decoding, also test another # one. 
-test io-75.4 {shiftjis encoding error read results in raw bytes (-nocomplainencoding 1)} -setup { +test io-75.4 {shiftjis encoding error read results in raw bytes (-encodingprofile tcl8)} -setup { set fn [makeFile {} io-75.4] set f [open $fn w+] fconfigure $f -encoding binary @@ -9019,7 +9019,7 @@ test io-75.4 {shiftjis encoding error read results in raw bytes (-nocomplainenco puts -nonewline $f A\x81\xFFA flush $f seek $f 0 - fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -nocomplainencoding 1 + fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -encodingprofile tcl8 } -body { set d [read $f] binary scan $d H* hd @@ -9029,14 +9029,14 @@ test io-75.4 {shiftjis encoding error read results in raw bytes (-nocomplainenco removeFile io-75.4 } -result 4181ff41 -test io-75.5 {invalid utf-8 encoding read is ignored (-nocomplainencoding 1)} -setup { +test io-75.5 {invalid utf-8 encoding read is ignored (-encodingprofile tcl8)} -setup { set fn [makeFile {} io-75.5] set f [open $fn w+] fconfigure $f -encoding binary puts -nonewline $f A\x81 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -nocomplainencoding 1 + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -encodingprofile tcl8 } -body { set d [read $f] close $f @@ -9046,7 +9046,7 @@ test io-75.5 {invalid utf-8 encoding read is ignored (-nocomplainencoding 1)} -s removeFile io-75.5 } -result 4181 -test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { +test io-75.6 {invalid utf-8 encoding read is not ignored (-encodingprofile strict)} -setup { set fn [makeFile {} io-75.6] set f [open $fn w+] fconfigure $f -encoding binary @@ -9054,7 +9054,7 @@ test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -s puts -nonewline $f A\x81 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 + 
fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -encodingprofile strict } -body { set d [read $f] binary scan $d H* hd @@ -9065,7 +9065,7 @@ test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -s removeFile io-75.6 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} -test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { +test io-75.7 {invalid utf-8 encoding eof handling (-encodingprofile strict)} -setup { set fn [makeFile {} io-75.7] set f [open $fn w+] fconfigure $f -encoding binary @@ -9073,7 +9073,7 @@ test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { puts -nonewline $f A\xA1\x1A flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 + fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -encodingprofile strict } -body { set d [read $f] binary scan $d H* hd @@ -9088,7 +9088,7 @@ test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { removeFile io-75.7 } -match glob -result {41 0 1 {error reading "*": illegal byte sequence} ¡} -test io-75.8 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { +test io-75.8 {invalid utf-8 encoding eof handling (-encodingprofile strict)} -setup { set fn [makeFile {} io-75.8] set f [open $fn w+] fconfigure $f -encoding binary @@ -9096,7 +9096,7 @@ test io-75.8 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { puts -nonewline $f A\x1A\x81 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 + fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -encodingprofile strict } -body { set d [read $f] binary scan $d H* hd @@ -9111,7 +9111,7 @@ test io-75.8 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { test io-75.9 {unrepresentable character write passes and is replaced by ?} 
-setup { set fn [makeFile {} io-75.9] set f [open $fn w+] - fconfigure $f -encoding iso8859-1 -strictencoding 1 + fconfigure $f -encoding iso8859-1 -encodingprofile strict } -body { catch {puts -nonewline $f "A\u2022"} msg flush $f @@ -9155,7 +9155,7 @@ test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { puts -nonewline $f A\x81\xFFA flush $f seek $f 0 - fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -strictencoding 1 + fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -encodingprofile strict } -body { set d [read $f] binary scan $d H* hd @@ -9182,7 +9182,7 @@ test io-75.12 {invalid utf-8 encoding read is ignored} -setup { } -cleanup { removeFile io-75.12 } -result 4181 -test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { +test io-75.13 {invalid utf-8 encoding read is not ignored (-encodingprofile strict)} -setup { set fn [makeFile {} io-75.13] set f [open $fn w+] fconfigure $f -encoding binary @@ -9190,7 +9190,7 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} - puts -nonewline $f "A\x81" flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -encodingprofile strict } -body { set d [read $f] binary scan $d H* hd diff --git a/tests/ioCmd.test b/tests/ioCmd.test index 1a72f70..8c9d870 100644 --- a/tests/ioCmd.test +++ b/tests/ioCmd.test @@ -207,7 +207,7 @@ test iocmd-7.5 {close command} -setup { proc expectedOpts {got extra} { set basicOpts { - -blocking -buffering -buffersize -encoding -eofchar -nocomplainencoding -strictencoding -translation + -blocking -buffering -buffersize -encoding -encodingprofile -eofchar -translation } set opts [list {*}$basicOpts {*}$extra] lset opts end [string cat "or " [lindex $opts end]] @@ -240,33 +240,33 @@ test iocmd-8.7 {fconfigure command} -setup { file 
delete $path(test1) } -body { set f1 [open $path(test1) w] - fconfigure $f1 -translation lf -eofchar {} -encoding utf-16 -nocomplainencoding 1 + fconfigure $f1 -translation lf -eofchar {} -encoding utf-16 -encodingprofile tcl8 fconfigure $f1 } -cleanup { catch {close $f1} -} -result {-blocking 1 -buffering full -buffersize 4096 -encoding utf-16 -eofchar {} -nocomplainencoding 1 -strictencoding 0 -translation lf} +} -result {-blocking 1 -buffering full -buffersize 4096 -encoding utf-16 -encodingprofile tcl8 -eofchar {} -translation lf} test iocmd-8.8 {fconfigure command} -setup { file delete $path(test1) set x {} } -body { set f1 [open $path(test1) w] fconfigure $f1 -translation lf -buffering line -buffersize 3030 \ - -eofchar {} -encoding utf-16 -nocomplainencoding 1 + -eofchar {} -encoding utf-16 -encodingprofile tcl8 lappend x [fconfigure $f1 -buffering] lappend x [fconfigure $f1] } -cleanup { catch {close $f1} -} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding utf-16 -eofchar {} -nocomplainencoding 1 -strictencoding 0 -translation lf}} +} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding utf-16 -encodingprofile tcl8 -eofchar {} -translation lf}} test iocmd-8.9 {fconfigure command} -setup { file delete $path(test1) } -body { set f1 [open $path(test1) w] fconfigure $f1 -translation binary -buffering none -buffersize 4040 \ - -eofchar {} -encoding binary -nocomplainencoding 1 + -eofchar {} -encoding binary -encodingprofile tcl8 fconfigure $f1 } -cleanup { catch {close $f1} -} -result {-blocking 1 -buffering none -buffersize 4040 -encoding binary -eofchar {} -nocomplainencoding 1 -strictencoding 0 -translation lf} +} -result {-blocking 1 -buffering none -buffersize 4040 -encoding binary -encodingprofile tcl8 -eofchar {} -translation lf} test iocmd-8.10 {fconfigure command} -returnCodes error -body { fconfigure a b } -result {can not find channel named "a"} @@ -369,7 +369,7 @@ test iocmd-8.20 {fconfigure command / win console 
channel} -constraints {nonPort # TODO: Test parsing of serial channel options (nonPortable, since requires an # open channel to work with). test iocmd-8.21 {fconfigure command / -nocomplainencoding 0 error} -constraints { - deprecated + deprecated obsolete } -setup { # I don't know how else to open the console, but this is non-portable set console stdin @@ -378,7 +378,9 @@ test iocmd-8.21 {fconfigure command / -nocomplainencoding 0 error} -constraints } -returnCodes error -result "bad value for -nocomplainencoding: only true allowed" test iocmd-8.22 {fconfigure command / -nocomplainencoding 0, no error if -strictencoding already defined} -setup { set console stdin - set oldmode [fconfigure $console -strictencoding] + set oldprofile [fconfigure $console -encodingprofile] +} -constraints { + obsolete } -body { fconfigure $console -strictencoding 1 fconfigure $console -nocomplainencoding 0 @@ -1381,7 +1383,7 @@ test iocmd-25.1 {chan configure, cgetall, standard options} -match glob -body { close $c rename foo {} set res -} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding * -eofchar {{} {}} -nocomplainencoding * -strictencoding 0 -translation {auto *}}} +} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding * -encodingprofile * -eofchar {{} {}} -translation {auto *}}} test iocmd-25.2 {chan configure, cgetall, no options} -match glob -body { set res {} proc foo {args} {oninit cget cgetall; onfinal; track; return ""} @@ -1390,7 +1392,7 @@ test iocmd-25.2 {chan configure, cgetall, no options} -match glob -body { close $c rename foo {} set res -} -result {{cgetall rc*} {-blocking 1 -buffering full -buffersize 4096 -encoding * -eofchar {{} {}} -nocomplainencoding * -strictencoding 0 -translation {auto *}}} +} -result {{cgetall rc*} {-blocking 1 -buffering full -buffersize 4096 -encoding * -encodingprofile * -eofchar {{} {}} -translation {auto *}}} test iocmd-25.3 {chan configure, cgetall, regular result} -match glob -body { set res {} proc foo 
{args} { @@ -1402,7 +1404,7 @@ test iocmd-25.3 {chan configure, cgetall, regular result} -match glob -body { close $c rename foo {} set res -} -result {{cgetall rc*} {-blocking 1 -buffering full -buffersize 4096 -encoding * -eofchar {{} {}} -nocomplainencoding * -strictencoding 0 -translation {auto *} -bar foo -snarf x}} +} -result {{cgetall rc*} {-blocking 1 -buffering full -buffersize 4096 -encoding * -encodingprofile * -eofchar {{} {}} -translation {auto *} -bar foo -snarf x}} test iocmd-25.4 {chan configure, cgetall, bad result, list of uneven length} -match glob -body { set res {} proc foo {args} { diff --git a/tests/winConsole.test b/tests/winConsole.test index b04f3e9..62dfbf3 100644 --- a/tests/winConsole.test +++ b/tests/winConsole.test @@ -198,7 +198,7 @@ test console-fconfigure-get-1.0 { Console get stdin configuration } -constraints {win interactive} -body { lsort [dict keys [fconfigure stdin]] -} -result {-blocking -buffering -buffersize -encoding -eofchar -inputmode -translation} +} -result {-blocking -buffering -buffersize -encoding -encodingprofile -eofchar -inputmode -translation} set testnum 0 foreach {opt result} { @@ -224,7 +224,7 @@ test console-fconfigure-get-1.[incr testnum] { fconfigure -winsize } -constraints {win interactive} -body { fconfigure stdin -winsize -} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -eofchar, -nocomplainencoding, -strictencoding, -translation, or -inputmode} -returnCodes error +} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, -translation, or -inputmode} -returnCodes error ## fconfigure get stdout/stderr foreach chan {stdout stderr} major {2 3} { @@ -232,7 +232,7 @@ foreach chan {stdout stderr} major {2 3} { win interactive } -body { lsort [dict keys [fconfigure $chan]] - } -result {-blocking -buffering -buffersize -encoding -eofchar -translation -winsize} + } -result {-blocking -buffering 
-buffersize -encoding -encodingprofile -eofchar -translation -winsize} set testnum 0 foreach {opt result} { -blocking 1 @@ -260,7 +260,7 @@ foreach chan {stdout stderr} major {2 3} { fconfigure -inputmode } -constraints {win interactive} -body { fconfigure $chan -inputmode - } -result {bad option "-inputmode": should be one of -blocking, -buffering, -buffersize, -encoding, -eofchar, -nocomplainencoding, -strictencoding, -translation, or -winsize} -returnCodes error + } -result {bad option "-inputmode": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, -translation, or -winsize} -returnCodes error } @@ -330,7 +330,7 @@ test console-fconfigure-set-1.3 { fconfigure stdin -winsize } -constraints {win interactive} -body { fconfigure stdin -winsize {10 30} -} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -eofchar, -nocomplainencoding, -strictencoding, -translation, or -inputmode} -returnCodes error +} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, -translation, or -inputmode} -returnCodes error ## fconfigure set stdout,stderr @@ -338,13 +338,13 @@ test console-fconfigure-set-2.0 { fconfigure stdout -winsize } -constraints {win interactive} -body { fconfigure stdout -winsize {10 30} -} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -eofchar, -nocomplainencoding, -strictencoding, or -translation} -returnCodes error +} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, or -translation} -returnCodes error test console-fconfigure-set-3.0 { fconfigure stderr -winsize } -constraints {win interactive} -body { fconfigure stderr -winsize {10 30} -} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -eofchar, -nocomplainencoding, -strictencoding, or 
-translation} -returnCodes error +} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, or -translation} -returnCodes error # Multiple threads diff --git a/tests/zlib.test b/tests/zlib.test index ebbdd50..272a663 100644 --- a/tests/zlib.test +++ b/tests/zlib.test @@ -292,7 +292,7 @@ test zlib-8.6 {transformation and fconfigure} -setup { } -cleanup { catch {close $fd} removeFile $file -} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -nocomplainencoding 1 -strictencoding 0 -translation lf} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -nocomplainencoding 1 -strictencoding 0 -translation lf -checksum 1 -dictionary {}} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -nocomplainencoding 1 -strictencoding 0 -translation lf}} +} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf -checksum 1 -dictionary {}} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf}} test zlib-8.7 {transformation and fconfigure} -setup { set file [makeFile {} test.gz] set fd [open $file wb] @@ -302,7 +302,7 @@ test zlib-8.7 {transformation and fconfigure} -setup { } -cleanup { catch {close $fd} removeFile $file -} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -nocomplainencoding 1 -strictencoding 0 -translation lf} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -nocomplainencoding 1 -strictencoding 0 -translation lf -checksum 0} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -nocomplainencoding 1 -strictencoding 0 -translation lf}} +} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding binary 
-encodingprofile {} -eofchar {} -translation lf} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf -checksum 0} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf}} # Input is headers from fetching SPDY draft # Dictionary is that which is proposed _in_ SPDY draft set spdyHeaders "HTTP/1.0 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nX-Robots-Tag: noarchive\r\nLast-Modified: Tue, 05 Jun 2012 02:43:25 GMT\r\nETag: \"1338864205129|#public|0|en|||0\"\r\nExpires: Tue, 05 Jun 2012 16:17:11 GMT\r\nDate: Tue, 05 Jun 2012 16:17:06 GMT\r\nCache-Control: public, max-age=5\r\nX-Content-Type-Options: nosniff\r\nX-XSS-Protection: 1; mode=block\r\nServer: GSE\r\n" -- cgit v0.12 From 52fc9a970c0239d9f74fd6313920572315e757a7 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Thu, 2 Feb 2023 22:51:26 +0000 Subject: Fix for [b8f575aa2398b0e4] and [154ed7ce564a7b4c], double-[read]/[gets] problem. Partial-read functionality commented out. 
--- generic/tclIOCmd.c | 6 +- tests/io.test | 450 ++++++++++++++++++++++++++++++++++------------------- 2 files changed, 297 insertions(+), 159 deletions(-) diff --git a/generic/tclIOCmd.c b/generic/tclIOCmd.c index 2eeb04c..5b47b08 100644 --- a/generic/tclIOCmd.c +++ b/generic/tclIOCmd.c @@ -331,14 +331,16 @@ Tcl_GetsObjCmd( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } + /* resultDictPtr = Tcl_NewDictObj(); Tcl_DictObjPut(NULL, resultDictPtr, Tcl_NewStringObj("read", -1) , linePtr); returnOptsPtr = Tcl_NewDictObj(); Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) , resultDictPtr); - code = TCL_ERROR; Tcl_SetReturnOptions(interp, returnOptsPtr); + */ + code = TCL_ERROR; goto done; } lineLen = TCL_INDEX_NONE; @@ -476,6 +478,7 @@ Tcl_ReadObjCmd( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } + /* resultDictPtr = Tcl_NewDictObj(); Tcl_DictObjPut(NULL, resultDictPtr, Tcl_NewStringObj("read", -1) , resultPtr); @@ -485,6 +488,7 @@ Tcl_ReadObjCmd( TclChannelRelease(chan); Tcl_DecrRefCount(resultPtr); Tcl_SetReturnOptions(interp, returnOptsPtr); + */ return TCL_ERROR; } diff --git a/tests/io.test b/tests/io.test index 3f00561..5bf5f10 100644 --- a/tests/io.test +++ b/tests/io.test @@ -1560,19 +1560,29 @@ apply [list {} { set f [open $path(test1)] fconfigure $f -encoding utf-8 @strict@ -buffersize 10 set status [catch {read $f} cres copts] - set in [dict get $copts -result] - lappend res $in + #set in [dict get $copts -result] + #lappend res $in lappend res $status $cres set status [catch {read $f} cres copts] - set in [dict get $copts -result] - lappend res $in + #set in [dict get $copts -result] + #lappend res $in lappend res $status $cres set res } -cleanup { catch {close $f} - } -match glob -result {{read aaaaaaaaa} 1\ + } -match glob\ + } + + #append template {\ + # -result {{read aaaaaaaaa} 1\ + # {error reading "*": illegal byte sequence}\ + # {read {}} 1 {error reading "*": 
illegal byte sequence}} + #} + + append template {\ + -result {1\ {error reading "*": illegal byte sequence}\ - {read {}} 1 {error reading "*": illegal byte sequence}} + 1 {error reading "*": illegal byte sequence}} } # strict encoding may be the default in Tcl 9, but in 8 it is not @@ -9070,48 +9080,83 @@ test io-75.5 {invalid utf-8 encoding read is ignored (-nocomplainencoding 1)} -s removeFile io-75.5 } -result 4181 -test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { - set fn [makeFile {} io-75.6] - set f [open $fn w+] - fconfigure $f -encoding binary - # \x81 is invalid in utf-8 - puts -nonewline $f A\x81 - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 -} -body { - set status [catch {read $f} cres copts] - set d [dict get $copts -result read] - binary scan $d H* hd - lappend hd $status $cres -} -cleanup { - close $f - removeFile io-75.6 -} -match glob -result {41 1 {error reading "*": illegal byte sequence}} -test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { - set fn [makeFile {} io-75.7] - set f [open $fn w+] - fconfigure $f -encoding binary - # \xA1 is invalid in utf-8. -eofchar is not detected, because it comes later. 
- puts -nonewline $f A\xA1\x1A - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 -} -body { - set status [catch {read $f} cres copts] - set d [dict get $copts -result read] - binary scan $d H* hd - lappend hd [eof $f] - lappend hd $status - lappend hd $cres - fconfigure $f -encoding iso8859-1 - lappend hd [read $f];# We changed encoding, so now we can read the \xA1 - close $f - set hd -} -cleanup { - removeFile io-75.7 -} -match glob -result {41 0 1 {error reading "*": illegal byte sequence} ¡} +apply [list {} { + + + set test { + test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { + set hd {} + set fn [makeFile {} io-75.6] + set f [open $fn w+] + fconfigure $f -encoding binary + # \x81 is invalid in utf-8 + puts -nonewline $f A\x81 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 + } -body { + set status [catch {read $f} cres copts] + #set d [dict get $copts -result read] + #binary scan $d H* hd + lappend hd $status $cres + } -cleanup { + close $f + removeFile io-75.6 + } -match glob\ + } + + #append test {\ + # -result {41 1 {error reading "*": illegal byte sequence}} + #} + + append test {\ + -result {1 {error reading "*": illegal byte sequence}} + } + + uplevel 1 $test + + set test { + test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { + set hd {} + set fn [makeFile {} io-75.7] + set f [open $fn w+] + fconfigure $f -encoding binary + # \xA1 is invalid in utf-8. -eofchar is not detected, because it comes later. 
+ puts -nonewline $f A\xA1\x1A + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 + } -body { + set status [catch {read $f} cres copts] + #set d [dict get $copts -result read] + #binary scan $d H* hd + lappend hd [eof $f] + lappend hd $status + lappend hd $cres + fconfigure $f -encoding iso8859-1 + lappend hd [read $f];# We changed encoding, so now we can read the \xA1 + close $f + set hd + } -cleanup { + removeFile io-75.7 + } -match glob\ + } + + #append test {\ + # -result {41 0 1 {error reading "*": illegal byte sequence} ¡} + #} + + append test {\ + -result {0 1 {error reading "*": illegal byte sequence} ¡} + } + + uplevel 1 $test + + +} [namespace current]] + + test io-75.8.incomplete { incomplete uft-8 char after eof char is not an error (-strictencoding 1) @@ -9198,76 +9243,124 @@ test io-75.10 {incomplete multibyte encoding read is ignored} -setup { } -result 41c0 -test io-75.10_strict {incomplete multibyte encoding read is an error} -setup { - set res {} - set fn [makeFile {} io-75.10] - set f [open $fn w+] - fconfigure $f -encoding binary - puts -nonewline $f A\xC0 - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -strictencoding 1 -buffering none -} -body { - set status [catch {read $f} cres copts] - set d [dict get $copts -result read] - binary scan $d H* hd - lappend res $hd $cres - chan configure $f -encoding iso8859-1 - set d [read $f] - binary scan $d H* hd - lappend res $hd - close $f - return $res -} -cleanup { - removeFile io-75.10 -} -match glob -result {41 {error reading "*": illegal byte sequence} c0} +apply [list {} { + set test { + test io-75.10_strict {incomplete multibyte encoding read is an error} -setup { + set res {} + set fn [makeFile {} io-75.10] + set f [open $fn w+] + fconfigure $f -encoding binary + puts -nonewline $f A\xC0 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -strictencoding 1 -buffering none + } -body { + set status [catch {read $f} cres 
copts] + + #set d [dict get $copts -result read] + #binary scan $d H* hd + #lappend res $hd $cres + lappend res $cres + + chan configure $f -encoding iso8859-1 + + set d [read $f] + binary scan $d H* hd + lappend res $hd + close $f + return $res + } -cleanup { + removeFile io-75.10 + } -match glob\ + } + + #append test {\ + # -result {41 {error reading "*": illegal byte sequence} c0} + #} + + append test {\ + -result {{error reading "*": illegal byte sequence} c0} + } + + uplevel 1 $test + + + + set test { + # As utf-8 has a special treatment in multi-byte decoding, also test another + # one. + test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { + set hd {} + set fn [makeFile {} io-75.11] + set f [open $fn w+] + fconfigure $f -encoding binary + # In shiftjis, \x81 starts a two-byte sequence. + # But 2nd byte \xFF is not allowed + puts -nonewline $f A\x81\xFFA + flush $f + seek $f 0 + fconfigure $f -encoding shiftjis -buffering none -eofchar "" \ + -translation lf -strictencoding 1 + } -body { + set status [catch {read $f} cres copts] + #set d [dict get $copts -result read] + #binary scan $d H* hd + lappend hd $status + lappend hd $cres + } -cleanup { + close $f + removeFile io-75.11 + } -match glob\ + } -# As utf-8 has a special treatment in multi-byte decoding, also test another -# one. -test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { - set fn [makeFile {} io-75.11] - set f [open $fn w+] - fconfigure $f -encoding binary - # In shiftjis, \x81 starts a two-byte sequence. 
- # But 2nd byte \xFF is not allowed - puts -nonewline $f A\x81\xFFA - flush $f - seek $f 0 - fconfigure $f -encoding shiftjis -buffering none -eofchar "" \ - -translation lf -strictencoding 1 -} -body { - set status [catch {read $f} cres copts] - set d [dict get $copts -result read] - binary scan $d H* hd - lappend hd $status - lappend hd $cres -} -cleanup { - close $f - removeFile io-75.11 -} -match glob -result {41 1 {error reading "*": illegal byte sequence}} + #append test {\ + # -result {41 1 {error reading "*": illegal byte sequence}} + #} + append test {\ + -result {1 {error reading "*": illegal byte sequence}} + } -test io-75.12 {invalid utf-8 encoding read is an error} -setup { - set res {} - set fn [makeFile {} io-75.12] - set f [open $fn w+] - fconfigure $f -encoding binary - puts -nonewline $f A\x81 - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar {} -translation lf \ + uplevel 1 $test + set test { + test io-75.12 {invalid utf-8 encoding read is an error} -setup { + set hd {} + set res {} + set fn [makeFile {} io-75.12] + set f [open $fn w+] + fconfigure $f -encoding binary + puts -nonewline $f A\x81 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar {} -translation lf \ -strictencoding 1 -} -body { - set status [catch {read $f} cres copts] - set d [dict get $copts -result read] - close $f - binary scan $d H* hd - lappend res $hd $status $cres - return $res -} -cleanup { - removeFile io-75.12 -} -match glob -result {41 1 {error reading "*": illegal byte sequence}} + } -body { + set status [catch {read $f} cres copts] + #set d [dict get $copts -result read] + #binary scan $d H* hd + #lappend res $hd + lappend res $status $cres + return $res + } -cleanup { + catch {close $f} + removeFile io-75.12 + } -match glob\ + } + + #append test {\ + # -result {41 1 {error reading "*": illegal byte sequence}} + #} + + + append test {\ + -result {1 {error reading "*": illegal byte sequence}} + } + + uplevel 1 $test +} 
[namespace current]] + + test io-75.12_ignore {invalid utf-8 encoding read is ignored} -setup { set fn [makeFile {} io-75.12] set f [open $fn w+] @@ -9285,25 +9378,49 @@ test io-75.12_ignore {invalid utf-8 encoding read is ignored} -setup { } -cleanup { removeFile io-75.12 } -result 4181 -test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { - set fn [makeFile {} io-75.13] - set f [open $fn w+] - fconfigure $f -encoding binary - # \x81 is invalid in utf-8 - puts -nonewline $f "A\x81" - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 -} -body { - set status [catch {read $f} cres copts] - set d [dict get $copts -result read] - binary scan $d H* hd - lappend hd $status - close $f - lappend hd $cres -} -cleanup { - removeFile io-75.13 -} -match glob -result {41 1 {error reading "*": illegal byte sequence}} + + +apply [list {} { + + set test { + test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { + set hd {} + set fn [makeFile {} io-75.13] + set f [open $fn w+] + fconfigure $f -encoding binary + # \x81 is invalid in utf-8 + puts -nonewline $f A\x81 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar "" \ + -translation lf -strictencoding 1 + } -body { + set status [catch {read $f} cres copts] + #set d [dict get $copts -result read] + #binary scan $d H* hd + lappend hd $status + lappend hd $cres + } -cleanup { + catch {close $f} + removeFile io-75.13 + } -match glob\ + } + + #append test {\ + # -result {41 1 {error reading "*": illegal byte sequence}} + #} + + append test {\ + -result {1 {error reading "*": illegal byte sequence}} + } + + uplevel 1 $test + + set test { + } + +} [namespace current]] + test io-75.14 {invalid utf-8 encoding [gets] coninues in non-strict mode after error} -setup { set res {} @@ -9329,34 +9446,51 @@ test io-75.14 {invalid utf-8 encoding [gets] coninues in non-strict mode after e } 
-match glob -result {a 1 {error reading "*": illegal byte sequence} bÀ c} -test io-75.15 {invalid utf-8 encoding strict gets should not hang} -setup { - set res {} - set fn [makeFile {} io-75.15] - set chan [open $fn w+] - fconfigure $chan -encoding binary - # This is not valid UTF-8 - puts $chan hello\nAB\xc0\x40CD\nEFG - close $chan -} -body { - #Now try to read it with [gets] - set chan [open $fn] - fconfigure $chan -encoding utf-8 -strictencoding 1 - lappend res [gets $chan] - set status [catch {gets $chan} cres copts] - lappend res $status $cres - set status [catch {gets $chan} cres copts] - lappend res $status $cres - lappend res [dict get $copts -result] - chan configur $chan -encoding binary - foreach char [split [read $chan 2] {}] { - lappend res [format %x [scan $char %c]] + +apply [list {} { + set test { + test io-75.15 {invalid utf-8 encoding strict gets should not hang} -setup { + set res {} + set fn [makeFile {} io-75.15] + set chan [open $fn w+] + fconfigure $chan -encoding binary + # This is not valid UTF-8 + puts $chan hello\nAB\xc0\x40CD\nEFG + close $chan + } -body { + #Now try to read it with [gets] + set chan [open $fn] + fconfigure $chan -encoding utf-8 -strictencoding 1 + lappend res [gets $chan] + set status [catch {gets $chan} cres copts] + lappend res $status $cres + set status [catch {gets $chan} cres copts] + lappend res $status $cres + #lappend res [dict get $copts -result] + chan configur $chan -encoding binary + foreach char [split [read $chan 2] {}] { + lappend res [format %x [scan $char %c]] + } + return $res + } -cleanup { + close $chan + removeFile io-75.15 + } -match glob\ } - return $res -} -cleanup { - close $chan - removeFile io-75.15 -} -match glob -result {hello 1 {error reading "*": illegal byte sequence}\ - 1 {error reading "*": illegal byte sequence} {read AB} c0 40} + + #append test {\ + # -result {hello 1 {error reading "*": illegal byte sequence}\ + # 1 {error reading "*": illegal byte sequence} {read AB} c0 40} + #} + 
+ append test {\ + -result {hello 1 {error reading "*": illegal byte sequence}\ + 1 {error reading "*": illegal byte sequence} c0 40} + } + + uplevel 1 $test + +} [namespace current]] test io-76.0 {channel modes} -setup { -- cgit v0.12 From f238eb1dbc93130d15f8b4e7dd32602c1870794a Mon Sep 17 00:00:00 2001 From: pooryorick Date: Sat, 4 Feb 2023 00:28:12 +0000 Subject: Fix test io-75.14. --- tests/io.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/io.test b/tests/io.test index 0f62a4f..75255ca 100644 --- a/tests/io.test +++ b/tests/io.test @@ -9428,7 +9428,7 @@ test io-75.14 { set res {} set fn [makeFile {} io-75.14] set f [open $fn w+] - fconfigure $f -encoding binary + fconfigure $f -translation binary # \xc0 is invalid in utf-8 puts -nonewline $f a\nb\xc0\nc\n flush $f -- cgit v0.12 From 694ae1913191cf93072702e7612b88544f7bea54 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 7 Feb 2023 11:22:08 +0000 Subject: Fix call to EncodingConvertParseOption for decoding --- generic/tclCmdAH.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 9165fda..02a3a46 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -688,7 +688,7 @@ EncodingConvertfromObjCmd( Tcl_Obj *failVarObj; if (EncodingConvertParseOptions( - interp, objc, objv, 1, &encoding, &data, &flags, &failVarObj) + interp, objc, objv, 0, &encoding, &data, &flags, &failVarObj) != TCL_OK) { return TCL_ERROR; } -- cgit v0.12 From e0ee29b9b606d2a3872ddf7f04332ba62433ae32 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 7 Feb 2023 11:23:52 +0000 Subject: Refactor encoding tests for broader coverage and easier test case management --- tests/cmdAH.test | 538 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 343 insertions(+), 195 deletions(-) diff --git a/tests/cmdAH.test b/tests/cmdAH.test index d7a3657..22dc2a4 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -171,239 +171,387 @@ test 
cmdAH-3.2 {Tcl_ContinueObjCmd, success} { list [catch {continue} msg] $msg } {4 {}} -test cmdAH-4.1 {Tcl_EncodingObjCmd} -returnCodes error -body { +### +# encoding command + +set "numargErrors(encoding system)" {^wrong # args: should be "(encoding |::tcl::encoding::)system \?encoding\?"$} +set "numargErrors(encoding convertfrom)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertfrom \?\?-profile profile\? \?-failindex var\? \?encoding\?\? data"$} +set "numargErrors(encoding convertto)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertto \?\?-profile profile\? \?-failindex var\? \?encoding\?\? data"$} +set "numargErrors(encoding names)" {wrong # args: should be "encoding names"} + +set encProfiles {tcl8 strict} + +# TODO - valid sequences for different encodings - shiftjis etc. +# Note utf-16, utf-32 missing because they are automatically +# generated based on le/be versions. +set encValidStrings { + ascii ABC \x41\x42\x43 + utf-8 A\u0000\u03A9\u8A9E\U00010384 \x41\x00\xCE\xA9\xE8\xAA\x9E\xF0\x90\x8E\x84 + utf-16le A\u0000\u03A9\u8A9E\U00010384 \x41\x00\x00\x00\xA9\x03\x9E\x8A\x00\xD8\x84\xDF + utf-16be A\u0000\u03A9\u8A9E\U00010384 \x00\x41\x00\x00\x03\xA9\x8A\x9E\xD8\x00\xDF\x84 + utf-32le A\u0000\u03A9\u8A9E\U00010384 \x41\x00\x00\x00\x00\x00\x00\x00\xA9\x03\x00\x00\x9E\x8A\x00\x00\x84\x03\x01\x00 + utf-32be A\u0000\u03A9\u8A9E\U00010384 \x00\x00\x00\x41\x00\x00\x00\x00\x00\x00\x03\xA9\x00\x00\x8A\x9E\x00\x01\x03\x84 +} + +# Invalid byte sequences {encoding bytes profile prefix failindex tag} +# Note tag is used in test id generation as well. The combination +# should be unique for test ids to be unique. +# Note utf-16, utf-32 missing because they are automatically +# generated based on le/be versions. 
+# TODO - other encodings and test cases +set encInvalidBytes { + ascii \x41\xe9\x42 default A\u00E9B -1 {non-ASCII} + ascii \x41\xe9\x42 tcl8 A\u00E9B -1 {non-ASCII} + ascii \x41\xe9\x42 strict A 1 {non-ASCII} + + utf-8 \x41\xC0\x42 default A\u00C0B -1 C0 + utf-8 \x41\xC0\x42 tcl8 A\u00C0B -1 C0 + utf-8 \x41\xC0\x42 strict A 1 C0 + utf-8 \x41\x80\x42 default A\u0080B -1 80 + utf-8 \x41\x80\x42 tcl8 A\u0080B -1 80 + utf-8 \x41\x80\x42 strict A 1 80 + utf-8 \x41\xC0\x80\x42 default A\u0000B -1 C080 + utf-8 \x41\xC0\x80\x42 tcl8 A\u0000B -1 C080 + utf-8 \x41\xC0\x80\x42 strict A 1 C080 + utf-8 \x41\xC1\x42 default A\u00C1B -1 C1 + utf-8 \x41\xC1\x42 tcl8 A\u00C1B -1 C1 + utf-8 \x41\xC1\x42 strict A 1 C1 + utf-8 \x41\xC2\x42 default A\u00C2B -1 C2-nontrail + utf-8 \x41\xC2\x42 tcl8 A\u00C2B -1 C2-nontrail + utf-8 \x41\xC2\x42 strict A 1 C2-nontrail + utf-8 \x41\xC2 default A\u00C2 -1 C2-incomplete + utf-8 \x41\xC2 tcl8 A\u00C2 -1 C2-incomplete + utf-8 \x41\xC2 strict A 1 C2-incomplete + utf-8 A\xed\xa0\x80B default A\uD800B -1 High-surrogate + utf-8 A\xed\xa0\x80B tcl8 A\uD800B -1 High-surrogate + utf-8 A\xed\xa0\x80B strict A 1 High-surrogate + utf-8 A\xed\xb0\x80B default A\uDC00B -1 Low-surrogate + utf-8 A\xed\xb0\x80B tcl8 A\uDC00B -1 Low-surrogate + utf-8 A\xed\xb0\x80B strict A 1 Low-surrogate + + utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 default A\uD800B -1 {High-surrogate} + utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 tcl8 A\uD800B -1 {High-surrogate} + utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 strict A 4 {High-surrogate} +} + +# Strings that cannot be encoded for specific encoding / profiles +# {encoding string profile bytes failindex tag} +# Note tag is used in test id generation as well. The combination +# should be unique for test ids to be unique. +# Note utf-16, utf-32 missing because they are automatically +# generated based on le/be versions. 
+# TODO - other encodings and test cases +# TODO - out of range code point (note cannot be generated by \U notation) +set encUnencodableStrings { + ascii A\u00e0B default \x41\x3f\x42 -1 non-ASCII + ascii A\u00e0B tcl8 \x41\x3f\x42 -1 non-ASCII + ascii A\u00e0B strict \x41 1 non-ASCII + + iso8859-1 A\u0141B default \x41\x3f\x42 -1 unencodable + iso8859-1 A\u0141B tcl8 \x41\x3f\x42 -1 unencodable + iso8859-1 A\u0141B strict \x41 1 unencodable + + utf-8 A\uD800B default \x41\xed\xa0\x80\x42 -1 High-surrogate + utf-8 A\uD800B tcl8 \x41\xed\xa0\x80\x42 -1 High-surrogate + utf-8 A\uD800B strict \x41 1 High-surrogate + utf-8 A\uDC00B default \x41\xed\xb0\x80\x42 -1 Low-surrogate + utf-8 A\uDC00B tcl8 \x41\xed\xb0\x80\x42 -1 Low-surrogate + utf-8 A\uDC00B strict \x41 1 Low-surrogate +} + +if {$::tcl_platform(byteOrder) eq "littleEndian"} { + set endian le +} else { + set endian be +} + +# +# Check errors for invalid number of arguments +proc badnumargs {id cmd cmdargs} { + variable numargErrors + test $id.a "Syntax error: $cmd $cmdargs" \ + -body [list {*}$cmd {*}$cmdargs] \ + -result $numargErrors($cmd) \ + -match regexp \ + -returnCodes error + test $id.b "Syntax error: $cmd (byte compiled)" \ + -setup [list proc compiled_proc {} [list {*}$cmd {*}$cmdargs]] \ + -body {compiled_proc} \ + -cleanup {rename compiled_proc {}} \ + -result $numargErrors($cmd) \ + -match regexp \ + -returnCodes error +} + +# Wraps tests resulting in unknown encoding errors +proc unknownencodingtest {id cmd} { + set result "unknown encoding \"[lindex $cmd end-1]\"" + test $id.a "Unknown encoding error: $cmd" \ + -body [list encoding {*}$cmd] \ + -result $result \ + -returnCodes error + test $id.b "Unknown encoding error: $cmd (byte compiled)" \ + -setup [list proc encoding_test {} [list encoding {*}$cmd]] \ + -body {encoding_test} \ + -cleanup {rename encoding_test {}} \ + -result $result \ + -returnCodes error +} + +# Wraps tests for conversion, successful or not. 
+# Really more general than just for encoding conversion. +proc testconvert {id body result args} { + test $id.a $body \ + -body $body \ + -result $result \ + {*}$args + dict append args -setup \n[list proc compiled_script {} $body] + dict append args -cleanup "\nrename compiled_script {}" + test $id.b "$body (byte compiled)" \ + -body {compiled_script} \ + -result $result \ + {*}$args +} + +test cmdAH-4.1.1 {encoding} -returnCodes error -body { encoding } -result {wrong # args: should be "encoding subcommand ?arg ...?"} -test cmdAH-4.2 {Tcl_EncodingObjCmd} -returnCodes error -body { +test cmdAH-4.1.2 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding foo } -result {unknown or ambiguous subcommand "foo": must be convertfrom, convertto, dirs, names, or system} -test cmdAH-4.3 {Tcl_EncodingObjCmd} -returnCodes error -body { - encoding convertto -} -result {wrong # args: should be "encoding convertto ?-strict? ?-failindex var? ?encoding? data" or "encoding convertto -nocomplain ?encoding? data"} -test cmdAH-4.4 {Tcl_EncodingObjCmd} -returnCodes error -body { - encoding convertto foo bar -} -result {unknown encoding "foo"} -test cmdAH-4.5 {Tcl_EncodingObjCmd} -setup { - set system [encoding system] -} -body { - encoding system jis0208 - encoding convertto 乎 -} -cleanup { - encoding system $system -} -result 8C -test cmdAH-4.6 {Tcl_EncodingObjCmd} -setup { + +# +# encoding system 4.2.* +badnumargs cmdAH-4.2.1 {encoding system} {ascii ascii} +test cmdAH-4.2.2 {Tcl_EncodingObjCmd} -setup { set system [encoding system] } -body { encoding system iso8859-1 - encoding convertto jis0208 乎 -} -cleanup { - encoding system $system -} -result 8C -test cmdAH-4.7 {Tcl_EncodingObjCmd} -returnCodes error -body { - encoding convertfrom -} -result {wrong # args: should be "encoding convertfrom ?-strict? ?-failindex var? ?encoding? data" or "encoding convertfrom -nocomplain ?encoding? 
data"} -test cmdAH-4.8 {Tcl_EncodingObjCmd} -returnCodes error -body { - encoding convertfrom foo bar -} -result {unknown encoding "foo"} -test cmdAH-4.9 {Tcl_EncodingObjCmd} -setup { - set system [encoding system] -} -body { - encoding system jis0208 - encoding convertfrom 8C + encoding system } -cleanup { encoding system $system -} -result 乎 -test cmdAH-4.10 {Tcl_EncodingObjCmd} -setup { +} -result iso8859-1 + +# +# encoding convertfrom 4.3.* + +# Odd number of args is always invalid since last two args +# are ENCODING DATA and all options take a value +badnumargs cmdAH-4.3.1 {encoding convertfrom} {} +badnumargs cmdAH-4.3.2 {encoding convertfrom} {-failindex VAR ABC} +badnumargs cmdAH-4.3.3 {encoding convertfrom} {-profile VAR ABC} +badnumargs cmdAH-4.3.4 {encoding convertfrom} {-failindex VAR -profile strict ABC} +badnumargs cmdAH-4.3.5 {encoding convertfrom} {-profile strict -failindex VAR ABC} + +# Test that last two args always treated as ENCODING DATA +unknownencodingtest 4.3.6 {convertfrom -failindex ABC} +unknownencodingtest 4.3.7 {convertfrom -profile ABC} +unknownencodingtest 4.3.8 {convertfrom nosuchencoding ABC} +unknownencodingtest 4.3.9 {convertfrom -failindex VAR -profile ABC} +unknownencodingtest 4.3.10 {convertfrom -profile strict -failindex ABC} +testconvert cmdAH-4.3.11 { + encoding convertfrom jis0208 \x38\x43 +} \u4e4e -setup { set system [encoding system] -} -body { encoding system iso8859-1 - encoding convertfrom jis0208 8C } -cleanup { encoding system $system -} -result 乎 -test cmdAH-4.11 {Tcl_EncodingObjCmd} -returnCodes error -body { - encoding names foo -} -result {wrong # args: should be "encoding names"} -test cmdAH-4.12 {Tcl_EncodingObjCmd} -returnCodes error -body { - encoding system foo bar -} -result {wrong # args: should be "encoding system ?encoding?"} -test cmdAH-4.13 {Tcl_EncodingObjCmd} -setup { +} + +# Verify single arg defaults to system encoding +testconvert cmdAH-4.3.12 { + encoding convertfrom \x38\x43 +} \u4e4e -setup { 
set system [encoding system] -} -body { - encoding system iso8859-1 - encoding system + encoding system jis0208 } -cleanup { encoding system $system -} -result iso8859-1 +} -test cmdAH-4.14.1 {Syntax error, -nocomplain and -failindex, no encoding} -body { - encoding convertfrom -nocomplain -failindex 2 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertfrom ?-strict? ?-failindex var? ?encoding? data" or "encoding convertfrom -nocomplain ?encoding? data"} -test cmdAH-4.14.2 {Syntax error, -nocomplain and -failindex, no encoding} -body { - encoding convertto -nocomplain -failindex 2 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertto ?-strict? ?-failindex var? ?encoding? data" or "encoding convertto -nocomplain ?encoding? data"} -test cmdAH-4.15.1 {Syntax error, -failindex and -nocomplain, no encoding} -body { - encoding convertfrom -failindex 2 -nocomplain ABC -} -returnCodes 1 -result {unknown encoding "-nocomplain"} -test cmdAH-4.15.2 {Syntax error, -failindex and -nocomplain, no encoding} -body { - encoding convertto -failindex 2 -nocomplain ABC -} -returnCodes 1 -result {unknown encoding "-nocomplain"} -test cmdAH-4.16.1 {Syntax error, -nocomplain and -failindex, encoding} -body { - encoding convertfrom -nocomplain -failindex 2 utf-8 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertfrom ?-strict? ?-failindex var? ?encoding? data" or "encoding convertfrom -nocomplain ?encoding? data"} -test cmdAH-4.16.2 {Syntax error, -nocomplain and -failindex, encoding} -body { - encoding convertto -nocomplain -failindex 2 utf-8 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertto ?-strict? ?-failindex var? ?encoding? data" or "encoding convertto -nocomplain ?encoding? 
data"} -test cmdAH-4.17.1 {Syntax error, -failindex and -nocomplain, encoding} -body { - encoding convertfrom -failindex 2 -nocomplain utf-8 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertfrom ?-strict? ?-failindex var? ?encoding? data" or "encoding convertfrom -nocomplain ?encoding? data"} -test cmdAH-4.17.2 {Syntax error, -failindex and -nocomplain, encoding} -body { - encoding convertto -failindex 2 -nocomplain utf-8 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertto ?-strict? ?-failindex var? ?encoding? data" or "encoding convertto -nocomplain ?encoding? data"} -test cmdAH-4.18.1 {Syntax error, -failindex with no var, no encoding} -body { - encoding convertfrom -failindex ABC -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-strict? ?-failindex var? ?encoding? data" or "::tcl::encoding::convertfrom -nocomplain ?encoding? data"} -test cmdAH-4.18.2 {Syntax error, -failindex with no var, no encoding (byte compiled)} -setup { - proc encoding_test {} { - encoding convertfrom -failindex ABC +# Wrapper for verifying -failindex +proc testfailindex {id converter enc data result {profile default}} { + if {$profile eq "default"} { + testconvert $id "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + } else { + testconvert $id "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result } -} -body { - # Compile and execute - encoding_test -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-strict? ?-failindex var? ?encoding? data" or "::tcl::encoding::convertfrom -nocomplain ?encoding? data"} -cleanup { - rename encoding_test "" } -test cmdAH-4.18.3 {Syntax error, -failindex with no var, no encoding} -body { - encoding convertto -failindex ABC -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-strict? ?-failindex var? ?encoding? 
data" or "::tcl::encoding::convertto -nocomplain ?encoding? data"} -test cmdAH-4.18.4 {Syntax error, -failindex with no var, no encoding (byte compiled)} -setup { - proc encoding_test {} { - encoding convertto -failindex ABC + +# -failindex - valid data +foreach {enc string bytes} $encValidStrings { + testfailindex 4.3.13.$enc convertfrom $enc $bytes [list $string -1] + if {"utf-16$endian" eq $enc} { + # utf-16le ->utf-16, utf-32be -> utf32 etc. + set enc [string range $enc 0 5] + testfailindex 4.3.13.$enc convertfrom $enc $bytes [list $string -1] } -} -body { - # Compile and execute - encoding_test -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-strict? ?-failindex var? ?encoding? data" or "::tcl::encoding::convertto -nocomplain ?encoding? data"} -cleanup { - rename encoding_test "" } -test cmdAH-4.19.1 {convertrom -failindex with correct data} -body { - encoding convertfrom -failindex test ABC - set test -} -returnCodes 0 -result -1 -test cmdAH-4.19.2 {convertrom -failindex with correct data (byt compiled)} -setup { - proc encoding_test {} { - encoding convertfrom -failindex test ABC - set test + +# -failindex - invalid data +foreach {enc bytes profile prefix failidx tag} $encInvalidBytes { + testfailindex 4.3.14.$enc.$profile.$tag convertfrom $enc $bytes [list $prefix $failidx] $profile + if {"utf-16$endian" eq $enc} { + # utf-16le ->utf-16, utf-32be -> utf32 etc. 
+ set enc [string range $enc 0 5] + testfailindex 4.3.14.$enc.$profile.$tag convertfrom $enc $bytes [list $prefix $failidx] $profile } -} -body { - # Compile and execute - encoding_test -} -returnCodes 0 -result -1 -cleanup { - rename encoding_test "" } -test cmdAH-4.19.3 {convertrom -failindex with correct data} -body { - encoding convertto -failindex test ABC - set test -} -returnCodes 0 -result -1 -test cmdAH-4.19.4 {convertrom -failindex with correct data (byt compiled)} -setup { - proc encoding_test {} { - encoding convertto -failindex test ABC - set test + +# -profile + +# All valid byte sequences should be accepted by all profiles +foreach profile $encProfiles { + set i 0 + foreach {enc string bytes} $encValidStrings { + testconvert 4.3.15.$enc.$profile.[incr i] [list encoding convertfrom -profile $profile $enc $bytes] $string + if {"utf-16$endian" eq $enc} { + # Repeat with the alias: native-endian utf-16le/utf-16be -> utf-16 + set enc [string range $enc 0 5] + testconvert 4.3.15.$enc.$profile.[incr i] [list encoding convertfrom -profile $profile $enc $bytes] $string + } } -} -body { - # Compile and execute - encoding_test -} -returnCodes 0 -result -1 -cleanup { - rename encoding_test "" } -test cmdAH-4.20.1 {convertrom -failindex with incomplete utf8} -body { - set x [encoding convertfrom -failindex i utf-8 A\xc3] - binary scan $x H* y - list $y $i -} -returnCodes 0 -result {41 1} -test cmdAH-4.20.2 {convertrom -failindex with incomplete utf8 (byte compiled)} -setup { - proc encoding_test {} { - set x [encoding convertfrom -failindex i utf-8 A\xc3] - binary scan $x H* y - list $y $i + +# Cycle through the various combinations of encodings and profiles +# for invalid byte sequences +foreach {enc bytes profile prefix failidx tag} $encInvalidBytes { + if {$failidx eq -1} { + set result [list $prefix] + } else { + set badbyte "'\\x[string toupper [binary encode hex [string index $bytes $failidx]]]'" + # TODO - if the bad byte is unprintable, tcltest errors out when printing a mismatch + # so glob it out for now. 
+ set result [list "unexpected byte sequence starting at index $failidx: *" -returnCodes error -match glob] } -} -body { - # Compile and execute - encoding_test -} -returnCodes 0 -result {41 1} -cleanup { - rename encoding_test "" -} -test cmdAH-4.20.3 {convertrom -failindex with incomplete utf8} -body { - set x [encoding convertfrom -strict -failindex i utf-8 A\xc3] - binary scan $x H* y - list $y $i -} -returnCodes 0 -result {41 1} -test cmdAH-4.20.4 {convertrom -failindex with incomplete utf8 (byte compiled)} -setup { - proc encoding_test {} { - set x [encoding convertfrom -strict -failindex i utf-8 A\xc3] - binary scan $x H* y - list $y $i + if {$profile eq "default"} { + testconvert 4.3.15.$enc.$profile.$tag [list encoding convertfrom $enc $bytes] {*}$result + if {"utf-16$endian" eq $enc} { + # utf-16le ->utf-16, utf-32be -> utf32 etc. + set enc [string range $enc 0 5] + testconvert 4.3.15.$enc.$profile.$tag [list encoding convertfrom $enc $bytes] {*}$result + } + } else { + testconvert 4.3.15.$enc.$profile.$tag [list encoding convertfrom -profile $profile $enc $bytes] {*}$result + if {"utf-16$endian" eq $enc} { + # utf-16le ->utf-16, utf-32be -> utf32 etc. 
+ set enc [string range $enc 0 5] + testconvert 4.3.15.$enc.$profile.$tag [list encoding convertfrom -profile $profile $enc $bytes] {*}$result + } } -} -body { - # Compile and execute - encoding_test -} -returnCodes 0 -result {41 1} -cleanup { - rename encoding_test "" } -test cmdAH-4.20.5 {convertrom -failindex with incomplete utf8} -body { - set x [encoding convertfrom -failindex i -strict utf-8 A\xc3] - binary scan $x H* y - list $y $i -} -returnCodes 0 -result {41 1} -test cmdAH-4.20.6 {convertrom -failindex with incomplete utf8 (byte compiled)} -setup { - proc encoding_test {} { - set x [encoding convertfrom -failindex i -strict utf-8 A\xc3] - binary scan $x H* y - list $y $i + +# +# encoding convertto 4.4.* + +badnumargs cmdAH-4.4.1 {encoding convertto} {} +badnumargs cmdAH-4.4.2 {encoding convertto} {-failindex VAR ABC} +badnumargs cmdAH-4.4.3 {encoding convertto} {-profile VAR ABC} +badnumargs cmdAH-4.4.4 {encoding convertto} {-failindex VAR -profile strict ABC} +badnumargs cmdAH-4.4.5 {encoding convertto} {-profile strict -failindex VAR ABC} + +# Test that last two args always treated as ENCODING DATA +unknownencodingtest 4.4.6 {convertto -failindex ABC} +unknownencodingtest 4.4.7 {convertto -profile ABC} +unknownencodingtest 4.4.8 {convertto nosuchencoding ABC} +unknownencodingtest 4.4.9 {convertto -failindex VAR -profile ABC} +unknownencodingtest 4.4.10 {convertto -profile strict -failindex ABC} +testconvert cmdAH-4.4.11 { + encoding convertto jis0208 \u4e4e +} \x38\x43 -setup { + set system [encoding system] + encoding system iso8859-1 +} -cleanup { + encoding system $system +} + +# Verify single arg defaults to system encoding +testconvert cmdAH-4.4.12 { + encoding convertto \u4e4e +} \x38\x43 -setup { + set system [encoding system] + encoding system jis0208 +} -cleanup { + encoding system $system +} + +# -failindex - valid data +foreach {enc string bytes} $encValidStrings { + testfailindex 4.4.13.$enc convertto $enc $string [list $bytes -1] + if 
{"utf-16$endian" eq $enc} { + # utf-16le ->utf-16, utf-32be -> utf32 etc. + set enc [string range $enc 0 5] + testfailindex 4.4.13.$enc convertto $enc $string [list $bytes -1] } -} -body { - # Compile and execute - encoding_test -} -returnCodes 0 -result {41 1} -cleanup { - rename encoding_test "" } -test cmdAH-4.21.1 {convertto -failindex with wrong character} -body { - set x [encoding convertto -failindex i iso8859-1 A\u0141] - binary scan $x H* y - list $y $i -} -returnCodes 0 -result {41 1} -test cmdAH-4.21.2 {convertto -failindex with wrong character (byte compiled)} -setup { - proc encoding_test {} { - set x [encoding convertto -failindex i iso8859-1 A\u0141] - binary scan $x H* y - list $y $i + +# -failindex - invalid data +foreach {enc string profile bytes failidx tag} $encUnencodableStrings { + testfailindex 4.4.14.$enc.$profile.$tag convertto $enc $string [list $bytes $failidx] $profile + if {"utf-16$endian" eq $enc} { + # utf-16le ->utf-16, utf-32be -> utf32 etc. + set enc [string range $enc 0 5] + testfailindex 4.4.14.$enc.$profile.$tag convertto $enc $string [list $bytes $failidx] $profile } -} -body { - # Compile and execute - encoding_test -} -returnCodes 0 -result {41 1} -cleanup { - rename encoding_test "" } -test cmdAH-4.22 {convertfrom -strict} -body { - encoding convertfrom -strict utf-8 A\x00B -} -result A\x00B -test cmdAH-4.23 {convertfrom -strict} -body { - encoding convertfrom -strict utf-8 A\xC0\x80B -} -returnCodes error -result {unexpected byte sequence starting at index 1: '\xC0'} +# -profile -test cmdAH-4.24 {convertto -strict} -body { - encoding convertto -strict utf-8 A\x00B -} -result A\x00B +# All valid byte sequences should be accepted by all profiles +foreach profile $encProfiles { + set i 0 + foreach {enc string bytes} $encValidStrings { + testconvert 4.4.15.$enc.$profile.[incr i] [list encoding convertto $enc $string] $bytes + if {"utf-16$endian" eq $enc} { + # utf-16le ->utf-16, utf-32be -> utf32 etc. 
+ set enc [string range $enc 0 5] + testconvert 4.4.15.$enc.$profile.[incr i] [list encoding convertto $enc $string] $bytes + } + } +} -test cmdAH-4.25 {convertfrom -strict} -constraints knownBug -body { - encoding convertfrom -strict utf-8 A\x80B -} -returnCodes error -result {unexpected byte sequence starting at index 1: '\x80'} +# Cycle through the various combinations of encodings and profiles +# for invalid byte sequences +foreach {enc string profile bytes failidx tag} $encUnencodableStrings { + if {$failidx eq -1} { + set result [list $bytes] + } else { + # TODO - if the bad char is unprintable, tcltest errors out when printing a mismatch + # so glob it out for now. + set result [list "unexpected character at index $failidx: *" -returnCodes error -match glob] + } + if {$profile eq "default"} { + testconvert 4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result + if {"utf-16$endian" eq $enc} { + # utf-16le ->utf-16, utf-32be -> utf32 etc. + set enc [string range $enc 0 5] + testconvert 4.3.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result + } + } else { + testconvert 4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result + if {"utf-16$endian" eq $enc} { + # utf-16le ->utf-16, utf-32be -> utf32 etc. + set enc [string range $enc 0 5] + testconvert 4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result + } + } +} -test cmdAH-4.26 {convertto -strict} -constraints {testbytestring knownBug} -body { - encoding convertto -strict utf-8 A[testbytestring \x80]B +test cmdAH-4.5.1 {convertto -profile strict} -constraints {testbytestring knownBug} -body { + # TODO - what does testbytestring even test? 
Invalid UTF8 in the Tcl_Obj bytes field + encoding convertto -profile strict utf-8 A[testbytestring \x80]B } -returnCodes error -result {unexpected byte sequence starting at index 1: '\x80'} +# +# encoding names 4.5.* +badnumargs cmdAH-4.5.1 {encoding names} {foo} +test cmdAH-4.5.2 {encoding names should include at least utf-8 and iso8859-1 and at least one more} -body { + set names [encoding names] + list [expr {"utf-8" in $names}] [expr {"iso8859-1" in $names}] [expr {[llength $names] > 2}] +} -result {1 1 1} + +# +# file command + test cmdAH-5.1 {Tcl_FileObjCmd} -returnCodes error -body { file } -result {wrong # args: should be "file subcommand ?arg ...?"} -- cgit v0.12 From b741dab392a7e58c23568bd821d7eff982c2ec14 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 7 Feb 2023 11:25:22 +0000 Subject: Fix tcltest to not exit on encoding errors when printing to stdout --- library/tcltest/tcltest.tcl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/library/tcltest/tcltest.tcl b/library/tcltest/tcltest.tcl index 7344f9f..94010a7 100644 --- a/library/tcltest/tcltest.tcl +++ b/library/tcltest/tcltest.tcl @@ -2221,7 +2221,11 @@ proc tcltest::test {name description args} { if {$scriptCompare} { puts [outputChannel] "---- Error testing result: $scriptMatch" } else { - puts [outputChannel] "---- Result was:\n$actualAnswer" + try { + puts [outputChannel] "---- Result was:\n$actualAnswer" + } on error {errMsg errCode} { + puts [outputChannel] "---- Result was:\n" + } puts [outputChannel] "---- Result should have been\ ($match matching):\n$result" } -- cgit v0.12 From 4294befd8b12d341c6fa74ef24120838d931a07a Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 9 Feb 2023 07:27:43 +0000 Subject: Do not have -failindex imply -strict --- generic/tclCmdAH.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 02a3a46..efc156c 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c 
@@ -554,7 +554,6 @@ EncodingConvertParseOptions ( Tcl_Interp *interp, /* For error messages. May be NULL */ int objc, /* Number of arguments */ Tcl_Obj *const objv[], /* Argument objects as passed to command. */ - int isEncoder, /* 1 -> convertto, 0 -> convertfrom */ Tcl_Encoding *encPtr, /* Where to store the encoding */ Tcl_Obj **dataObjPtr, /* Where to store ptr to Tcl_Obj containing data */ int *flagsPtr, /* Bit mask of encoding option flags */ @@ -640,15 +639,6 @@ numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ dataObj = objv[objc - 1]; } - /* -failindex forces checking*/ - if (failVarObj != NULL && flags == TCL_ENCODING_NOCOMPLAIN) { - /* - * Historical, but I really don't like this mixing of defines - * from two different bit mask domains - ENCODING_FAILINDEX - */ - flags = isEncoder ? TCL_ENCODING_STOPONERROR : ENCODING_FAILINDEX; - } - *encPtr = encoding; *dataObjPtr = dataObj; *flagsPtr = flags; @@ -688,7 +678,7 @@ EncodingConvertfromObjCmd( Tcl_Obj *failVarObj; if (EncodingConvertParseOptions( - interp, objc, objv, 0, &encoding, &data, &flags, &failVarObj) + interp, objc, objv, &encoding, &data, &flags, &failVarObj) != TCL_OK) { return TCL_ERROR; } @@ -775,7 +765,7 @@ EncodingConverttoObjCmd( Tcl_Obj *failVarObj; if (EncodingConvertParseOptions( - interp, objc, objv, 1, &encoding, &data, &flags, &failVarObj) + interp, objc, objv, &encoding, &data, &flags, &failVarObj) != TCL_OK) { return TCL_ERROR; } -- cgit v0.12 From d46a2441593da26b460fba5a4612ec43fa0d9215 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 9 Feb 2023 17:03:31 +0000 Subject: Add equivalent tests from ff630bf370 --- tests/cmdAH.test | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/cmdAH.test b/tests/cmdAH.test index ad5e540..c4053a2 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -229,9 +229,21 @@ set encInvalidBytes { utf-8 A\xed\xb0\x80B tcl8 A\uDC00B -1 Low-surrogate utf-8 A\xed\xb0\x80B strict A 1 Low-surrogate 
- utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 default A\uD800B -1 {High-surrogate} - utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 tcl8 A\uD800B -1 {High-surrogate} - utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 strict A 4 {High-surrogate} + utf-32le \x00\xD8\x00\x00 default \uD800 -1 {High-surrogate} + utf-32le \x00\xD8\x00\x00 tcl8 \uD800 -1 {High-surrogate} + utf-32le \x00\xD8\x00\x00 strict "" 0 {High-surrogate} + utf-32le \x00\xDC\x00\x00 default \uDC00 -1 {Low-surrogate} + utf-32le \x00\xDC\x00\x00 tcl8 \uDC00 -1 {Low-surrogate} + utf-32le \x00\xDC\x00\x00 strict "" 0 {Low-surrogate} + utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 default \uD800\uDC00 -1 {High-low-surrogate} + utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 tcl8 \uD800\uDC00 -1 {High-low-surrogate} + utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 strict "" 0 {High-low-surrogate} + utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 default \uDC00\uD800 -1 {High-low-surrogate} + utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 tcl8 \uDC00\uD800 -1 {High-low-surrogate} + utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 strict "" 0 {High-low-surrogate} + utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 default A\uD800B -1 {High-surrogate-middle} + utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 tcl8 A\uD800B -1 {High-surrogate-middle} + utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 strict A 4 {High-surrogate-middle} } # Strings that cannot be encoded for specific encoding / profiles -- cgit v0.12 From 9d1ba01f11c772a015e3edbfb1ea4ae8e9f148bf Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 9 Feb 2023 17:04:33 +0000 Subject: Modify encoding C API to use profiles (in progress) --- generic/tcl.h | 22 +++++++++- generic/tclCmdAH.c | 16 ++----- generic/tclEncoding.c | 118 ++++++++++++++++++++++++++++++++++++++------------ generic/tclIO.c | 6 ++- generic/tclInt.h | 13 +----- 5 files changed, 122 insertions(+), 53 deletions(-) diff --git a/generic/tcl.h 
b/generic/tcl.h index f373382..ec94e71 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2144,7 +2144,27 @@ typedef struct Tcl_EncodingType { #define TCL_ENCODING_CHAR_LIMIT 0x10 #define TCL_ENCODING_MODIFIED 0x20 #define TCL_ENCODING_NOCOMPLAIN 0x40 -#define TCL_ENCODING_STRICT 0x44 +#define TCL_ENCODING_STRICT 0x44 +/* Reserve top byte for profile values (disjoint) */ +#define TCL_ENCODING_PROFILE_TCL8 0x01000000 +#define TCL_ENCODING_PROFILE_STRICT 0x02000000 +#define TCL_ENCODING_PROFILE_MASK 0xFF000000 +#define TCL_ENCODING_PROFILE_GET(flags_) ((flags_) & TCL_ENCODING_PROFILE_MASK) +#define TCL_ENCODING_PROFILE_SET(flags_, profile_) \ + do { \ + (flags_) &= ~TCL_ENCODING_PROFILE_MASK; \ + (flags_) |= profile_; \ + } while (0) +/* Still being argued - For Tcl9, is the default strict? TODO */ +#if TCL_MAJOR_VERSION < 9 +#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 +#else +#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 /* STRICT? TODO */ +#endif + +#define TCL_ENCODING_EXTERNAL_FLAG_MASK \ + (TCL_ENCODING_START|TCL_ENCODING_END|TCL_ENCODING_STOPONERROR) + /* * The following definitions are the error codes returned by the conversion diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index efc156c..05c0887 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -562,7 +562,7 @@ EncodingConvertParseOptions ( { static const char *const options[] = {"-profile", "-failindex", NULL}; enum convertfromOptions { PROFILE, FAILINDEX } optIndex; - enum TclEncodingProfile profile; + int profile; Tcl_Encoding encoding; Tcl_Obj *dataObj; Tcl_Obj *failVarObj; @@ -614,17 +614,9 @@ numArgsError: /* ONLY jump here if nothing needs to be freed!!! 
*/ != TCL_OK) { return TCL_ERROR; } - switch (profile) { - case TCL_ENCODING_PROFILE_TCL8: - flags = TCL_ENCODING_NOCOMPLAIN; - break; - case TCL_ENCODING_PROFILE_STRICT: - flags = TCL_ENCODING_STRICT; - break; - case TCL_ENCODING_PROFILE_DEFAULT: /* FALLTHRU */ - default: - break; - } + /* TODO - next line probably not needed as the conversion + functions already take care of mapping profile to flags */ + flags = TclEncodingExternalFlagsToInternal(profile); break; case FAILINDEX: failVarObj = objv[argIndex]; diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 106a2f1..8e42e26 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -542,6 +542,8 @@ TclInitEncodingSubsystem(void) Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); + /* TODO - why is NOCOMPLAIN being hardcoded for encodings below? */ + /* * Create a few initial encodings. UTF-8 to UTF-8 translation is not a * no-op because it turns a stream of improperly formed UTF-8 into a @@ -1184,13 +1186,12 @@ Tcl_ExternalToUtfDString( * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in utf-8. * Possible flags values: - * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but - * return the first error position (Default in Tcl 9.0). - * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default - * fallback character. Always return -1 (Default in Tcl 8.7). - * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. - * Only valid for "utf-8" and "cesu-8". This flag may be used together - * with the other flags. + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. 
Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags + * - TCL_ENCODING_MODIFIED: enable Tcl internal conversion mapping \xC0\x80 + * to 0x00. Only valid for "utf-8" and "cesu-8". * * Results: * The converted bytes are stored in the DString, which is then NULL @@ -1236,6 +1237,7 @@ Tcl_ExternalToUtfDStringEx( srcLen = encodingPtr->lengthProc(src); } + flags = TclEncodingExternalFlagsToInternal(flags); flags |= TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; @@ -1408,7 +1410,7 @@ Tcl_UtfToExternalDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_UtfToExternalDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + Tcl_UtfToExternalDStringEx(encoding, src, srcLen, TCL_ENCODING_PROFILE_DEFAULT, dstPtr); return Tcl_DStringValue(dstPtr); } @@ -1421,15 +1423,12 @@ Tcl_UtfToExternalDString( * Convert a source buffer from UTF-8 to the specified encoding. * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in the - * target encoding. - * Possible flags values: - * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but - * return the first error position (Default in Tcl 9.0). - * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default - * fallback character. Always return -1 (Default in Tcl 8.7). - * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00. - * Only valid for "utf-8" and "cesu-8". This flag may be used together - * with the other flags. + * target encoding. It should be composed by OR-ing the following: + * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} + * - TCL_ENCODING_STOPONERROR: Backward compatibility. 
Sets the profile + * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags + * - TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 instead + * of 0x00. Only valid for "utf-8" and "cesu-8". * * Results: * The converted bytes are stored in the DString, which is then NULL @@ -1450,7 +1449,7 @@ Tcl_UtfToExternalDStringEx( const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ - int flags, /* Conversion control flags. */ + int flags, /* Conversion control flags. */ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { @@ -1474,6 +1473,7 @@ Tcl_UtfToExternalDStringEx( } else if (srcLen < 0) { srcLen = strlen(src); } + flags = TclEncodingExternalFlagsToInternal(flags); flags |= TCL_ENCODING_START | TCL_ENCODING_END; while (1) { result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, @@ -4095,7 +4095,7 @@ InitializeEncodingSearchPath( * * TclEncodingProfileParseName -- * - * Maps an encoding profile name to its enum value. + * Maps an encoding profile name to its integer equivalent. * * Results: * TCL_OK on success or TCL_ERROR on failure. @@ -4107,17 +4107,22 @@ InitializeEncodingSearchPath( */ int TclEncodingProfileParseName( - Tcl_Interp *interp, /* For error messages. May be NULL */ - const char *profileName, /* Name of profile */ - enum TclEncodingProfile *profilePtr) /* Output */ + Tcl_Interp *interp, /* For error messages. May be NULL */ + const char *profileName, /* Name of profile */ + int *profilePtr) /* Output */ { - /* NOTE: Order must match enum TclEncodingProfile !!! */ - static const char *const profileNames[] = {"", "tcl8", "strict"}; - int idx; + /* NOTE: Order in arrays must match !!! 
*/ + static const char *const profileNames[] = {"", "tcl8", "strict", NULL}; + static int profileFlags[] = { + TCL_ENCODING_PROFILE_DEFAULT, + TCL_ENCODING_PROFILE_TCL8, + TCL_ENCODING_PROFILE_STRICT, + }; + int i; - for (idx = 0; idx < sizeof(profileNames) / sizeof(profileNames[0]); ++idx) { - if (!strcmp(profileName, profileNames[idx])) { - *profilePtr = (enum TclEncodingProfile)idx; + for (i = 0; i < sizeof(profileNames) / sizeof(profileNames[0]); ++i) { + if (!strcmp(profileName, profileNames[i])) { + *profilePtr = profileFlags[i]; return TCL_OK; } } @@ -4134,6 +4139,63 @@ TclEncodingProfileParseName( } /* + *------------------------------------------------------------------------ + * + * TclEncodingExternalFlagsToInternal -- + * + * Maps the flags supported in the encoding C APIs to internal flags. + * + * TCL_ENCODING_STRICT and TCL_ENCODING_NOCOMPLAIN are masked off + * because they are for internal use only and externally specified + * through TCL_ENCODING_PROFILE_* bits. + * + * For backward compatibility reasons, TCL_ENCODING_STOPONERROR is + * mapped to the TCL_ENCODING_PROFILE_STRICT overwriting any profile + * specified. + * + * If no profile or an invalid profile is specified, it is set to + * the default. + * + * Results: + * Internal encoding flag mask. + * + * Side effects: + * None. 
+ * + *------------------------------------------------------------------------ + */ +int TclEncodingExternalFlagsToInternal(int flags) +{ + flags &= ~(TCL_ENCODING_STRICT | TCL_ENCODING_NOCOMPLAIN); + if (flags & TCL_ENCODING_STOPONERROR) { + TCL_ENCODING_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT); + } + else { + int profile = TCL_ENCODING_PROFILE_GET(flags); + switch (profile) { + case TCL_ENCODING_PROFILE_TCL8: + flags |= TCL_ENCODING_NOCOMPLAIN; + break; + case TCL_ENCODING_PROFILE_STRICT: + flags |= TCL_ENCODING_STRICT; + break; + default: + /* TODO - clean this up once default mechanisms settled */ + TCL_ENCODING_PROFILE_SET(flags, TCL_ENCODING_PROFILE_DEFAULT); +#if TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_TCL8 + flags |= TCL_ENCODING_NOCOMPLAIN; +#elif TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT + flags |= TCL_ENCODING_STRICT; +#else +#error TCL_ENCODING_PROFILE_DEFAULT must be TCL8 or STRICT +#endif + break; + } + } + return flags; +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 diff --git a/generic/tclIO.c b/generic/tclIO.c index 370ca95..0152740 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -8379,7 +8379,7 @@ Tcl_SetChannelOption( statePtr->inputEncodingFlags &= ~TCL_ENCODING_END; return TCL_OK; } else if (HaveOpt(1, "-encodingprofile")) { - enum TclEncodingProfile profile; + int profile; if (TclEncodingProfileParseName(interp, newValue, &profile) != TCL_OK) { return TCL_ERROR; } @@ -8392,7 +8392,11 @@ Tcl_SetChannelOption( ResetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); SetFlag(statePtr, CHANNEL_ENCODING_STRICT); break; + /* TODO - clean up this DEFAULT handling once channel flags fixed */ +#if TCL_ENCODING_PROFILE_DEFAULT != TCL_ENCODING_PROFILE_TCL8 \ + && TCL_ENCODING_PROFILE_DEFAULT != TCL_ENCODING_PROFILE_STRICT case TCL_ENCODING_PROFILE_DEFAULT: /* FALLTHRU */ +#endif default: ResetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); ResetFlag(statePtr, CHANNEL_ENCODING_STRICT); diff --git 
a/generic/tclInt.h b/generic/tclInt.h index 82728d3..2b491d6 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2883,21 +2883,12 @@ MODULE_SCOPE TclPlatformType tclPlatform; * Declarations related to internal encoding functions. */ -/* - * Enum for encoding profiles that control encoding treatment of - * invalid bytes. NOTE: Order must match that of encodingProfileNames in - * TclEncodingProfileParseName() !!! - */ -enum TclEncodingProfile { - TCL_ENCODING_PROFILE_DEFAULT, - TCL_ENCODING_PROFILE_TCL8, - TCL_ENCODING_PROFILE_STRICT, -}; MODULE_SCOPE Tcl_Encoding tclIdentityEncoding; MODULE_SCOPE int TclEncodingProfileParseName(Tcl_Interp *interp, const char *profileName, - enum TclEncodingProfile *profilePtr); + int *profilePtr); +MODULE_SCOPE int TclEncodingExternalFlagsToInternal(int flags); /* * TIP #233 (Virtualized Time) -- cgit v0.12 From e26214c28753b22c398ba4d7196a8afae999ab5a Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Fri, 10 Feb 2023 17:07:12 +0000 Subject: Phase out (almost) STRICT and NOCOMPLAIN flags. --- generic/tclCmdAH.c | 38 +++++++++------- generic/tclEncoding.c | 114 +++++++++++++++++++++++++++++++++++------------- generic/tclIO.c | 118 ++++++++++++-------------------------------------- generic/tclIO.h | 3 +- generic/tclInt.h | 8 ++-- 5 files changed, 140 insertions(+), 141 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 05c0887..5fbe27e 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -543,7 +543,7 @@ TclInitEncodingCmd( * if non-NULL * - *dataObjPtr is set to the Tcl_Obj containing the data to encode or * decode - * - *flagsPtr is set to encoding error handling flags + * - *profilePtr is set to encoding error handling profile * - *failVarPtr is set to -failindex option value or NULL * On error, all of the above are uninitialized. * @@ -556,20 +556,19 @@ EncodingConvertParseOptions ( Tcl_Obj *const objv[], /* Argument objects as passed to command. 
*/ Tcl_Encoding *encPtr, /* Where to store the encoding */ Tcl_Obj **dataObjPtr, /* Where to store ptr to Tcl_Obj containing data */ - int *flagsPtr, /* Bit mask of encoding option flags */ + int *profilePtr, /* Bit mask of encoding option profile */ Tcl_Obj **failVarPtr /* Where to store -failindex option value */ ) { static const char *const options[] = {"-profile", "-failindex", NULL}; enum convertfromOptions { PROFILE, FAILINDEX } optIndex; - int profile; Tcl_Encoding encoding; Tcl_Obj *dataObj; Tcl_Obj *failVarObj; #if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) - int flags = TCL_ENCODING_STOPONERROR; + int profile = TCL_ENCODING_PROFILE_TCL8; /* TODO - default for Tcl9? */ #else - int flags = TCL_ENCODING_NOCOMPLAIN; + int profile = TCL_ENCODING_PROFILE_TCL8; #endif /* @@ -609,14 +608,16 @@ numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ } switch (optIndex) { case PROFILE: - if (TclEncodingProfileParseName( + if (TclEncodingProfileNameToId( interp, Tcl_GetString(objv[argIndex]), &profile) != TCL_OK) { return TCL_ERROR; } +#ifdef NOTNEEDED /* TODO - next line probably not needed as the conversion functions already take care of mapping profile to flags */ - flags = TclEncodingExternalFlagsToInternal(profile); + profile = TclEncodingExternalFlagsToInternal(profile); +#endif break; case FAILINDEX: failVarObj = objv[argIndex]; @@ -633,7 +634,7 @@ numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ *encPtr = encoding; *dataObjPtr = dataObj; - *flagsPtr = flags; + *profilePtr = profile; *failVarPtr = failVarObj; return TCL_OK; @@ -676,20 +677,23 @@ EncodingConvertfromObjCmd( } /* - * Convert the string into a byte array in 'ds' + * Convert the string into a byte array in 'ds'. 
*/ #if !defined(TCL_NO_DEPRECATED) && (TCL_MAJOR_VERSION < 9) - if (!(flags & TCL_ENCODING_STOPONERROR)) { + if (TCL_ENCODING_PROFILE_GET(flags) == TCL_ENCODING_PROFILE_TCL8) { + /* Permits high bits to be non-0 in byte array (Tcl 8 style) */ bytesPtr = (char *) Tcl_GetByteArrayFromObj(data, &length); - } else + } + else #endif - bytesPtr = (char *) TclGetBytesFromObj(interp, data, &length); + bytesPtr = (char *) TclGetBytesFromObj(interp, data, &length); + if (bytesPtr == NULL) { return TCL_ERROR; } result = Tcl_ExternalToUtfDStringEx(encoding, bytesPtr, length, flags, &ds); - if ((!(flags & TCL_ENCODING_NOCOMPLAIN) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) && (result != TCL_INDEX_NONE)) { + if (result != TCL_INDEX_NONE) { if (failVarObj != NULL) { if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == NULL) { return TCL_ERROR; @@ -704,7 +708,8 @@ EncodingConvertfromObjCmd( Tcl_DStringFree(&ds); return TCL_ERROR; } - } else if (failVarObj != NULL) { + } + else if (failVarObj != NULL) { if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewIntObj(-1), TCL_LEAVE_ERR_MSG) == NULL) { return TCL_ERROR; } @@ -769,7 +774,7 @@ EncodingConverttoObjCmd( stringPtr = TclGetStringFromObj(data, &length); result = Tcl_UtfToExternalDStringEx(encoding, stringPtr, length, flags, &ds); - if ((!(flags & TCL_ENCODING_NOCOMPLAIN) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) && (result != TCL_INDEX_NONE)) { + if (result != TCL_INDEX_NONE) { if (failVarObj != NULL) { /* I hope, wide int will cover size_t data type */ if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == NULL) { @@ -788,7 +793,8 @@ EncodingConverttoObjCmd( Tcl_DStringFree(&ds); return TCL_ERROR; } - } else if (failVarObj != NULL) { + } + else if (failVarObj != NULL) { if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewIntObj(-1), TCL_LEAVE_ERR_MSG) == NULL) { return TCL_ERROR; } diff --git a/generic/tclEncoding.c 
b/generic/tclEncoding.c index 8e42e26..153f8d3 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -188,6 +188,15 @@ static Tcl_Encoding systemEncoding = NULL; Tcl_Encoding tclIdentityEncoding = NULL; /* + * Names of encoding profiles and corresponding integer values + */ +static struct TclEncodingProfiles { + const char *name; + int value; +} encodingProfiles[] = {{"tcl8", TCL_ENCODING_PROFILE_TCL8}, + {"strict", TCL_ENCODING_PROFILE_STRICT}}; + +/* * The following variable is used in the sparse matrix code for a * TableEncoding to represent a page in the table that has no entries. */ @@ -1172,7 +1181,7 @@ Tcl_ExternalToUtfDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, TCL_ENCODING_NOCOMPLAIN, dstPtr); + Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr); return Tcl_DStringValue(dstPtr); } @@ -2315,11 +2324,17 @@ BinaryProc( *------------------------------------------------------------------------- */ +#ifdef OBSOLETE #if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) # define STOPONERROR (!(flags & TCL_ENCODING_NOCOMPLAIN) || (flags & TCL_ENCODING_STOPONERROR)) #else # define STOPONERROR (flags & TCL_ENCODING_STOPONERROR) #endif +#endif + + +#define STRICT_PROFILE(flags_) (TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) +#define STOPONERROR STRICT_PROFILE(flags) static int UtfToUtfProc( @@ -2386,10 +2401,11 @@ UtfToUtfProc( */ *dst++ = *src++; - } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) - || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - || (flags & ENCODING_FAILINDEX))) { + } else if ((UCHAR(*src) == 0xC0) && + (src + 1 < srcEnd) && + (UCHAR(src[1]) == 0x80) && + (!(flags & TCL_ENCODING_MODIFIED) + || (STRICT_PROFILE(flags)))) { /* * If in input mode, and -strict or -failindex is specified: 
This is an error. */ @@ -2403,7 +2419,8 @@ UtfToUtfProc( */ *dst++ = 0; src += 2; - } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { + } + else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an @@ -2416,10 +2433,10 @@ UtfToUtfProc( result = TCL_CONVERT_MULTIBYTE; break; } - if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX)) { - result = TCL_CONVERT_SYNTAX; - break; - } + if (STRICT_PROFILE(flags)) { + result = TCL_CONVERT_SYNTAX; + break; + } ch = UCHAR(*src++); } else { char chbuf[2]; @@ -2427,12 +2444,13 @@ UtfToUtfProc( TclUtfToUCS4(chbuf, &ch); } dst += Tcl_UniCharToUtf(ch, dst); - } else { + } + else { int low; const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED) - && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) { + && STRICT_PROFILE(flags)) { result = TCL_CONVERT_SYNTAX; break; } @@ -2475,8 +2493,9 @@ UtfToUtfProc( result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; - } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && (flags & TCL_ENCODING_MODIFIED) && ((ch & ~0x7FF) == 0xD800)) { + } else if (STRICT_PROFILE(flags) && + (flags & TCL_ENCODING_MODIFIED) && + ((ch & ~0x7FF) == 0xD800)) { result = TCL_CONVERT_SYNTAX; src = saveSrc; break; @@ -2567,8 +2586,8 @@ Utf32ToUtfProc( } else { ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); } - if ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && ((ch & ~0x7FF) == 0xD800))) { + if ((unsigned)ch > 0x10FFFF + || (STRICT_PROFILE(flags) && ((ch & ~0x7FF) == 0xD800))) { if (STOPONERROR) { result = TCL_CONVERT_SYNTAX; break; @@ -4095,34 +4114,27 @@ InitializeEncodingSearchPath( * * TclEncodingProfileParseName -- * - * Maps an encoding profile name to its integer equivalent. 
+ * Maps an encoding profile name to its integer equivalent. * * Results: - * TCL_OK on success or TCL_ERROR on failure. + * TCL_OK on success or TCL_ERROR on failure. * * Side effects: - * Returns the profile enum value in *profilePtr + * Returns the profile enum value in *profilePtr * *------------------------------------------------------------------------ */ int -TclEncodingProfileParseName( +TclEncodingProfileNameToId( Tcl_Interp *interp, /* For error messages. May be NULL */ const char *profileName, /* Name of profile */ int *profilePtr) /* Output */ { - /* NOTE: Order in arrays must match !!! */ - static const char *const profileNames[] = {"", "tcl8", "strict", NULL}; - static int profileFlags[] = { - TCL_ENCODING_PROFILE_DEFAULT, - TCL_ENCODING_PROFILE_TCL8, - TCL_ENCODING_PROFILE_STRICT, - }; int i; - for (i = 0; i < sizeof(profileNames) / sizeof(profileNames[0]); ++i) { - if (!strcmp(profileName, profileNames[i])) { - *profilePtr = profileFlags[i]; + for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { + if (!strcmp(profileName, encodingProfiles[i].name)) { + *profilePtr = encodingProfiles[i].value; return TCL_OK; } } @@ -4130,13 +4142,52 @@ TclEncodingProfileParseName( Tcl_SetObjResult( interp, Tcl_ObjPrintf( - "bad profile \"%s\". Must be \"\", \"tcl8\" or \"strict\".", + "bad profile \"%s\". Must be \"tcl8\" or \"strict\".", profileName)); Tcl_SetErrorCode( interp, "TCL", "ENCODING", "PROFILE", profileName, NULL); } return TCL_ERROR; } + +/* + *------------------------------------------------------------------------ + * + * TclEncodingProfileValueToName -- + * + * Maps an encoding profile value to its name. + * + * Results: + * Pointer to the name or NULL on failure. Caller must not make + * not modify the string and must make a copy to hold on to it. + * + * Side effects: + * None. 
+ *------------------------------------------------------------------------ + */ +const char * +TclEncodingProfileIdToName( + Tcl_Interp *interp, /* For error messages. May be NULL */ + int profileValue) /* Profile #define value */ +{ + int i; + + for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { + if (profileValue == encodingProfiles[i].value) { + return encodingProfiles[i].name; + } + } + if (interp) { + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "Internal error. Bad profile id \"%d\".", + profileValue)); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "PROFILEID", NULL); + } + return NULL; +} /* *------------------------------------------------------------------------ @@ -4179,6 +4230,7 @@ int TclEncodingExternalFlagsToInternal(int flags) case TCL_ENCODING_PROFILE_STRICT: flags |= TCL_ENCODING_STRICT; break; + case 0: /* Unspecified by caller */ default: /* TODO - clean this up once default mechanisms settled */ TCL_ENCODING_PROFILE_SET(flags, TCL_ENCODING_PROFILE_DEFAULT); diff --git a/generic/tclIO.c b/generic/tclIO.c index 0152740..49f4257 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -1700,8 +1700,12 @@ Tcl_CreateChannel( } statePtr->inputEncodingState = NULL; statePtr->inputEncodingFlags = TCL_ENCODING_START; + TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, + TCL_ENCODING_PROFILE_DEFAULT); statePtr->outputEncodingState = NULL; statePtr->outputEncodingFlags = TCL_ENCODING_START; + TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, + TCL_ENCODING_PROFILE_DEFAULT); /* * Set the channel up initially in AUTO input translation mode to accept @@ -4394,21 +4398,6 @@ Write( } /* - * Transfer encoding nocomplain/strict option to the encoding flags - */ - - if (GotFlag(statePtr, CHANNEL_ENCODING_STRICT)) { - statePtr->outputEncodingFlags |= TCL_ENCODING_STRICT; -#ifdef TCL_NO_DEPRECATED - } else if (GotFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN)) { - statePtr->outputEncodingFlags &= ~TCL_ENCODING_STRICT; - 
statePtr->outputEncodingFlags |= TCL_ENCODING_NOCOMPLAIN; -#endif - } else { - statePtr->outputEncodingFlags &= ~TCL_ENCODING_STRICT; - } - - /* * Write the terminated escape sequence even if srcLen is 0. */ @@ -4733,21 +4722,6 @@ Tcl_GetsObj( } /* - * Transfer encoding nocomplain/strict option to the encoding flags - */ - - if (GotFlag(statePtr, CHANNEL_ENCODING_STRICT)) { - statePtr->inputEncodingFlags |= TCL_ENCODING_STRICT; -#ifdef TCL_NO_DEPRECATED - } else if (GotFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN)) { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_STRICT; - statePtr->inputEncodingFlags |= TCL_ENCODING_NOCOMPLAIN; -#endif - } else { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_STRICT; - } - - /* * Object used by FilterInputBytes to keep track of how much data has been * consumed from the channel buffers. */ @@ -5528,21 +5502,6 @@ FilterInputBytes( } gsPtr->state = statePtr->inputEncodingState; - /* - * Transfer encoding nocomplain/strict option to the encoding flags - */ - - if (GotFlag(statePtr, CHANNEL_ENCODING_STRICT)) { - statePtr->inputEncodingFlags |= TCL_ENCODING_STRICT; -#ifdef TCL_NO_DEPRECATED - } else if (GotFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN)) { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_STRICT; - statePtr->inputEncodingFlags |= TCL_ENCODING_NOCOMPLAIN; -#endif - } else { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_STRICT; - } - result = Tcl_ExternalToUtf(NULL, gsPtr->encoding, raw, rawLen, statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE, &statePtr->inputEncodingState, dst, spaceLeft, &gsPtr->rawRead, @@ -6349,21 +6308,6 @@ ReadChars( } /* - * Transfer encoding nocomplain/strict option to the encoding flags - */ - - if (GotFlag(statePtr, CHANNEL_ENCODING_STRICT)) { - statePtr->inputEncodingFlags |= TCL_ENCODING_STRICT; -#ifdef TCL_NO_DEPRECATED - } else if (GotFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN)) { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_STRICT; - statePtr->inputEncodingFlags |= 
TCL_ENCODING_NOCOMPLAIN; -#endif - } else { - statePtr->inputEncodingFlags &= ~TCL_ENCODING_STRICT; - } - - /* * This routine is burdened with satisfying several constraints. It cannot * append more than 'charsToRead` chars onto objPtr. This is measured * after encoding and translation transformations are completed. There is @@ -8065,16 +8009,18 @@ Tcl_GetChannelOption( } } if (len == 0 || HaveOpt(1, "-encodingprofile")) { + int profile; + const char *profileName; if (len == 0) { Tcl_DStringAppendElement(dsPtr, "-encodingprofile"); } - if (flags & CHANNEL_ENCODING_STRICT) { - Tcl_DStringAppendElement(dsPtr, "strict"); - } else if (flags & CHANNEL_ENCODING_NOCOMPLAIN) { - Tcl_DStringAppendElement(dsPtr, "tcl8"); - } else { - Tcl_DStringAppendElement(dsPtr, ""); + /* Note currently input and output profiles are same */ + profile = TCL_ENCODING_PROFILE_GET(statePtr->inputEncodingFlags); + profileName = TclEncodingProfileIdToName(interp, profile); + if (profileName == NULL) { + return TCL_ERROR; } + Tcl_DStringAppendElement(dsPtr, profileName); if (len > 0) { return TCL_OK; } @@ -8293,6 +8239,7 @@ Tcl_SetChannelOption( return TCL_OK; } else if (HaveOpt(2, "-encoding")) { Tcl_Encoding encoding; + int profile; if ((newValue[0] == '\0') || (strcmp(newValue, "binary") == 0)) { encoding = NULL; @@ -8317,9 +8264,12 @@ Tcl_SetChannelOption( Tcl_FreeEncoding(statePtr->encoding); statePtr->encoding = encoding; statePtr->inputEncodingState = NULL; + profile = TCL_ENCODING_PROFILE_GET(statePtr->inputEncodingFlags); statePtr->inputEncodingFlags = TCL_ENCODING_START; + TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, profile); statePtr->outputEncodingState = NULL; statePtr->outputEncodingFlags = TCL_ENCODING_START; + TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, profile); /* Same as input */ ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); UpdateInterest(chanPtr); return TCL_OK; @@ -8380,28 +8330,11 @@ Tcl_SetChannelOption( return TCL_OK; } else 
if (HaveOpt(1, "-encodingprofile")) { int profile; - if (TclEncodingProfileParseName(interp, newValue, &profile) != TCL_OK) { + if (TclEncodingProfileNameToId(interp, newValue, &profile) != TCL_OK) { return TCL_ERROR; } - switch (profile) { - case TCL_ENCODING_PROFILE_TCL8: - ResetFlag(statePtr, CHANNEL_ENCODING_STRICT); - SetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); - break; - case TCL_ENCODING_PROFILE_STRICT: - ResetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); - SetFlag(statePtr, CHANNEL_ENCODING_STRICT); - break; - /* TODO - clean up this DEFAULT handling once channel flags fixed */ -#if TCL_ENCODING_PROFILE_DEFAULT != TCL_ENCODING_PROFILE_TCL8 \ - && TCL_ENCODING_PROFILE_DEFAULT != TCL_ENCODING_PROFILE_STRICT - case TCL_ENCODING_PROFILE_DEFAULT: /* FALLTHRU */ -#endif - default: - ResetFlag(statePtr, CHANNEL_ENCODING_NOCOMPLAIN); - ResetFlag(statePtr, CHANNEL_ENCODING_STRICT); - break; - } + TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, profile); + TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, profile); ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); return TCL_OK; } else if (HaveOpt(1, "-translation")) { @@ -9493,12 +9426,17 @@ TclCopyChannel( * of the bytes themselves. */ + /* + * TODO - should really only allow lossless profiles. 
Below reflects + * Tcl 8.7 alphas prior to encoding profiles + */ + moveBytes = inStatePtr->inEofChar == '\0' /* No eofChar to stop input */ && inStatePtr->inputTranslation == TCL_TRANSLATE_LF && outStatePtr->outputTranslation == TCL_TRANSLATE_LF && inStatePtr->encoding == outStatePtr->encoding - && (inStatePtr->flags & TCL_ENCODING_STRICT) != TCL_ENCODING_STRICT - && outStatePtr->flags & TCL_ENCODING_NOCOMPLAIN; + && TCL_ENCODING_PROFILE_GET(inStatePtr->flags) != TCL_ENCODING_PROFILE_STRICT + && TCL_ENCODING_PROFILE_GET(outStatePtr->flags) == TCL_ENCODING_PROFILE_TCL8; /* * Allocate a new CopyState to maintain info about the current copy in @@ -9826,8 +9764,8 @@ CopyData( inBinary = (inStatePtr->encoding == NULL); outBinary = (outStatePtr->encoding == NULL); sameEncoding = inStatePtr->encoding == outStatePtr->encoding - && (inStatePtr->flags & TCL_ENCODING_STRICT) != TCL_ENCODING_STRICT - && outStatePtr->flags & TCL_ENCODING_NOCOMPLAIN; + && TCL_ENCODING_PROFILE_GET(inStatePtr->flags) != TCL_ENCODING_PROFILE_STRICT + && TCL_ENCODING_PROFILE_GET(outStatePtr->flags) == TCL_ENCODING_PROFILE_TCL8; if (!(inBinary || sameEncoding)) { TclNewObj(bufObj); diff --git a/generic/tclIO.h b/generic/tclIO.h index a69e990..3f2feee 100644 --- a/generic/tclIO.h +++ b/generic/tclIO.h @@ -275,16 +275,17 @@ typedef struct ChannelState { * encountered an encoding error */ #define CHANNEL_RAW_MODE (1<<16) /* When set, notes that the Raw API is * being used. */ +#ifdef APN #define CHANNEL_ENCODING_NOCOMPLAIN (1<<17) /* set if option * -nocomplainencoding is set to 1 */ #define CHANNEL_ENCODING_STRICT (1<<18) /* set if option * -strictencoding is set to 1 */ +#endif #define CHANNEL_INCLOSE (1<<19) /* Channel is currently being closed. * Its structures are still live and * usable, but it may not be closed * again from within the close * handler. 
*/ -#define ENCODING_FAILINDEX (1<<20) /* Internal flag, fail on Invalid bytes only */ #define CHANNEL_CLOSEDWRITE (1<<21) /* Channel write side has been closed. * No further Tcl-level write IO on * the channel is allowed. */ diff --git a/generic/tclInt.h b/generic/tclInt.h index 2b491d6..4b6303d 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2885,9 +2885,11 @@ MODULE_SCOPE TclPlatformType tclPlatform; MODULE_SCOPE Tcl_Encoding tclIdentityEncoding; MODULE_SCOPE int -TclEncodingProfileParseName(Tcl_Interp *interp, - const char *profileName, - int *profilePtr); +TclEncodingProfileNameToId(Tcl_Interp *interp, + const char *profileName, + int *profilePtr); +MODULE_SCOPE const char *TclEncodingProfileIdToName(Tcl_Interp *interp, + int profileId); MODULE_SCOPE int TclEncodingExternalFlagsToInternal(int flags); /* -- cgit v0.12 From c2f0e2f8da529b6bd9f8793a07e73ed1bb6eb903 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sat, 11 Feb 2023 01:51:32 +0000 Subject: Eliminate TCL_ENCODING_{STRICT,NOCOMPLAIN} --- generic/tcl.h | 12 ++---------- generic/tclEncoding.c | 37 ++++++++----------------------------- generic/tclIO.h | 6 ------ 3 files changed, 10 insertions(+), 45 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index ec94e71..b7d31aa 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2127,14 +2127,8 @@ typedef struct Tcl_EncodingType { * 0x00. Only valid for "utf-8" and "cesu-8". * This flag is implicit for external -> internal conversions, * optional for internal -> external conversions. - * TCL_ENCODING_NOCOMPLAIN - If set, the converter - * substitutes the problematic character(s) with - * one or more "close" characters in the - * destination buffer and then continues to - * convert the source. If clear, the converter returns - * immediately upon encountering an invalid byte sequence - * or a source character that has no mapping in the - * target encoding. Only for Tcl 9.x. + * TCL_ENCODING_PROFILE_* - Mutually exclusive encoding profile ids. 
Note + * these are bit masks. */ #define TCL_ENCODING_START 0x01 @@ -2143,8 +2137,6 @@ typedef struct Tcl_EncodingType { #define TCL_ENCODING_NO_TERMINATE 0x08 #define TCL_ENCODING_CHAR_LIMIT 0x10 #define TCL_ENCODING_MODIFIED 0x20 -#define TCL_ENCODING_NOCOMPLAIN 0x40 -#define TCL_ENCODING_STRICT 0x44 /* Reserve top byte for profile values (disjoint) */ #define TCL_ENCODING_PROFILE_TCL8 0x01000000 #define TCL_ENCODING_PROFILE_STRICT 0x02000000 diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 153f8d3..85c2b6a 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -574,7 +574,7 @@ TclInitEncodingSubsystem(void) type.nullSize = 1; type.clientData = INT2PTR(TCL_ENCODING_UTF); Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(0); type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); @@ -583,13 +583,13 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "ucs-2le"; - type.clientData = INT2PTR(TCL_ENCODING_LE|TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(TCL_ENCODING_LE); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2be"; - type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2"; - type.clientData = INT2PTR(isLe.c|TCL_ENCODING_NOCOMPLAIN); + type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); type.toUtfProc = Utf32ToUtfProc; @@ -2324,16 +2324,11 @@ BinaryProc( *------------------------------------------------------------------------- */ -#ifdef OBSOLETE -#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) -# define STOPONERROR (!(flags & TCL_ENCODING_NOCOMPLAIN) || (flags & TCL_ENCODING_STOPONERROR)) -#else -# define STOPONERROR (flags & TCL_ENCODING_STOPONERROR) -#endif -#endif - +#define STRICT_PROFILE(flags_) \ + ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \ + || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ + 
&& TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT)) -#define STRICT_PROFILE(flags_) (TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) #define STOPONERROR STRICT_PROFILE(flags) static int @@ -4196,10 +4191,6 @@ TclEncodingProfileIdToName( * * Maps the flags supported in the encoding C API's to internal flags. * - * TCL_ENCODING_STRICT and TCL_ENCODING_NOCOMPLAIN are masked off - * because they are for internal use only and externally specified - * through TCL_ENCODING_PROFILE_* bits. - * * For backward compatibility reasons, TCL_ENCODING_STOPONERROR is * is mapped to the TCL_ENCODING_PROFILE_STRICT overwriting any profile * specified. @@ -4217,7 +4208,6 @@ TclEncodingProfileIdToName( */ int TclEncodingExternalFlagsToInternal(int flags) { - flags &= ~(TCL_ENCODING_STRICT | TCL_ENCODING_NOCOMPLAIN); if (flags & TCL_ENCODING_STOPONERROR) { TCL_ENCODING_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT); } @@ -4225,22 +4215,11 @@ int TclEncodingExternalFlagsToInternal(int flags) int profile = TCL_ENCODING_PROFILE_GET(flags); switch (profile) { case TCL_ENCODING_PROFILE_TCL8: - flags |= TCL_ENCODING_NOCOMPLAIN; - break; case TCL_ENCODING_PROFILE_STRICT: - flags |= TCL_ENCODING_STRICT; break; case 0: /* Unspecified by caller */ default: - /* TODO - clean this up once default mechanisms settled */ TCL_ENCODING_PROFILE_SET(flags, TCL_ENCODING_PROFILE_DEFAULT); -#if TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_TCL8 - flags |= TCL_ENCODING_NOCOMPLAIN; -#elif TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT - flags |= TCL_ENCODING_STRICT; -#else -#error TCL_ENCODING_PROFILE_DEFAULT must be TCL8 or STRICT -#endif break; } } diff --git a/generic/tclIO.h b/generic/tclIO.h index 3f2feee..dded07f 100644 --- a/generic/tclIO.h +++ b/generic/tclIO.h @@ -275,12 +275,6 @@ typedef struct ChannelState { * encountered an encoding error */ #define CHANNEL_RAW_MODE (1<<16) /* When set, notes that the Raw API is * being used. 
*/ -#ifdef APN -#define CHANNEL_ENCODING_NOCOMPLAIN (1<<17) /* set if option - * -nocomplainencoding is set to 1 */ -#define CHANNEL_ENCODING_STRICT (1<<18) /* set if option - * -strictencoding is set to 1 */ -#endif #define CHANNEL_INCLOSE (1<<19) /* Channel is currently being closed. * Its structures are still live and * usable, but it may not be closed -- cgit v0.12 From 727887b6dc02960e49117cb5db99e44806a0327f Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sat, 11 Feb 2023 17:38:07 +0000 Subject: Partial implementation of replace profile --- generic/tcl.h | 7 +-- generic/tclEncoding.c | 119 +++++++++++++++++++++++++++++++++++++++----------- tests/cmdAH.test | 3 ++ 3 files changed, 99 insertions(+), 30 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index b7d31aa..3fc53db 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2140,6 +2140,7 @@ typedef struct Tcl_EncodingType { /* Reserve top byte for profile values (disjoint) */ #define TCL_ENCODING_PROFILE_TCL8 0x01000000 #define TCL_ENCODING_PROFILE_STRICT 0x02000000 +#define TCL_ENCODING_PROFILE_REPLACE 0x03000000 #define TCL_ENCODING_PROFILE_MASK 0xFF000000 #define TCL_ENCODING_PROFILE_GET(flags_) ((flags_) & TCL_ENCODING_PROFILE_MASK) #define TCL_ENCODING_PROFILE_SET(flags_, profile_) \ @@ -2151,13 +2152,9 @@ typedef struct Tcl_EncodingType { #if TCL_MAJOR_VERSION < 9 #define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 #else -#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 /* STRICT? TODO */ +#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 /* STRICT? REPLACE? 
TODO */ #endif -#define TCL_ENCODING_EXTERNAL_FLAG_MASK \ - (TCL_ENCODING_START|TCL_ENCODING_END|TCL_ENCODING_STOPONERROR) - - /* * The following definitions are the error codes returned by the conversion * routines: diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 85c2b6a..bb1f32f 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -193,8 +193,12 @@ Tcl_Encoding tclIdentityEncoding = NULL; static struct TclEncodingProfiles { const char *name; int value; -} encodingProfiles[] = {{"tcl8", TCL_ENCODING_PROFILE_TCL8}, - {"strict", TCL_ENCODING_PROFILE_STRICT}}; +} encodingProfiles[] = { + {"tcl8", TCL_ENCODING_PROFILE_TCL8}, + {"strict", TCL_ENCODING_PROFILE_STRICT}, + {"replace", TCL_ENCODING_PROFILE_REPLACE}, +}; +#define UNICODE_REPLACE_CHAR 0xFFFD /* * The following variable is used in the sparse matrix code for a @@ -2336,7 +2340,7 @@ UtfToUtfProc( void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ + int flags, /* TCL_ENCODING_* conversion control flags. */ TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ @@ -2376,6 +2380,8 @@ UtfToUtfProc( dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + int profile = TCL_ENCODING_PROFILE_GET(flags); + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the @@ -2389,34 +2395,51 @@ UtfToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) { + /* + * TCL_ENCODING_MODIFIED is set when the target encoding is Tcl's + * internal UTF-8 modified version. 
+ */ + if (UCHAR(*src) < 0x80 + && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) { /* - * Copy 7bit characters, but skip null-bytes when we are in input - * mode, so that they get converted to 0xC080. + * Copy 7bit characters, but skip null-bytes when target encoding + * is Tcl's "modified" UTF-8. These need to be converted to + * \xC0\x80 as is done in a later branch. */ *dst++ = *src++; - } else if ((UCHAR(*src) == 0xC0) && - (src + 1 < srcEnd) && - (UCHAR(src[1]) == 0x80) && - (!(flags & TCL_ENCODING_MODIFIED) - || (STRICT_PROFILE(flags)))) { + } + else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) + && (UCHAR(src[1]) == 0x80) + && (!(flags & TCL_ENCODING_MODIFIED) + || (profile == TCL_ENCODING_PROFILE_STRICT))) { /* - * If in input mode, and -strict or -failindex is specified: This is an error. + * \xC0\x80 and either strict profile or target is "real" UTF-8 + * - Strict profile - error + * - Non-strict, real UTF-8 - output \x00 */ if (flags & TCL_ENCODING_MODIFIED) { + /* + * TODO - should above check not be against STRICT? + * That would probably break a convertto command that goes + * from the internal UTF8 to the real UTF8. On the other + * hand this means, a strict UTF8->UTF8 transform is not + * possible using this function. + */ result = TCL_CONVERT_SYNTAX; break; } /* - * Convert 0xC080 to real nulls when we are in output mode, with or without '-strict'. + * Convert 0xC080 to real nulls when we are in output mode, + * irrespective of the profile. */ *dst++ = 0; src += 2; } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* + * Incomplete byte sequence. * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! 
If we happen such an * incomplete char its bytes are made to represent themselves @@ -2424,17 +2447,39 @@ UtfToUtfProc( */ if (flags & TCL_ENCODING_MODIFIED) { - if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) { - result = TCL_CONVERT_MULTIBYTE; + /* Incomplete bytes for modified UTF-8 target */ + if (profile == TCL_ENCODING_PROFILE_STRICT) { + result = (flags & TCL_ENCODING_CHAR_LIMIT) + ? TCL_CONVERT_MULTIBYTE + : TCL_CONVERT_SYNTAX; break; } - if (STRICT_PROFILE(flags)) { - result = TCL_CONVERT_SYNTAX; - break; + if (profile == TCL_ENCODING_PROFILE_REPLACE) { + ch = UNICODE_REPLACE_CHAR; + } else { + /* TCL_ENCODING_PROFILE_TCL8 */ + ch = UCHAR(*src); } - ch = UCHAR(*src++); - } else { + ++src; + } + else { + /* + * Incomplete bytes for real UTF-8 target. + * TODO - no profile check here because did not have any + * checks in the pre-profile code. Why? Is it because on + * output a valid internal utf-8 stream is assumed? + */ char chbuf[2]; + /* + * TODO - this code seems broken to me. + * - it does not check profiles + * - generates invalid output for real UTF-8 target + * (consider \xC2) + * A possible explanation is this behavior matches the + * Tcl8 decoding behavior of mapping invalid bytes to the same + * code point value. Still, at least strictness checks should + * be made. + */ chbuf[0] = UCHAR(*src++); chbuf[1] = 0; TclUtfToUCS4(chbuf, &ch); } @@ -2444,11 +2489,31 @@ UtfToUtfProc( int low; const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); + + /* + * Valid single char encodings were already handled earlier. + * So len==1 means an invalid byte that is magically transformed + * to a code point unless it resulted from the special + * \xC0\x80 sequence. Tests io-75.* + * TODO - below check could be simplified to remove the MODIFIED + * expression I think given the checks already made above. May be. 
+ */ +#if 0 if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED) - && STRICT_PROFILE(flags)) { + && (profile == TCL_ENCODING_PROFILE_STRICT)) { result = TCL_CONVERT_SYNTAX; break; } +#else + if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)) { + if (profile == TCL_ENCODING_PROFILE_STRICT) { + result = TCL_CONVERT_SYNTAX; + break; + } else if (profile == TCL_ENCODING_PROFILE_REPLACE) { + ch = UNICODE_REPLACE_CHAR; + } + } +#endif src += len; if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) { if (ch > 0xFFFF) { @@ -2464,13 +2529,14 @@ UtfToUtfProc( /* * A surrogate character is detected, handle especially. */ + /* TODO - what about REPLACE profile? */ low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { - if (STOPONERROR) { + if (profile == TCL_ENCODING_PROFILE_STRICT) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; @@ -2484,12 +2550,14 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; - } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch & ~0x7FF) == 0xD800))) { + } else if ((profile == TCL_ENCODING_PROFILE_STRICT) && + !(flags & TCL_ENCODING_MODIFIED) && + (((ch & ~0x7FF) == 0xD800))) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; - } else if (STRICT_PROFILE(flags) && - (flags & TCL_ENCODING_MODIFIED) && + } else if ((profile == TCL_ENCODING_PROFILE_STRICT) && + (flags & TCL_ENCODING_MODIFIED) && ((ch & ~0x7FF) == 0xD800)) { result = TCL_CONVERT_SYNTAX; src = saveSrc; @@ -4216,6 +4284,7 @@ int TclEncodingExternalFlagsToInternal(int flags) switch (profile) { case TCL_ENCODING_PROFILE_TCL8: case TCL_ENCODING_PROFILE_STRICT: + case TCL_ENCODING_PROFILE_REPLACE: break; case 0: /* Unspecified by caller */ default: diff --git a/tests/cmdAH.test b/tests/cmdAH.test index c4053a2..52e7ac3 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -228,6 +228,9 @@ set encInvalidBytes { utf-8 A\xed\xb0\x80B default A\uDC00B -1 Low-surrogate utf-8 
A\xed\xb0\x80B tcl8 A\uDC00B -1 Low-surrogate utf-8 A\xed\xb0\x80B strict A 1 Low-surrogate + utf-8 \xed\xa0\x80\xed\xb0\x80 default \U00010000 -1 High-low-surrogate + utf-8 \xed\xa0\x80\xed\xb0\x80 tcl8 \U00010000 -1 High-low-surrogate + utf-8 \xed\xa0\x80\xed\xb0\x80 strict \U00010000 0 High-low-surrogate utf-32le \x00\xD8\x00\x00 default \uD800 -1 {High-surrogate} utf-32le \x00\xD8\x00\x00 tcl8 \uD800 -1 {High-surrogate} -- cgit v0.12 From b5095134dfebce7a33739c75d6533d90862901e3 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sun, 12 Feb 2023 06:15:59 +0000 Subject: Minor readability changes --- generic/tclEncoding.c | 101 ++++++++++++++++++++++++++++++++------------------ tests/cmdAH.test | 2 +- 2 files changed, 65 insertions(+), 38 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index bb1f32f..d2f3551 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -198,7 +198,20 @@ static struct TclEncodingProfiles { {"strict", TCL_ENCODING_PROFILE_STRICT}, {"replace", TCL_ENCODING_PROFILE_REPLACE}, }; +#define PROFILE_STRICT(flags_) \ + ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \ + || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT)) + +#define PROFILE_REPLACE(flags_) \ + ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) \ + || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_REPLACE)) + #define UNICODE_REPLACE_CHAR 0xFFFD +#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) +#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) +#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00) /* * The following variable is used in the sparse matrix code for a @@ -243,6 +256,7 @@ static Tcl_EncodingConvertProc UtfToUtfProc; static Tcl_EncodingConvertProc Iso88591FromUtfProc; static Tcl_EncodingConvertProc Iso88591ToUtfProc; + /* * A Tcl_ObjType for holding a cached Tcl_Encoding in the 
twoPtrValue.ptr1 field * of the internalrep. This should help the lifetime of encodings be more useful. @@ -2328,13 +2342,6 @@ BinaryProc( *------------------------------------------------------------------------- */ -#define STRICT_PROFILE(flags_) \ - ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \ - || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ - && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT)) - -#define STOPONERROR STRICT_PROFILE(flags) - static int UtfToUtfProc( void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ @@ -2412,7 +2419,7 @@ UtfToUtfProc( else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) - || (profile == TCL_ENCODING_PROFILE_STRICT))) { + || PROFILE_STRICT(profile))) { /* * \xC0\x80 and either strict profile or target is "real" UTF-8 * - Strict profile - error @@ -2448,13 +2455,13 @@ UtfToUtfProc( if (flags & TCL_ENCODING_MODIFIED) { /* Incomplete bytes for modified UTF-8 target */ - if (profile == TCL_ENCODING_PROFILE_STRICT) { + if (PROFILE_STRICT(profile)) { result = (flags & TCL_ENCODING_CHAR_LIMIT) ? TCL_CONVERT_MULTIBYTE : TCL_CONVERT_SYNTAX; break; } - if (profile == TCL_ENCODING_PROFILE_REPLACE) { + if (PROFILE_REPLACE(profile)) { ch = UNICODE_REPLACE_CHAR; } else { /* TCL_ENCODING_PROFILE_TCL8 */ @@ -2506,10 +2513,10 @@ UtfToUtfProc( } #else if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)) { - if (profile == TCL_ENCODING_PROFILE_STRICT) { + if (PROFILE_STRICT(profile)) { result = TCL_CONVERT_SYNTAX; break; - } else if (profile == TCL_ENCODING_PROFILE_REPLACE) { + } else if (PROFILE_REPLACE(profile)) { ch = UNICODE_REPLACE_CHAR; } } @@ -2534,9 +2541,9 @@ UtfToUtfProc( low = ch; len = (src <= srcEnd-3) ? 
TclUtfToUCS4(src, &low) : 0; - if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { + if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { - if (profile == TCL_ENCODING_PROFILE_STRICT) { + if (PROFILE_STRICT(profile)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; @@ -2550,15 +2557,15 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; - } else if ((profile == TCL_ENCODING_PROFILE_STRICT) && - !(flags & TCL_ENCODING_MODIFIED) && - (((ch & ~0x7FF) == 0xD800))) { + } else if (PROFILE_STRICT(profile) && + (!(flags & TCL_ENCODING_MODIFIED)) && + SURROGATE(ch)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; - } else if ((profile == TCL_ENCODING_PROFILE_STRICT) && + } else if (PROFILE_STRICT(profile) && (flags & TCL_ENCODING_MODIFIED) && - ((ch & ~0x7FF) == 0xD800)) { + SURROGATE(ch)) { result = TCL_CONVERT_SYNTAX; src = saveSrc; break; @@ -2649,12 +2656,15 @@ Utf32ToUtfProc( } else { ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); } - if ((unsigned)ch > 0x10FFFF - || (STRICT_PROFILE(flags) && ((ch & ~0x7FF) == 0xD800))) { - if (STOPONERROR) { + + if ((unsigned)ch > 0x10FFFF || SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } /* @@ -2666,7 +2676,7 @@ Utf32ToUtfProc( *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); - if ((ch & ~0x3FF) == 0xD800) { + if (HIGH_SURROGATE(ch)) { /* Bug [10c2c17c32]. 
If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } @@ -2750,11 +2760,14 @@ UtfToUtf32Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -2952,11 +2965,14 @@ UtfToUtf16Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -3059,6 +3075,9 @@ UtfToUcs2Proc( result = TCL_CONVERT_NOSPACE; break; } + /* TODO - there were no STRICT or NOCOMPLAIN checks here (why?) + * so no profile checks either for now. */ + #if TCL_UTF_MAX < 4 src += (len = TclUtfToUniChar(src, &ch)); if ((ch >= 0xD800) && (len < 3)) { @@ -3163,23 +3182,30 @@ TableToUtfProc( if (prefixBytes[byte]) { src++; if (src >= srcEnd) { + /* + * TODO - this is broken. For consistency with other + * decoders, an error should be raised only if strict. + * However, doing that check cause a whole bunch of test + * failures. Need to verify if those tests are in fact + * correct. 
+ */ src--; result = TCL_CONVERT_MULTIBYTE; break; } - ch = toUnicode[byte][*((unsigned char *) src)]; + ch = toUnicode[byte][*((unsigned char *)src)]; } else { ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } if (prefixBytes[byte]) { src--; } - ch = (Tcl_UniChar) byte; + ch = (Tcl_UniChar)byte; } /* @@ -3288,11 +3314,11 @@ TableFromUtfProc( word = fromUnicode[(ch >> 8)][ch & 0xFF]; if ((word == 0) && (ch != 0)) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } - word = dataPtr->fallback; + word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */ } if (prefixBytes[(word >> 8)] != 0) { if (dst + 1 > dstEnd) { @@ -3476,7 +3502,7 @@ Iso88591FromUtfProc( || ((ch >= 0xD800) && (len < 3)) #endif ) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3489,7 +3515,7 @@ Iso88591FromUtfProc( * Plunge on, using '?' as a fallback character. */ - ch = (Tcl_UniChar) '?'; + ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */ } if (dst > dstEnd) { @@ -3703,9 +3729,10 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if (!STOPONERROR) { + if (!PROFILE_STRICT(flags)) { /* - * Skip the unknown escape sequence. + * Skip the unknown escape sequence. TODO - bug? + * May be replace with UNICODE_REPLACE_CHAR? */ src += longest; @@ -3878,7 +3905,7 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 52e7ac3..7b2d99f 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -179,7 +179,7 @@ set "numargErrors(encoding convertfrom)" {^wrong # args: should be "(encoding |: set "numargErrors(encoding convertto)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertto \?\?-profile profile\? 
\?-failindex var\? \?encoding\?\? data"$} set "numargErrors(encoding names)" {wrong # args: should be "encoding names"} -set encProfiles {tcl8 strict} +set encProfiles {tcl8 strict replace} # TODO - valid sequences for different encodings - shiftjis etc. # Note utf-16, utf-32 missing because they are automatically -- cgit v0.12 From bf448a6421c4fd0340d6bba70aba3b0a713d049b Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sun, 12 Feb 2023 11:04:16 +0000 Subject: Added 'encoding profiles' --- generic/tclEncoding.c | 31 ++++++++++++++++++++++++++++++- tests/cmdAH.test | 9 +++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index d2f3551..e8e1756 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -4278,7 +4278,7 @@ TclEncodingProfileIdToName( } return NULL; } - + /* *------------------------------------------------------------------------ * @@ -4321,6 +4321,35 @@ int TclEncodingExternalFlagsToInternal(int flags) } return flags; } + +/* + *------------------------------------------------------------------------ + * + * TclGetEncodingProfiles -- + * + * Get the list of supported encoding profiles. + * + * Results: + * None. + * + * Side effects: + * The list of profile names is stored in the interpreter result. 
+ * + *------------------------------------------------------------------------ + */ +void +TclGetEncodingProfiles(Tcl_Interp *interp) +{ + int i, n; + Tcl_Obj *objPtr; + n = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); + objPtr = Tcl_NewListObj(n, NULL); + for (i = 0; i < n; ++i) { + Tcl_ListObjAppendElement( + interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, -1)); + } + Tcl_SetObjResult(interp, objPtr); +} /* * Local Variables: diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 7b2d99f..c666513 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -178,6 +178,7 @@ set "numargErrors(encoding system)" {^wrong # args: should be "(encoding |::tcl: set "numargErrors(encoding convertfrom)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertfrom \?\?-profile profile\? \?-failindex var\? \?encoding\?\? data"$} set "numargErrors(encoding convertto)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertto \?\?-profile profile\? \?-failindex var\? \?encoding\?\? 
data"$} set "numargErrors(encoding names)" {wrong # args: should be "encoding names"} +set "numargErrors(encoding profiles)" {wrong # args: should be "encoding profiles"} set encProfiles {tcl8 strict replace} @@ -202,6 +203,7 @@ set encValidStrings { set encInvalidBytes { ascii \x41\xe9\x42 default A\u00E9B -1 {non-ASCII} ascii \x41\xe9\x42 tcl8 A\u00E9B -1 {non-ASCII} + ascii \x41\xe9\x42 replace A\uFFFDB -1 {non-ASCII} ascii \x41\xe9\x42 strict A 1 {non-ASCII} utf-8 \x41\xC0\x42 default A\u00C0B -1 C0 @@ -565,6 +567,13 @@ test cmdAH-4.5.2 {encoding names should include at least utf-8 and iso8859-1 and } -result {1 1 1} # +# encoding profiles 4.6.* +badnumargs cmdAH-4.6.1 {encoding profiles} {foo} +test cmdAH-4.6.2 {encoding profiles} -body { + lsort [encoding profiles] +} -result {replace strict tcl8} + +# # file command test cmdAH-5.1 {Tcl_FileObjCmd} -returnCodes error -body { -- cgit v0.12 From 0c764d2b03ab2b8daf95b3a25a470b56dffdad4f Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sun, 12 Feb 2023 16:56:17 +0000 Subject: Minor fixes and tests --- generic/tclCmdAH.c | 30 ++++++++++++++++++++++++++++++ generic/tclEncoding.c | 22 ++++++++++------------ generic/tclInt.h | 1 + tests/cmdAH.test | 7 ++++++- tests/socket.test | 2 +- 5 files changed, 48 insertions(+), 14 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 5fbe27e..692c75b 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -51,6 +51,7 @@ static Tcl_ObjCmdProc EncodingConvertfromObjCmd; static Tcl_ObjCmdProc EncodingConverttoObjCmd; static Tcl_ObjCmdProc EncodingDirsObjCmd; static Tcl_ObjCmdProc EncodingNamesObjCmd; +static Tcl_ObjCmdProc EncodingProfilesObjCmd; static Tcl_ObjCmdProc EncodingSystemObjCmd; static inline int ForeachAssignments(Tcl_Interp *interp, struct ForeachState *statePtr); @@ -519,6 +520,7 @@ TclInitEncodingCmd( {"convertto", EncodingConverttoObjCmd, TclCompileBasic1To3ArgCmd, NULL, NULL, 0}, {"dirs", EncodingDirsObjCmd, TclCompileBasic0Or1ArgCmd, NULL, 
NULL, 1}, {"names", EncodingNamesObjCmd, TclCompileBasic0ArgCmd, NULL, NULL, 0}, + {"profiles", EncodingProfilesObjCmd, TclCompileBasic0ArgCmd, NULL, NULL, 0}, {"system", EncodingSystemObjCmd, TclCompileBasic0Or1ArgCmd, NULL, NULL, 1}, {NULL, NULL, NULL, NULL, NULL, 0} }; @@ -891,6 +893,34 @@ EncodingNamesObjCmd( /* *----------------------------------------------------------------------------- * + * EncodingProfilesObjCmd -- + * + * This command returns a list of the available encoding profiles + * + * Results: + * Returns a standard Tcl result + * + *----------------------------------------------------------------------------- + */ + +int +EncodingProfilesObjCmd( + TCL_UNUSED(void *), + Tcl_Interp* interp, /* Tcl interpreter */ + int objc, /* Number of command line args */ + Tcl_Obj* const objv[]) /* Vector of command line args */ +{ + if (objc > 1) { + Tcl_WrongNumArgs(interp, 1, objv, NULL); + return TCL_ERROR; + } + TclGetEncodingProfiles(interp); + return TCL_OK; +} + +/* + *----------------------------------------------------------------------------- + * * EncodingSystemObjCmd -- * * This command retrieves or changes the system encoding diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index e8e1756..fc3ac77 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -208,7 +208,7 @@ static struct TclEncodingProfiles { || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_REPLACE)) -#define UNICODE_REPLACE_CHAR 0xFFFD +#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD) #define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) #define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) #define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00) @@ -547,6 +547,7 @@ FillEncodingFileMap(void) * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. 
re-use the same value */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ #define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ +#define TCL_ENCODING_CESU8 0x400 /* TODO - Distinguishes cesu-8 from utf-8*/ void TclInitEncodingSubsystem(void) @@ -592,7 +593,7 @@ TclInitEncodingSubsystem(void) type.nullSize = 1; type.clientData = INT2PTR(TCL_ENCODING_UTF); Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(0); + type.clientData = INT2PTR(TCL_ENCODING_CESU8); type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); @@ -2505,13 +2506,6 @@ UtfToUtfProc( * TODO - below check could be simplified to remove the MODIFIED * expression I think given the checks already made above. May be. */ -#if 0 - if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED) - && (profile == TCL_ENCODING_PROFILE_STRICT)) { - result = TCL_CONVERT_SYNTAX; - break; - } -#else if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)) { if (PROFILE_STRICT(profile)) { result = TCL_CONVERT_SYNTAX; @@ -2520,7 +2514,7 @@ UtfToUtfProc( ch = UNICODE_REPLACE_CHAR; } } -#endif + src += len; if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) { if (ch > 0xFFFF) { @@ -2551,7 +2545,7 @@ UtfToUtfProc( cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((ch | 0x80) & 0xBF); + *dst++ = (char) ((ch | 0x80) & 0xBF); continue; } src += len; @@ -3205,7 +3199,11 @@ TableToUtfProc( if (prefixBytes[byte]) { src--; } - ch = (Tcl_UniChar)byte; + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } else { + ch = (Tcl_UniChar)byte; + } } /* diff --git a/generic/tclInt.h b/generic/tclInt.h index 4b6303d..538b177 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2891,6 +2891,7 @@ TclEncodingProfileNameToId(Tcl_Interp *interp, MODULE_SCOPE const char *TclEncodingProfileIdToName(Tcl_Interp *interp, int profileId); MODULE_SCOPE int TclEncodingExternalFlagsToInternal(int flags); 
+MODULE_SCOPE void TclGetEncodingProfiles(Tcl_Interp *interp); /* * TIP #233 (Virtualized Time) diff --git a/tests/cmdAH.test b/tests/cmdAH.test index c666513..65ecac5 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -208,21 +208,26 @@ set encInvalidBytes { utf-8 \x41\xC0\x42 default A\u00C0B -1 C0 utf-8 \x41\xC0\x42 tcl8 A\u00C0B -1 C0 + utf-8 \x41\xC0\x42 replace A\uFFFDB -1 C0 utf-8 \x41\xC0\x42 strict A 1 C0 utf-8 \x41\x80\x42 default A\u0080B -1 80 utf-8 \x41\x80\x42 tcl8 A\u0080B -1 80 + utf-8 \x41\x80\x42 replace A\uFFFDB -1 80 utf-8 \x41\x80\x42 strict A 1 80 utf-8 \x41\xC0\x80\x42 default A\u0000B -1 C080 utf-8 \x41\xC0\x80\x42 tcl8 A\u0000B -1 C080 utf-8 \x41\xC0\x80\x42 strict A 1 C080 utf-8 \x41\xC1\x42 default A\u00C1B -1 C1 utf-8 \x41\xC1\x42 tcl8 A\u00C1B -1 C1 + utf-8 \x41\xC1\x42 replace A\uFFFDB -1 C1 utf-8 \x41\xC1\x42 strict A 1 C1 utf-8 \x41\xC2\x42 default A\u00C2B -1 C2-nontrail utf-8 \x41\xC2\x42 tcl8 A\u00C2B -1 C2-nontrail + utf-8 \x41\xC2\x42 replace A\uFFFDB -1 C2-nontrail utf-8 \x41\xC2\x42 strict A 1 C2-nontrail utf-8 \x41\xC2 default A\u00C2 -1 C2-incomplete utf-8 \x41\xC2 tcl8 A\u00C2 -1 C2-incomplete + utf-8 \x41\xC2 replace A\uFFFD -1 C2-incomplete utf-8 \x41\xC2 strict A 1 C2-incomplete utf-8 A\xed\xa0\x80B default A\uD800B -1 High-surrogate utf-8 A\xed\xa0\x80B tcl8 A\uD800B -1 High-surrogate @@ -335,7 +340,7 @@ test cmdAH-4.1.1 {encoding} -returnCodes error -body { } -result {wrong # args: should be "encoding subcommand ?arg ...?"} test cmdAH-4.1.2 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding foo -} -result {unknown or ambiguous subcommand "foo": must be convertfrom, convertto, dirs, names, or system} +} -result {unknown or ambiguous subcommand "foo": must be convertfrom, convertto, dirs, names, profiles, or system} # # encoding system 4.2.* diff --git a/tests/socket.test b/tests/socket.test index a0fe2f7..b1435be 100644 --- a/tests/socket.test +++ b/tests/socket.test @@ -1071,7 +1071,7 @@ test socket_$af-7.3 
{testing socket specific options} -constraints [list socket close $s update llength $l -} -result 22 +} -result 20 test socket_$af-7.4 {testing socket specific options} -constraints [list socket supported_$af] -setup { set timer [after 10000 "set x timed_out"] set l "" -- cgit v0.12 From 86d84d444cba1b00cf6b8771db83f21d9e6e5e13 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sun, 12 Feb 2023 17:34:58 +0000 Subject: Tentative fix for [bd1a60eb9] - surrogates in strict utf-8 --- generic/tclEncoding.c | 11 +++++++++-- tests/cmdAH.test | 5 +++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index fc3ac77..5d099f9 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -547,7 +547,8 @@ FillEncodingFileMap(void) * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ #define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ -#define TCL_ENCODING_CESU8 0x400 /* TODO - Distinguishes cesu-8 from utf-8*/ +#define TCL_ENCODING_CESU8_SOURCE 0x400 /* TODO - Distinguishes cesu-8 + * *source* from utf-8 *source* */ void TclInitEncodingSubsystem(void) @@ -593,7 +594,7 @@ TclInitEncodingSubsystem(void) type.nullSize = 1; type.clientData = INT2PTR(TCL_ENCODING_UTF); Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(TCL_ENCODING_CESU8); + type.clientData = INT2PTR(TCL_ENCODING_CESU8_SOURCE); type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); @@ -2370,6 +2371,7 @@ UtfToUtfProc( const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; int ch; + int isCesu8; result = TCL_OK; @@ -2531,6 +2533,11 @@ UtfToUtfProc( * A surrogate character is detected, handle especially. */ /* TODO - what about REPLACE profile? 
*/ + if (PROFILE_STRICT(profile) && !(flags & TCL_ENCODING_CESU8_SOURCE)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 65ecac5..f2aab52 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -237,8 +237,9 @@ set encInvalidBytes { utf-8 A\xed\xb0\x80B strict A 1 Low-surrogate utf-8 \xed\xa0\x80\xed\xb0\x80 default \U00010000 -1 High-low-surrogate utf-8 \xed\xa0\x80\xed\xb0\x80 tcl8 \U00010000 -1 High-low-surrogate - utf-8 \xed\xa0\x80\xed\xb0\x80 strict \U00010000 0 High-low-surrogate - + utf-8 \xed\xa0\x80\xed\xb0\x80 strict {} 0 High-low-surrogate +} +set utf32-le-TODO { utf-32le \x00\xD8\x00\x00 default \uD800 -1 {High-surrogate} utf-32le \x00\xD8\x00\x00 tcl8 \uD800 -1 {High-surrogate} utf-32le \x00\xD8\x00\x00 strict "" 0 {High-surrogate} -- cgit v0.12 From 85320f8fd074a2a55f76a7c0a8290f0a195530dc Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 14 Feb 2023 11:37:35 +0000 Subject: Bug [bd1a60eb9c]. Eliminate TCL_ENCODING_UTF. --- generic/tclEncoding.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 5d099f9..778fca8 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -546,7 +546,6 @@ FillEncodingFileMap(void) /* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. 
re-use the same value */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ -#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ #define TCL_ENCODING_CESU8_SOURCE 0x400 /* TODO - Distinguishes cesu-8 * *source* from utf-8 *source* */ @@ -592,7 +591,7 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = INT2PTR(TCL_ENCODING_UTF); + type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); type.clientData = INT2PTR(TCL_ENCODING_CESU8_SOURCE); type.encodingName = "cesu-8"; @@ -1269,7 +1268,7 @@ Tcl_ExternalToUtfDStringEx( flags = TclEncodingExternalFlagsToInternal(flags); flags |= TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + flags |= TCL_ENCODING_MODIFIED; } while (1) { @@ -1386,7 +1385,7 @@ Tcl_ExternalToUtf( dstLen--; } if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + flags |= TCL_ENCODING_MODIFIED; } do { Tcl_EncodingState savedState = *statePtr; @@ -2371,7 +2370,6 @@ UtfToUtfProc( const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; int ch; - int isCesu8; result = TCL_OK; @@ -2387,7 +2385,7 @@ UtfToUtfProc( dstStart = dst; flags |= PTR2INT(clientData); - dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); + dstEnd = dst + dstLen - ((flags & TCL_ENCODING_CESU8_SOURCE) ? 
6 : TCL_UTF_MAX); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { int profile = TCL_ENCODING_PROFILE_GET(flags); @@ -2518,7 +2516,7 @@ UtfToUtfProc( } src += len; - if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) { + if ((flags & TCL_ENCODING_CESU8_SOURCE) && (ch > 0x3FF)) { if (ch > 0xFFFF) { /* CESU-8 6-byte sequence for chars > U+FFFF */ ch -= 0x10000; -- cgit v0.12 From a750ed2c2475387ab61073159ebf455c2452c78e Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 14 Feb 2023 11:39:35 +0000 Subject: Fix uniqueness parsing fconfigure -encoding / -encodingprofile options --- generic/tclIO.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index 49f4257..8a6f76a 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -7994,7 +7994,7 @@ Tcl_GetChannelOption( return TCL_OK; } } - if (len == 0 || HaveOpt(2, "-encoding")) { + if (len == 0 || HaveOpt(8, "-encoding")) { if (len == 0) { Tcl_DStringAppendElement(dsPtr, "-encoding"); } @@ -8008,7 +8008,7 @@ Tcl_GetChannelOption( return TCL_OK; } } - if (len == 0 || HaveOpt(1, "-encodingprofile")) { + if (len == 0 || HaveOpt(9, "-encodingprofile")) { int profile; const char *profileName; if (len == 0) { -- cgit v0.12 From 891d60a9ad2f9600dd9b1c3f0ce966d79a8942e8 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 14 Feb 2023 11:56:49 +0000 Subject: Remove obsolete comment --- generic/tclEncoding.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 778fca8..0f5e05f 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -570,8 +570,6 @@ TclInitEncodingSubsystem(void) Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); - /* TODO - why is NOCOMPLAIN being hardcoded for encodings below? */ - /* * Create a few initial encodings. 
UTF-8 to UTF-8 translation is not a * no-op because it turns a stream of improperly formed UTF-8 into a -- cgit v0.12 From 96e60d29b763fa1c662fb77e731556ddfaf9c912 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Wed, 15 Feb 2023 17:27:55 +0000 Subject: Start on expanding encoding tests --- generic/tclEncoding.c | 41 +++++------ tests/cmdAH.test | 196 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 190 insertions(+), 47 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 7886910..8cd970f 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2409,32 +2409,29 @@ UtfToUtfProc( */ *dst++ = *src++; - } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && (flags & ENCODING_UTF) && (!(flags & ENCODING_INPUT) - || PROFILE_STRICT(profile))) { - /* - * \xC0\x80 and either strict profile or target is "real" UTF-8 - * - Strict profile - error - * - Non-strict, real UTF-8 - output \x00 - */ - if (flags & ENCODING_INPUT) { - /* - * TODO - should above check not be against STRICT? - * That would probably break a convertto command that goes - * from the internal UTF8 to the real UTF8. On the other - * hand this means, a strict UTF8->UTF8 transform is not - * possible using this function. - */ + } + else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && + (UCHAR(src[1]) == 0x80) && (flags & ENCODING_UTF) && + (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) || + PROFILE_REPLACE(profile))) { + /* Special sequence \xC0\x80 */ + if (PROFILE_STRICT(profile)) { result = TCL_CONVERT_SYNTAX; break; } - /* - * Convert 0xC080 to real nulls when we are in output mode, - * irrespective of the profile. 
- */ - *dst++ = 0; - src += 2; + if (PROFILE_REPLACE(profile)) { + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + src += 1; /* C0, 80 handled in next loop iteration + since dst limit has to be checked */ + } else { + /* + * Convert 0xC080 to real nulls when we are in output mode, + * irrespective of the profile. + */ + *dst++ = 0; + src += 2; + } } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* diff --git a/tests/cmdAH.test b/tests/cmdAH.test index f2aab52..6aa3c2e 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -184,7 +184,8 @@ set encProfiles {tcl8 strict replace} # TODO - valid sequences for different encodings - shiftjis etc. # Note utf-16, utf-32 missing because they are automatically -# generated based on le/be versions. +# generated based on le/be versions. Also add all ranges from Unicode standard +# Table 3.7 set encValidStrings { ascii ABC \x41\x42\x43 utf-8 A\u0000\u03A9\u8A9E\U00010384 \x41\x00\xCE\xA9\xE8\xAA\x9E\xF0\x90\x8E\x84 @@ -194,22 +195,106 @@ set encValidStrings { utf-32be A\u0000\u03A9\u8A9E\U00010384 \x00\x00\x00\x41\x00\x00\x00\x00\x00\x00\x03\xA9\x00\x00\x8A\x9E\x00\x01\x03\x84 } -# Invalid byte sequences {encoding bytes profile prefix failindex tag} +# Invalid byte sequences. These are driven from a table with format +# {encoding bytes profile expectedresult expectedfailindex ctrl comment} +# # Note tag is used in test id generation as well. The combination -# should be unique for test ids to be unique. -# Note utf-16, utf-32 missing because they are automatically -# generated based on le/be versions. +# should be unique for test ids to be unique. Note utf-16, +# utf-32 missing because they are automatically generated based on le/be +# versions. Each entry potentially results in generation of multiple tests. +# This is controlled by the ctrl field. 
This should be a list of +# zero or more of the following: +# solo - the test data is the string itself +# lead - the test data is the string followed by a valid suffix +# tail - the test data is the string preceded by a prefix +# middle - the test data is the string wrapped by a prefix and suffix +# If the ctrl field is empty it is treated as all of the above +# Note if there is any other value by itself, it will cause the test to +# be skipped. This is intentional to skip known bugs. + # TODO - other encodings and test cases + +# ascii - Any byte above 127 is invalid set encInvalidBytes { + ascii 80 default \u20AC -1 {} {map to cp1252} + ascii 80 tcl8 \u20AC -1 {} {map to cp1252} + ascii 80 replace \uFFFD -1 {} {Smallest invalid byte} + ascii 80 strict {} 0 {} {Smallest invalid byte} + + ascii 81 default \u0081 -1 {knownBug} {map to cp1252} + ascii 82 default \u201A -1 {knownBug} {map to cp1252} + ascii 83 default \u0192 -1 {knownBug} {map to cp1252} + ascii 84 default \u201E -1 {knownBug} {map to cp1252} + ascii 85 default \u2026 -1 {knownBug} {map to cp1252} + ascii 86 default \u2020 -1 {knownBug} {map to cp1252} + ascii 87 default \u2021 -1 {knownBug} {map to cp1252} + ascii 88 default \u0276 -1 {knownBug} {map to cp1252} + ascii 89 default \u2030 -1 {knownBug} {map to cp1252} + ascii 8A default \u0160 -1 {knownBug} {map to cp1252} + ascii 8B default \u2039 -1 {knownBug} {map to cp1252} + ascii 8C default \u0152 -1 {knownBug} {map to cp1252} + ascii 8D default \u008D -1 {knownBug} {map to cp1252} + ascii 8E default \u017D -1 {knownBug} {map to cp1252} + ascii 8F default \u008F -1 {knownBug} {map to cp1252} + ascii 90 default \u0090 -1 {knownBug} {map to cp1252} + ascii 91 default \u2018 -1 {knownBug} {map to cp1252} + ascii 92 default \u2019 -1 {knownBug} {map to cp1252} + ascii 93 default \u201C -1 {knownBug} {map to cp1252} + ascii 94 default \u201D -1 {knownBug} {map to cp1252} + ascii 95 default \u2022 -1 {knownBug} {map to cp1252} + ascii 96 default 
\u2013 -1 {knownBug} {map to cp1252} + ascii 97 default \u2014 -1 {knownBug} {map to cp1252} + ascii 98 default \u02DC -1 {knownBug} {map to cp1252} + ascii 99 default \u2122 -1 {knownBug} {map to cp1252} + ascii 9A default \u0161 -1 {knownBug} {map to cp1252} + ascii 9B default \u203A -1 {knownBug} {map to cp1252} + ascii 9C default \u0153 -1 {knownBug} {map to cp1252} + ascii 9D default \u009D -1 {knownBug} {map to cp1252} + ascii 9E default \u017E -1 {knownBug} {map to cp1252} + ascii 9F default \u0178 -1 {knownBug} {map to cp1252} + + ascii FF default \u00FF -1 {} {Largest invalid byte} + ascii FF tcl8 \u00FF -1 {} {Largest invalid byte} + ascii FF replace \uFFFD -1 {} {Largest invalid byte} + ascii FF strict {} 0 {} {Largest invalid byte} +} + +# Following invalid sequences based on Table 3.7 in the Unicode standard. +# utf-8 C0, C1, F5:FF are invalid bytes ANYWHERE. +# Exception is C080 in non-strict mode. +# +lappend encInvalidBytes {*}{ + utf-8 C0 default \u00C0 -1 {} {C0 is invalid anywhere} + utf-8 C0 tcl8 \u00C0 -1 {} {C0 is invalid anywhere} + utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere} + utf-8 C0 strict {} 0 {} {C0 is invalid anywhere} + + utf-8 C080 default \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} + utf-8 C080 tcl8 \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} + utf-8 C080 replace \uFFFD\uFFFD -1 C080 {} {C080 -> U+0 in Tcl's internal modified UTF8} + utf-8 C080 strict {} 0 {} {C080 -> U+0 in Tcl's internal modified UTF8} + + utf-8 C1 default \u00C1 -1 {} {C1 is invalid everywhere} + utf-8 C1 tcl8 \u00C1 -1 {} {C1 is invalid everywhere} + utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere} + utf-8 C1 strict {} 0 {} {C1 is invalid everywhere} + utf-8 F5 default \u00F5 -1 {} {F5:FF are invalid everywhere} + utf-8 F5 tcl8 \u00F5 -1 {} {F5:FF are invalid everywhere} + utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere} + utf-8 F5 strict {} 0 {} {F5:FF are invalid everywhere} + utf-8 FF default \u00FF 
-1 {} {F5:FF are invalid everywhere} + utf-8 FF tcl8 \u00FF -1 {} {F5:FF are invalid everywhere} + utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere} + utf-8 FF strict {} 0 {} {F5:FF are invalid everywhere} + utf-8 F5908080 default \u00F5 -1 {knownBug} {F5:FF with trailing bytes} +} + +set xxencInvalidBytes { ascii \x41\xe9\x42 default A\u00E9B -1 {non-ASCII} ascii \x41\xe9\x42 tcl8 A\u00E9B -1 {non-ASCII} ascii \x41\xe9\x42 replace A\uFFFDB -1 {non-ASCII} ascii \x41\xe9\x42 strict A 1 {non-ASCII} - - utf-8 \x41\xC0\x42 default A\u00C0B -1 C0 - utf-8 \x41\xC0\x42 tcl8 A\u00C0B -1 C0 - utf-8 \x41\xC0\x42 replace A\uFFFDB -1 C0 - utf-8 \x41\xC0\x42 strict A 1 C0 + utf-8 \x41\x80\x42 default A\u0080B -1 80 utf-8 \x41\x80\x42 tcl8 A\u0080B -1 80 utf-8 \x41\x80\x42 replace A\uFFFDB -1 80 @@ -272,7 +357,7 @@ set encUnencodableStrings { iso8859-1 A\u0141B default \x41\x3f\x42 -1 unencodable iso8859-1 A\u0141B tcl8 \x41\x3f\x42 -1 unencodable - iso8859-1 A\u0141B strict \x41 1 unencodable + iso8859-1 A\u0141B strict \x41 0 unencodable utf-8 A\uD800B default \x41\xed\xa0\x80\x42 -1 High-surrogate utf-8 A\uD800B tcl8 \x41\xed\xa0\x80\x42 -1 High-surrogate @@ -282,12 +367,28 @@ set encUnencodableStrings { utf-8 A\uDC00B strict \x41 1 High-surrogate } + if {$::tcl_platform(byteOrder) eq "littleEndian"} { set endian le } else { set endian be } +# Maps utf-{16,32}{le,be} to utf-16, utf-32 and +# others to "". 
Used to test utf-16, utf-32 based +# on system endianness +proc endianUtf {enc} { + if {$::tcl_platform(byteOrder) eq "littleEndian"} { + set endian le + } else { + set endian be + } + if {$enc eq "utf-16$endian" || $enc eq "utf-32$endian"} { + return [string range $enc 0 5] + } + return "" +} + # # Check errors for invalid number of arguments proc badnumargs {id cmd cmdargs} { @@ -394,9 +495,17 @@ testconvert cmdAH-4.3.12 { # Wrapper for verifying -failindex proc testfailindex {id converter enc data result {profile default}} { if {$profile eq "default"} { - testconvert $id "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + testconvert $id.$enc "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + if {[set enc [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + } } else { - testconvert $id "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + testconvert $id.$enc "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + if {[set enc [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + } } } @@ -410,13 +519,49 @@ foreach {enc string bytes} $encValidStrings { } } -# -failindex - invalid data -foreach {enc bytes profile prefix failidx tag} $encInvalidBytes { - testfailindex cmdAH-4.3.14.$enc.$profile.$tag convertfrom $enc $bytes [list $prefix $failidx] $profile - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. 
- set enc [string range $enc 0 5] - testfailindex cmdAH-4.3.14.$enc.$profile.$tag convertfrom $enc $bytes [list $prefix $failidx] $profile +# -failindex - invalid data for each profile +foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { + # There are multiple test cases based on location of invalid bytes + set bytes [binary format H* $hex] + set prefix A + set suffix B + set prefixLen [string length [encoding convertto $enc $prefix]] + if {$ctrl eq {} || "solo" in $ctrl} { + testfailindex xxcmdAH-4.3.14.$profile.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile + } + if {$ctrl eq {} || "lead" in $ctrl} { + if {$failidx == -1} { + # If success expected + set result $str$suffix + } else { + # Failure expected + set result "" + } + testfailindex xxcmdAH-4.3.14.$profile.$hex.lead convertfrom $enc $bytes$suffix [list $result $failidx] $profile + } + if {$ctrl eq {} || "tail" in $ctrl} { + set expected_failidx $failidx + if {$failidx == -1} { + # If success expected + set result $prefix$str + } else { + # Failure expected + set result $prefix + incr expected_failidx [string length [encoding convertto $enc $prefix]] + } + testfailindex xxcmdAH-4.3.14.$profile.$hex.tail convertfrom $enc $prefix$bytes [list $result $expected_failidx] $profile + } + if {$ctrl eq {} || "middle" in $ctrl} { + set expected_failidx $failidx + if {$failidx == -1} { + # If success expected + set result $prefix$str$suffix + } else { + # Failure expected + set result $prefix + incr expected_failidx [string length [encoding convertto $enc $prefix]] + } + testfailindex xxcmdAH-4.3.14.$profile.$hex.middle convertfrom $enc $prefix$bytes$suffix [list $result $expected_failidx] $profile } } @@ -437,7 +582,8 @@ foreach profile $encProfiles { # Cycle through the various combinations of encodings and profiles # for invalid byte sequences -foreach {enc bytes profile prefix failidx tag} $encInvalidBytes { +foreach {enc hex profile prefix failidx ctrl comment} $encInvalidBytes { 
+ set bytes [binary format H* $hex] if {$failidx eq -1} { set result [list $prefix] } else { @@ -447,18 +593,18 @@ foreach {enc bytes profile prefix failidx tag} $encInvalidBytes { set result [list "unexpected byte sequence starting at index $failidx: *" -returnCodes error -match glob] } if {$profile eq "default"} { - testconvert cmdAH-4.3.15.$enc.$profile.$tag [list encoding convertfrom $enc $bytes] {*}$result + testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom $enc $bytes] {*}$result if {"utf-16$endian" eq $enc} { # utf-16le ->utf-16, utf-32be -> utf32 etc. set enc [string range $enc 0 5] - testconvert cmdAH-4.3.15.$enc.$profile.$tag [list encoding convertfrom $enc $bytes] {*}$result + testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom $enc $bytes] {*}$result } } else { - testconvert cmdAH-4.3.15.$enc.$profile.$tag [list encoding convertfrom -profile $profile $enc $bytes] {*}$result + testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom -profile $profile $enc $bytes] {*}$result if {"utf-16$endian" eq $enc} { # utf-16le ->utf-16, utf-32be -> utf32 etc. set enc [string range $enc 0 5] - testconvert cmdAH-4.3.15.$enc.$profile.$tag [list encoding convertfrom -profile $profile $enc $bytes] {*}$result + testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom -profile $profile $enc $bytes] {*}$result } } } -- cgit v0.12 From 684cbb8f5cc3ed03b9349b0d322b04f1c87cc86a Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 16 Feb 2023 17:15:35 +0000 Subject: Bit more work on encoding test framework. Long way to go. 
--- generic/tclEncoding.c | 65 ++++---- tests/cmdAH.test | 427 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 324 insertions(+), 168 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 8cd970f..470f8f3 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2368,6 +2368,7 @@ UtfToUtfProc( const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; int ch; + int profile; result = TCL_OK; @@ -2385,8 +2386,8 @@ UtfToUtfProc( flags |= PTR2INT(clientData); dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); + profile = TCL_ENCODING_PROFILE_GET(flags); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - int profile = TCL_ENCODING_PROFILE_GET(flags); if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* @@ -2415,15 +2416,15 @@ UtfToUtfProc( (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) || PROFILE_REPLACE(profile))) { /* Special sequence \xC0\x80 */ - if (PROFILE_STRICT(profile)) { - result = TCL_CONVERT_SYNTAX; - break; - } - - if (PROFILE_REPLACE(profile)) { - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - src += 1; /* C0, 80 handled in next loop iteration - since dst limit has to be checked */ + if (flags & ENCODING_INPUT) { + if (PROFILE_REPLACE(profile)) { + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + src += 2; + } else { + /* PROFILE_STRICT */ + result = TCL_CONVERT_SYNTAX; + break; + } } else { /* * Convert 0xC080 to real nulls when we are in output mode, @@ -2432,6 +2433,7 @@ UtfToUtfProc( *dst++ = 0; src += 2; } + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* @@ -2516,32 +2518,37 @@ UtfToUtfProc( /* * A surrogate character is detected, handle especially. */ - /* TODO - what about REPLACE profile? */ if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; } - - low = ch; - len = (src <= srcEnd-3) ? 
TclUtfToUCS4(src, &low) : 0; - - if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { - - if (PROFILE_STRICT(profile)) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; + if (0 && PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + src += len; + // dst += Tcl_UniCharToUtf(ch, dst); + } + else { + low = ch; + len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0; + + if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { + + if (PROFILE_STRICT(profile)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } +cesu8: + *dst++ = (char)(((ch >> 12) | 0xE0) & 0xEF); + *dst++ = (char)(((ch >> 6) | 0x80) & 0xBF); + *dst++ = (char)((ch | 0x80) & 0xBF); + continue; } - cesu8: - *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((ch | 0x80) & 0xBF); - continue; + src += len; + dst += Tcl_UniCharToUtf(ch, dst); + ch = low; } - src += len; - dst += Tcl_UniCharToUtf(ch, dst); - ch = low; } else if (PROFILE_STRICT(profile) && (!(flags & ENCODING_INPUT)) && SURROGATE(ch)) { diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 6aa3c2e..6386658 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -187,19 +187,18 @@ set encProfiles {tcl8 strict replace} # generated based on le/be versions. 
Also add all ranges from Unicode standard # Table 3.7 set encValidStrings { - ascii ABC \x41\x42\x43 - utf-8 A\u0000\u03A9\u8A9E\U00010384 \x41\x00\xCE\xA9\xE8\xAA\x9E\xF0\x90\x8E\x84 - utf-16le A\u0000\u03A9\u8A9E\U00010384 \x41\x00\x00\x00\xA9\x03\x9E\x8A\x00\xD8\x84\xDF - utf-16be A\u0000\u03A9\u8A9E\U00010384 \x00\x41\x00\x00\x03\xA9\x8A\x9E\xD8\x00\xDF\x84 - utf-32le A\u0000\u03A9\u8A9E\U00010384 \x41\x00\x00\x00\x00\x00\x00\x00\xA9\x03\x00\x00\x9E\x8A\x00\x00\x84\x03\x01\x00 - utf-32be A\u0000\u03A9\u8A9E\U00010384 \x00\x00\x00\x41\x00\x00\x00\x00\x00\x00\x03\xA9\x00\x00\x8A\x9E\x00\x01\x03\x84 + ascii ABC 414243 + utf-8 A\u0000\u03A9\u8A9E\U00010384 4100CEA9E8AA9EF0908E84 + utf-16le A\u0000\u03A9\u8A9E\U00010384 41000000A9039E8A00D884DF + utf-16be A\u0000\u03A9\u8A9E\U00010384 0041000003A98A9ED800DF84 + utf-32le A\u0000\u03A9\u8A9E\U00010384 4100000000000000A90300009E8A000084030100 + utf-32be A\u0000\u03A9\u8A9E\U00010384 0000004100000000000003A900008A9E00010384 } # Invalid byte sequences. These are driven from a table with format # {encoding bytes profile expectedresult expectedfailindex ctrl comment} # -# Note tag is used in test id generation as well. The combination -# should be unique for test ids to be unique. Note utf-16, +# should be unique for test ids to be unique. Note utf-16, # utf-32 missing because they are automatically generated based on le/be # versions. Each entry potentially results in generation of multiple tests. # This is controlled by the ctrl field. This should be a list of @@ -214,13 +213,15 @@ set encValidStrings { # TODO - other encodings and test cases -# ascii - Any byte above 127 is invalid -set encInvalidBytes { - ascii 80 default \u20AC -1 {} {map to cp1252} - ascii 80 tcl8 \u20AC -1 {} {map to cp1252} +# ascii - Any byte above 127 is invalid and is mapped +# to the same numeric code point except for the range +# 80-9F which is treated as cp1252. +# This tests the TableToUtfProc code path. 
+lappend encInvalidBytes {*}{ + ascii 80 default \u20AC -1 {knownBug} {map to cp1252} + ascii 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} ascii 80 replace \uFFFD -1 {} {Smallest invalid byte} ascii 80 strict {} 0 {} {Smallest invalid byte} - ascii 81 default \u0081 -1 {knownBug} {map to cp1252} ascii 82 default \u201A -1 {knownBug} {map to cp1252} ascii 83 default \u0192 -1 {knownBug} {map to cp1252} @@ -259,25 +260,80 @@ set encInvalidBytes { ascii FF strict {} 0 {} {Largest invalid byte} } -# Following invalid sequences based on Table 3.7 in the Unicode standard. -# utf-8 C0, C1, F5:FF are invalid bytes ANYWHERE. -# Exception is C080 in non-strict mode. -# +# utf-8 - valid sequences based on Table 3.7 in the Unicode +# standard. +# +# Code Points First Second Third Fourth Byte +# U+0000..U+007F 00..7F +# U+0080..U+07FF C2..DF 80..BF +# U+0800..U+0FFF E0 A0..BF 80..BF +# U+1000..U+CFFF E1..EC 80..BF 80..BF +# U+D000..U+D7FF ED 80..9F 80..BF +# U+E000..U+FFFF EE..EF 80..BF 80..BF +# U+10000..U+3FFFF F0 90..BF 80..BF 80..BF +# U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF +# U+100000..U+10FFFF F4 80..8F 80..BF 80..BF +# +# Tests below are based on the "gaps" in the above table. Note ascii test +# values are repeated because internally a different code path is used +# (UtfToUtfProc). +# Note C0, C1, F5:FF are invalid bytes ANYWHERE. 
Exception is C080 lappend encInvalidBytes {*}{ + utf-8 80 default \u20AC -1 {knownBug} {map to cp1252} + utf-8 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} + utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte} + utf-8 80 strict {} 0 {} {Smallest invalid byte} + utf-8 81 default \u0081 -1 {knownBug} {map to cp1252} + utf-8 82 default \u201A -1 {knownBug} {map to cp1252} + utf-8 83 default \u0192 -1 {knownBug} {map to cp1252} + utf-8 84 default \u201E -1 {knownBug} {map to cp1252} + utf-8 85 default \u2026 -1 {knownBug} {map to cp1252} + utf-8 86 default \u2020 -1 {knownBug} {map to cp1252} + utf-8 87 default \u2021 -1 {knownBug} {map to cp1252} + utf-8 88 default \u0276 -1 {knownBug} {map to cp1252} + utf-8 89 default \u2030 -1 {knownBug} {map to cp1252} + utf-8 8A default \u0160 -1 {knownBug} {map to cp1252} + utf-8 8B default \u2039 -1 {knownBug} {map to cp1252} + utf-8 8C default \u0152 -1 {knownBug} {map to cp1252} + utf-8 8D default \u008D -1 {knownBug} {map to cp1252} + utf-8 8E default \u017D -1 {knownBug} {map to cp1252} + utf-8 8F default \u008F -1 {knownBug} {map to cp1252} + utf-8 90 default \u0090 -1 {knownBug} {map to cp1252} + utf-8 91 default \u2018 -1 {knownBug} {map to cp1252} + utf-8 92 default \u2019 -1 {knownBug} {map to cp1252} + utf-8 93 default \u201C -1 {knownBug} {map to cp1252} + utf-8 94 default \u201D -1 {knownBug} {map to cp1252} + utf-8 95 default \u2022 -1 {knownBug} {map to cp1252} + utf-8 96 default \u2013 -1 {knownBug} {map to cp1252} + utf-8 97 default \u2014 -1 {knownBug} {map to cp1252} + utf-8 98 default \u02DC -1 {knownBug} {map to cp1252} + utf-8 99 default \u2122 -1 {knownBug} {map to cp1252} + utf-8 9A default \u0161 -1 {knownBug} {map to cp1252} + utf-8 9B default \u203A -1 {knownBug} {map to cp1252} + utf-8 9C default \u0153 -1 {knownBug} {map to cp1252} + utf-8 9D default \u009D -1 {knownBug} {map to cp1252} + utf-8 9E default \u017E -1 {knownBug} {map to cp1252} + utf-8 9F default \u0178 -1 {knownBug} {map to cp1252} + 
utf-8 C0 default \u00C0 -1 {} {C0 is invalid anywhere} utf-8 C0 tcl8 \u00C0 -1 {} {C0 is invalid anywhere} - utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere} utf-8 C0 strict {} 0 {} {C0 is invalid anywhere} - + utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere} utf-8 C080 default \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} utf-8 C080 tcl8 \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} - utf-8 C080 replace \uFFFD\uFFFD -1 C080 {} {C080 -> U+0 in Tcl's internal modified UTF8} - utf-8 C080 strict {} 0 {} {C080 -> U+0 in Tcl's internal modified UTF8} - + utf-8 C080 strict {} 0 {} {C080 -> invalid} + utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char} utf-8 C1 default \u00C1 -1 {} {C1 is invalid everywhere} utf-8 C1 tcl8 \u00C1 -1 {} {C1 is invalid everywhere} utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere} utf-8 C1 strict {} 0 {} {C1 is invalid everywhere} + + utf-8 C1 default \u00C1 -1 {} {Require valid trail byte} + utf-8 C1 tcl8 \u00C1 -1 {} {Require valid trail byte} + utf-8 C1 replace \uFFFD -1 {} {Require valid trail byte} + utf-8 C1 strict {} 0 {} {Require valid trail byte} + + utf-8 F5 default \u00F5 -1 {} {F5:FF are invalid everywhere} utf-8 F5 tcl8 \u00F5 -1 {} {F5:FF are invalid everywhere} utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere} @@ -286,14 +342,14 @@ lappend encInvalidBytes {*}{ utf-8 FF tcl8 \u00FF -1 {} {F5:FF are invalid everywhere} utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere} utf-8 FF strict {} 0 {} {F5:FF are invalid everywhere} - utf-8 F5908080 default \u00F5 -1 {knownBug} {F5:FF with trailing bytes} + + utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8} + utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownBug} {Unicode Table 3-9} + utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10} + utf-8 
E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownBug} {Unicode Table 3.11} } set xxencInvalidBytes { - ascii \x41\xe9\x42 default A\u00E9B -1 {non-ASCII} - ascii \x41\xe9\x42 tcl8 A\u00E9B -1 {non-ASCII} - ascii \x41\xe9\x42 replace A\uFFFDB -1 {non-ASCII} - ascii \x41\xe9\x42 strict A 1 {non-ASCII} utf-8 \x41\x80\x42 default A\u0080B -1 80 utf-8 \x41\x80\x42 tcl8 A\u0080B -1 80 @@ -343,31 +399,39 @@ set utf32-le-TODO { } # Strings that cannot be encoded for specific encoding / profiles -# {encoding string profile bytes failindex tag} -# Note tag is used in test id generation as well. The combination -# should be unique for test ids to be unique. +# {encoding string profile exptedresult expectedfailindex ctrl comment} +# should be unique for test ids to be unique. # Note utf-16, utf-32 missing because they are automatically # generated based on le/be versions. +# Each entry potentially results in generation of multiple tests. +# This is controlled by the ctrl field. This should be a list of +# zero or more of the following: +# solo - the test data is the string itself +# lead - the test data is the string followed by a valid suffix +# tail - the test data is the string preceded by a prefix +# middle - the test data is the string wrapped by a prefix and suffix +# If the ctrl field is empty it is treated as all of the above +# Note if there is any other value by itself, it will cause the test to +# be skipped. This is intentional to skip known bugs. 
# TODO - other encodings and test cases # TODO - out of range code point (note cannot be generated by \U notation) set encUnencodableStrings { - ascii A\u00e0B default \x41\x3f\x42 -1 non-ASCII - ascii A\u00e0B tcl8 \x41\x3f\x42 -1 non-ASCII - ascii A\u00e0B strict \x41 1 non-ASCII - - iso8859-1 A\u0141B default \x41\x3f\x42 -1 unencodable - iso8859-1 A\u0141B tcl8 \x41\x3f\x42 -1 unencodable - iso8859-1 A\u0141B strict \x41 0 unencodable - - utf-8 A\uD800B default \x41\xed\xa0\x80\x42 -1 High-surrogate - utf-8 A\uD800B tcl8 \x41\xed\xa0\x80\x42 -1 High-surrogate - utf-8 A\uD800B strict \x41 1 High-surrogate - utf-8 A\uDC00B default \x41\xed\xb0\x80\x42 -1 High-surrogate - utf-8 A\uDC00B tcl8 \x41\xed\xb0\x80\x42 -1 High-surrogate - utf-8 A\uDC00B strict \x41 1 High-surrogate + ascii \u00e0 default 3f -1 {} {unencodable} + ascii \u00e0 tcl8 3f -1 {} {unencodable} + ascii \u00e0 strict {} 0 {} {unencodable} + + iso8859-1 \u0141 default 3f -1 {} unencodable + iso8859-1 \u0141 tcl8 3f -1 {} unencodable + iso8859-1 \u0141 strict {} 0 {} unencodable + + utf-8 \uD800 default eda080 -1 {} High-surrogate + utf-8 \uD800 tcl8 eda080 -1 {} High-surrogate + utf-8 \uD800 strict {} 0 {} High-surrogate + utf-8 \uDC00 default edb080 -1 {} High-surrogate + utf-8 \uDC00 tcl8 edb080 -1 {} High-surrogate + utf-8 \uDC00 strict {} 0 {} High-surrogate } - if {$::tcl_platform(byteOrder) eq "littleEndian"} { set endian le } else { @@ -437,6 +501,40 @@ proc testconvert {id body result args} { {*}$args } +proc testprofile {id converter enc profile data result args} { + if {$profile eq "default"} { + testconvert $id.$enc.$profile [list encoding $converter $enc $data] $result {*}$args + if {[set enc [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc.$profile [list encoding $converter $enc $data] $result {*}$args + } + } else { + testconvert $id.$enc.$profile [list encoding $converter -profile $profile $enc $data] $result {*}$args + if {[set enc 
[endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc.$profile [list encoding $converter -profile $profile $enc $data] $result {*}$args + } + } +} + + +# Wrapper for verifying -failindex +proc testfailindex {id converter enc data result {profile default}} { + if {$profile eq "default"} { + testconvert $id.$enc.$profile "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + if {[set enc [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc.$profile "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + } + } else { + testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + if {[set enc [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + } + } +} + test cmdAH-4.1.1 {encoding} -returnCodes error -body { encoding } -result {wrong # args: should be "encoding subcommand ?arg ...?"} @@ -492,42 +590,110 @@ testconvert cmdAH-4.3.12 { encoding system $system } -# Wrapper for verifying -failindex -proc testfailindex {id converter enc data result {profile default}} { - if {$profile eq "default"} { - testconvert $id.$enc "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result - if {[set enc [endianUtf $enc]] ne ""} { - # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result +# convertfrom, convertfrom -profile + +# convertfrom ?-profile? 
: All valid byte sequences should be accepted by all profiles +foreach {enc str hex} $encValidStrings { + set bytes [binary decode hex $hex] + set prefix A + set suffix B + set prefix_bytes [encoding convertto $enc A] + set suffix_bytes [encoding convertto $enc B] + foreach profile $encProfiles { + testfailindex cmdAH-4.3.13.$hex.solo convertfrom $enc $bytes [list $str -1] $profile + testfailindex cmdAH-4.3.13.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile + testfailindex cmdAH-4.3.13.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile + testfailindex cmdAH-4.3.13.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile + } +} + +# convertfrom ?-profile? : invalid byte sequences +foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { + set bytes [binary format H* $hex] + set prefix A + set suffix B + set prefixLen [string length [encoding convertto $enc $prefix]] + set result [list $str] + # TODO - if the bad byte is unprintable, tcltest errors out when printing a mismatch + # so glob it out in error message pattern for now. 
+ set errorWithoutPrefix [list "unexpected byte sequence starting at index $failidx: *" -returnCodes error -match glob] + set errorWithPrefix [list "unexpected byte sequence starting at index [expr {$failidx+$prefixLen}]: *" -returnCodes error -match glob] + if {$ctrl eq {} || "solo" in $ctrl} { + if {$failidx == -1} { + set result [list $str] + } else { + set result $errorWithoutPrefix } - } else { - testconvert $id.$enc "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result - if {[set enc [endianUtf $enc]] ne ""} { - # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + testprofile cmdAH-4.3.15.$hex.solo convertfrom $enc $profile $bytes {*}$result + } + if {$ctrl eq {} || "lead" in $ctrl} { + if {$failidx == -1} { + set result [list $str$suffix] + } else { + set result $errorWithoutPrefix + } + testprofile cmdAH-4.3.15.$hex.lead convertfrom $enc $profile $bytes$suffix {*}$result + } + if {$ctrl eq {} || "tail" in $ctrl} { + if {$failidx == -1} { + set result [list $prefix$str] + } else { + set result $errorWithPrefix + } + testprofile cmdAH-4.3.15.$hex.tail convertfrom $enc $profile $prefix$bytes {*}$result + } + if {$ctrl eq {} || "middle" in $ctrl} { + if {$failidx == -1} { + set result [list $prefix$str$suffix] + } else { + set result $errorWithPrefix } + testprofile cmdAH-4.3.15.$hex.middle convertfrom $enc $profile $prefix$bytes$suffix {*}$result } } -# -failindex - valid data -foreach {enc string bytes} $encValidStrings { - testfailindex cmdAH-4.3.13.$enc convertfrom $enc $bytes [list $string -1] - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. 
- set enc [string range $enc 0 5] - testfailindex cmdAH-4.3.13.$enc convertfrom $enc $bytes [list $string -1] +proc printable {s} { + set print "" + foreach c [split $s ""] { + set i [scan $c %c] + if {[string is print $c] && ($i <= 127)} { + append print $c + } elseif {$i <= 0xff} { + append print \\x[format %02X $i] + } elseif {$i <= 0xffff} { + append print \\u[format %04X $i] + } else { + append print \\U[format %08X $i] + } } + return $print } -# -failindex - invalid data for each profile +# convertfrom -failindex - valid data +foreach {enc str hex} $encValidStrings { + set bytes [binary decode hex $hex] + set prefix A + set suffix B + set prefix_bytes [encoding convertto $enc A] + set suffix_bytes [encoding convertto $enc B] + foreach profile $encProfiles { + testfailindex cmdAH-4.3.13.$hex.solo convertfrom $enc $bytes [list $str -1] $profile + testfailindex cmdAH-4.3.13.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile + testfailindex cmdAH-4.3.13.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile + testfailindex cmdAH-4.3.13.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile + } +} + + +# convertfrom -failindex, convertfrom -failindex -profile, invalid data foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { # There are multiple test cases based on location of invalid bytes - set bytes [binary format H* $hex] + set bytes [binary decode hex $hex] set prefix A set suffix B set prefixLen [string length [encoding convertto $enc $prefix]] if {$ctrl eq {} || "solo" in $ctrl} { - testfailindex xxcmdAH-4.3.14.$profile.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile } if {$ctrl eq {} || "lead" in $ctrl} { if {$failidx == -1} { @@ -537,7 +703,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { # Failure expected set 
result "" } - testfailindex xxcmdAH-4.3.14.$profile.$hex.lead convertfrom $enc $bytes$suffix [list $result $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix [list $result $failidx] $profile } if {$ctrl eq {} || "tail" in $ctrl} { set expected_failidx $failidx @@ -547,9 +713,9 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } else { # Failure expected set result $prefix - incr expected_failidx [string length [encoding convertto $enc $prefix]] + incr expected_failidx $prefixLen } - testfailindex xxcmdAH-4.3.14.$profile.$hex.tail convertfrom $enc $prefix$bytes [list $result $expected_failidx] $profile + testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix$bytes [list $result $expected_failidx] $profile } if {$ctrl eq {} || "middle" in $ctrl} { set expected_failidx $failidx @@ -559,53 +725,9 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } else { # Failure expected set result $prefix - incr expected_failidx [string length [encoding convertto $enc $prefix]] - } - testfailindex xxcmdAH-4.3.14.$profile.$hex.middle convertfrom $enc $prefix$bytes$suffix [list $result $expected_failidx] $profile - } -} - -# -profile - -# All valid byte sequences should be accepted by all profiles -foreach profile $encProfiles { - set i 0 - foreach {enc string bytes} $encValidStrings { - testconvert cmdAH-4.3.15.$enc.$profile.[incr i] [list encoding convertfrom $enc $bytes] $string - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. 
- set enc [string range $enc 0 5] - testconvert cmdAH-4.3.15.$enc.$profile.[incr i] [list encoding convertfrom $enc $bytes] $string - } - } -} - -# Cycle through the various combinations of encodings and profiles -# for invalid byte sequences -foreach {enc hex profile prefix failidx ctrl comment} $encInvalidBytes { - set bytes [binary format H* $hex] - if {$failidx eq -1} { - set result [list $prefix] - } else { - set badbyte "'\\x[string toupper [binary encode hex [string index $bytes $failidx]]]'" - # TODO - if the bad byte is unprintable, tcltest errors out when printing a mismatch - # so glob it out for now. - set result [list "unexpected byte sequence starting at index $failidx: *" -returnCodes error -match glob] - } - if {$profile eq "default"} { - testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom $enc $bytes] {*}$result - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom $enc $bytes] {*}$result - } - } else { - testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom -profile $profile $enc $bytes] {*}$result - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testconvert cmdAH-4.3.15.$enc.$profile.$hex [list encoding convertfrom -profile $profile $enc $bytes] {*}$result + incr expected_failidx $prefixLen } + testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix$bytes$suffix [list $result $expected_failidx] $profile } } @@ -646,41 +768,67 @@ testconvert cmdAH-4.4.12 { # -failindex - valid data foreach {enc string bytes} $encValidStrings { testfailindex cmdAH-4.4.13.$enc convertto $enc $string [list $bytes -1] - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. 
- set enc [string range $enc 0 5] - testfailindex cmdAH-4.4.13.$enc convertto $enc $string [list $bytes -1] - } } # -failindex - invalid data -foreach {enc string profile bytes failidx tag} $encUnencodableStrings { - testfailindex cmdAH-4.4.14.$enc.$profile.$tag convertto $enc $string [list $bytes $failidx] $profile - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testfailindex cmdAH-4.4.14.$enc.$profile.$tag convertto $enc $string [list $bytes $failidx] $profile +foreach {enc string profile hex failidx ctrl comment} $encUnencodableStrings { + set bytes [binary decode hex $hex] + set prefix A + set suffix B + set prefixLen [string length [encoding convertto $enc $prefix]] + if {$ctrl eq {} || "solo" in $ctrl} { + testfailindex cmdAH-4.4.14.$string.solo convertto $enc $string [list $bytes $failidx] $profile + } + if {$ctrl eq {} || "lead" in $ctrl} { + if {$failidx == -1} { + # If success expected + set result $bytes$suffix + } else { + # Failure expected + set result "" + } + testfailindex cmdAH-4.4.14.$string.lead convertto $enc $string$suffix [list $result $failidx] $profile + } + if {$ctrl eq {} || "tail" in $ctrl} { + set expected_failidx $failidx + if {$failidx == -1} { + # If success expected + set result $prefix$bytes + } else { + # Failure expected + set result $prefix + incr expected_failidx $prefixLen + } + testfailindex cmdAH-4.4.14.$string.tail convertto $enc $prefix$string [list $result $expected_failidx] $profile + } + if {$ctrl eq {} || "middle" in $ctrl} { + set expected_failidx $failidx + if {$failidx == -1} { + # If success expected + set result $prefix$bytes$suffix + } else { + # Failure expected + set result $prefix + incr expected_failidx $prefixLen + } + testfailindex cmdAH-4.4.14.$string.middle convertto $enc $prefix$string$suffix [list $result $expected_failidx] $profile } } -# -profile +# convertto -profile # All valid byte sequences should be accepted by all profiles foreach 
profile $encProfiles { set i 0 foreach {enc string bytes} $encValidStrings { - testconvert cmdAH-4.4.15.$enc.$profile.[incr i] [list encoding convertto $enc $string] $bytes - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - testconvert cmdAH-4.4.15.$enc.$profile.[incr i] [list encoding convertto $enc $string] $bytes - } + testprofile cmdAH-4.4.15 convertto $enc $profile $string $bytes } } # Cycle through the various combinations of encodings and profiles # for invalid byte sequences -foreach {enc string profile bytes failidx tag} $encUnencodableStrings { +foreach {enc string profile hex failidx ctrl comment} $encUnencodableStrings { + set bytes [binary decode hex $hex] if {$failidx eq -1} { set result [list $bytes] } else { @@ -688,19 +836,20 @@ foreach {enc string profile bytes failidx tag} $encUnencodableStrings { # so glob it out for now. set result [list "unexpected character at index $failidx: *" -returnCodes error -match glob] } + #testprofile xx convertto $enc $profile $string {*}$result if {$profile eq "default"} { - testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result + # testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result if {"utf-16$endian" eq $enc} { # utf-16le ->utf-16, utf-32be -> utf32 etc. set enc [string range $enc 0 5] - testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result + # xxtestconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result } } else { - testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result + # testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result if {"utf-16$endian" eq $enc} { # utf-16le ->utf-16, utf-32be -> utf32 etc. 
set enc [string range $enc 0 5] - testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result + # testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result } } } -- cgit v0.12 From fdbb12eced9b528f6246424cf0916b620f1783bc Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Fri, 17 Feb 2023 18:59:28 +0000 Subject: Part way through utf-8 test equivalence classes --- generic/tclEncoding.c | 4 +- library/tcltest/tcltest.tcl | 37 +++- tests/cmdAH.test | 503 +++++++++++++++++++++++++++----------------- 3 files changed, 342 insertions(+), 202 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index a11e696..4d5743c 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2525,10 +2525,8 @@ UtfToUtfProc( src = saveSrc; break; } - if (0 && PROFILE_REPLACE(profile)) { + if (PROFILE_REPLACE(profile)) { ch = UNICODE_REPLACE_CHAR; - src += len; - // dst += Tcl_UniCharToUtf(ch, dst); } else { low = ch; diff --git a/library/tcltest/tcltest.tcl b/library/tcltest/tcltest.tcl index 94010a7..9ca7b09 100644 --- a/library/tcltest/tcltest.tcl +++ b/library/tcltest/tcltest.tcl @@ -1134,6 +1134,39 @@ proc tcltest::SafeFetch {n1 n2 op} { } } + +# tcltest::Asciify -- +# +# Transforms the passed string to contain only printable ascii characters. +# Useful for printing to terminals. Non-printables are mapped to +# \x, \u or \U sequences. +# +# Arguments: +# s - string to transform +# +# Results: +# The transformed strings +# +# Side effects: +# None. 
+ +proc tcltest::Asciify {s} { + set print "" + foreach c [split $s ""] { + set i [scan $c %c] + if {[string is print $c] && ($i <= 127)} { + append print $c + } elseif {$i <= 0xff} { + append print \\x[format %02X $i] + } elseif {$i <= 0xffff} { + append print \\u[format %04X $i] + } else { + append print \\U[format %08X $i] + } + } + return $print +} + # tcltest::ConstraintInitializer -- # # Get or set a script that when evaluated in the tcltest namespace @@ -2222,12 +2255,12 @@ proc tcltest::test {name description args} { puts [outputChannel] "---- Error testing result: $scriptMatch" } else { try { - puts [outputChannel] "---- Result was:\n$actualAnswer" + puts [outputChannel] "---- Result was:\n[Asciify $actualAnswer]" } on error {errMsg errCode} { puts [outputChannel] "---- Result was:\n" } puts [outputChannel] "---- Result should have been\ - ($match matching):\n$result" + ($match matching):\n[Asciify $result]" } } if {$errorCodeFailure} { diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 6386658..df28b2e 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -181,6 +181,7 @@ set "numargErrors(encoding names)" {wrong # args: should be "encoding names"} set "numargErrors(encoding profiles)" {wrong # args: should be "encoding profiles"} set encProfiles {tcl8 strict replace} +set encDefaultProfile tcl8; # Should reflect the default from implementation # TODO - valid sequences for different encodings - shiftjis etc. # Note utf-16, utf-32 missing because they are automatically @@ -218,43 +219,41 @@ set encValidStrings { # 80-9F which is treated as cp1252. # This tests the TableToUtfProc code path. 
lappend encInvalidBytes {*}{ - ascii 80 default \u20AC -1 {knownBug} {map to cp1252} ascii 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} ascii 80 replace \uFFFD -1 {} {Smallest invalid byte} ascii 80 strict {} 0 {} {Smallest invalid byte} - ascii 81 default \u0081 -1 {knownBug} {map to cp1252} - ascii 82 default \u201A -1 {knownBug} {map to cp1252} - ascii 83 default \u0192 -1 {knownBug} {map to cp1252} - ascii 84 default \u201E -1 {knownBug} {map to cp1252} - ascii 85 default \u2026 -1 {knownBug} {map to cp1252} - ascii 86 default \u2020 -1 {knownBug} {map to cp1252} - ascii 87 default \u2021 -1 {knownBug} {map to cp1252} - ascii 88 default \u0276 -1 {knownBug} {map to cp1252} - ascii 89 default \u2030 -1 {knownBug} {map to cp1252} - ascii 8A default \u0160 -1 {knownBug} {map to cp1252} - ascii 8B default \u2039 -1 {knownBug} {map to cp1252} - ascii 8C default \u0152 -1 {knownBug} {map to cp1252} - ascii 8D default \u008D -1 {knownBug} {map to cp1252} - ascii 8E default \u017D -1 {knownBug} {map to cp1252} - ascii 8F default \u008F -1 {knownBug} {map to cp1252} - ascii 90 default \u0090 -1 {knownBug} {map to cp1252} - ascii 91 default \u2018 -1 {knownBug} {map to cp1252} - ascii 92 default \u2019 -1 {knownBug} {map to cp1252} - ascii 93 default \u201C -1 {knownBug} {map to cp1252} - ascii 94 default \u201D -1 {knownBug} {map to cp1252} - ascii 95 default \u2022 -1 {knownBug} {map to cp1252} - ascii 96 default \u2013 -1 {knownBug} {map to cp1252} - ascii 97 default \u2014 -1 {knownBug} {map to cp1252} - ascii 98 default \u02DC -1 {knownBug} {map to cp1252} - ascii 99 default \u2122 -1 {knownBug} {map to cp1252} - ascii 9A default \u0161 -1 {knownBug} {map to cp1252} - ascii 9B default \u203A -1 {knownBug} {map to cp1252} - ascii 9C default \u0153 -1 {knownBug} {map to cp1252} - ascii 9D default \u009D -1 {knownBug} {map to cp1252} - ascii 9E default \u017E -1 {knownBug} {map to cp1252} - ascii 9F default \u0178 -1 {knownBug} {map to cp1252} - - ascii FF default 
\u00FF -1 {} {Largest invalid byte} + ascii 81 tcl8 \u0081 -1 {knownBug} {map to cp1252} + ascii 82 tcl8 \u201A -1 {knownBug} {map to cp1252} + ascii 83 tcl8 \u0192 -1 {knownBug} {map to cp1252} + ascii 84 tcl8 \u201E -1 {knownBug} {map to cp1252} + ascii 85 tcl8 \u2026 -1 {knownBug} {map to cp1252} + ascii 86 tcl8 \u2020 -1 {knownBug} {map to cp1252} + ascii 87 tcl8 \u2021 -1 {knownBug} {map to cp1252} + ascii 88 tcl8 \u0276 -1 {knownBug} {map to cp1252} + ascii 89 tcl8 \u2030 -1 {knownBug} {map to cp1252} + ascii 8A tcl8 \u0160 -1 {knownBug} {map to cp1252} + ascii 8B tcl8 \u2039 -1 {knownBug} {map to cp1252} + ascii 8C tcl8 \u0152 -1 {knownBug} {map to cp1252} + ascii 8D tcl8 \u008D -1 {knownBug} {map to cp1252} + ascii 8E tcl8 \u017D -1 {knownBug} {map to cp1252} + ascii 8F tcl8 \u008F -1 {knownBug} {map to cp1252} + ascii 90 tcl8 \u0090 -1 {knownBug} {map to cp1252} + ascii 91 tcl8 \u2018 -1 {knownBug} {map to cp1252} + ascii 92 tcl8 \u2019 -1 {knownBug} {map to cp1252} + ascii 93 tcl8 \u201C -1 {knownBug} {map to cp1252} + ascii 94 tcl8 \u201D -1 {knownBug} {map to cp1252} + ascii 95 tcl8 \u2022 -1 {knownBug} {map to cp1252} + ascii 96 tcl8 \u2013 -1 {knownBug} {map to cp1252} + ascii 97 tcl8 \u2014 -1 {knownBug} {map to cp1252} + ascii 98 tcl8 \u02DC -1 {knownBug} {map to cp1252} + ascii 99 tcl8 \u2122 -1 {knownBug} {map to cp1252} + ascii 9A tcl8 \u0161 -1 {knownBug} {map to cp1252} + ascii 9B tcl8 \u203A -1 {knownBug} {map to cp1252} + ascii 9C tcl8 \u0153 -1 {knownBug} {map to cp1252} + ascii 9D tcl8 \u009D -1 {knownBug} {map to cp1252} + ascii 9E tcl8 \u017E -1 {knownBug} {map to cp1252} + ascii 9F tcl8 \u0178 -1 {knownBug} {map to cp1252} + ascii FF tcl8 \u00FF -1 {} {Largest invalid byte} ascii FF replace \uFFFD -1 {} {Largest invalid byte} ascii FF strict {} 0 {} {Largest invalid byte} @@ -279,121 +278,188 @@ lappend encInvalidBytes {*}{ # (UtfToUtfProc). # Note C0, C1, F5:FF are invalid bytes ANYWHERE. 
Exception is C080 lappend encInvalidBytes {*}{ - utf-8 80 default \u20AC -1 {knownBug} {map to cp1252} + utf-8 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} utf-8 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte} utf-8 80 strict {} 0 {} {Smallest invalid byte} - utf-8 81 default \u0081 -1 {knownBug} {map to cp1252} - utf-8 82 default \u201A -1 {knownBug} {map to cp1252} - utf-8 83 default \u0192 -1 {knownBug} {map to cp1252} - utf-8 84 default \u201E -1 {knownBug} {map to cp1252} - utf-8 85 default \u2026 -1 {knownBug} {map to cp1252} - utf-8 86 default \u2020 -1 {knownBug} {map to cp1252} - utf-8 87 default \u2021 -1 {knownBug} {map to cp1252} - utf-8 88 default \u0276 -1 {knownBug} {map to cp1252} - utf-8 89 default \u2030 -1 {knownBug} {map to cp1252} - utf-8 8A default \u0160 -1 {knownBug} {map to cp1252} - utf-8 8B default \u2039 -1 {knownBug} {map to cp1252} - utf-8 8C default \u0152 -1 {knownBug} {map to cp1252} - utf-8 8D default \u008D -1 {knownBug} {map to cp1252} - utf-8 8E default \u017D -1 {knownBug} {map to cp1252} - utf-8 8F default \u008F -1 {knownBug} {map to cp1252} - utf-8 90 default \u0090 -1 {knownBug} {map to cp1252} - utf-8 91 default \u2018 -1 {knownBug} {map to cp1252} - utf-8 92 default \u2019 -1 {knownBug} {map to cp1252} - utf-8 93 default \u201C -1 {knownBug} {map to cp1252} - utf-8 94 default \u201D -1 {knownBug} {map to cp1252} - utf-8 95 default \u2022 -1 {knownBug} {map to cp1252} - utf-8 96 default \u2013 -1 {knownBug} {map to cp1252} - utf-8 97 default \u2014 -1 {knownBug} {map to cp1252} - utf-8 98 default \u02DC -1 {knownBug} {map to cp1252} - utf-8 99 default \u2122 -1 {knownBug} {map to cp1252} - utf-8 9A default \u0161 -1 {knownBug} {map to cp1252} - utf-8 9B default \u203A -1 {knownBug} {map to cp1252} - utf-8 9C default \u0153 -1 {knownBug} {map to cp1252} - utf-8 9D default \u009D -1 {knownBug} {map to cp1252} - utf-8 9E default \u017E -1 {knownBug} {map to cp1252} - utf-8 9F 
default \u0178 -1 {knownBug} {map to cp1252} - - utf-8 C0 default \u00C0 -1 {} {C0 is invalid anywhere} + utf-8 81 tcl8 \u0081 -1 {knownBug} {map to cp1252} + utf-8 82 tcl8 \u201A -1 {knownBug} {map to cp1252} + utf-8 83 tcl8 \u0192 -1 {knownBug} {map to cp1252} + utf-8 84 tcl8 \u201E -1 {knownBug} {map to cp1252} + utf-8 85 tcl8 \u2026 -1 {knownBug} {map to cp1252} + utf-8 86 tcl8 \u2020 -1 {knownBug} {map to cp1252} + utf-8 87 tcl8 \u2021 -1 {knownBug} {map to cp1252} + utf-8 88 tcl8 \u0276 -1 {knownBug} {map to cp1252} + utf-8 89 tcl8 \u2030 -1 {knownBug} {map to cp1252} + utf-8 8A tcl8 \u0160 -1 {knownBug} {map to cp1252} + utf-8 8B tcl8 \u2039 -1 {knownBug} {map to cp1252} + utf-8 8C tcl8 \u0152 -1 {knownBug} {map to cp1252} + utf-8 8D tcl8 \u008D -1 {knownBug} {map to cp1252} + utf-8 8E tcl8 \u017D -1 {knownBug} {map to cp1252} + utf-8 8F tcl8 \u008F -1 {knownBug} {map to cp1252} + utf-8 90 tcl8 \u0090 -1 {knownBug} {map to cp1252} + utf-8 91 tcl8 \u2018 -1 {knownBug} {map to cp1252} + utf-8 92 tcl8 \u2019 -1 {knownBug} {map to cp1252} + utf-8 93 tcl8 \u201C -1 {knownBug} {map to cp1252} + utf-8 94 tcl8 \u201D -1 {knownBug} {map to cp1252} + utf-8 95 tcl8 \u2022 -1 {knownBug} {map to cp1252} + utf-8 96 tcl8 \u2013 -1 {knownBug} {map to cp1252} + utf-8 97 tcl8 \u2014 -1 {knownBug} {map to cp1252} + utf-8 98 tcl8 \u02DC -1 {knownBug} {map to cp1252} + utf-8 99 tcl8 \u2122 -1 {knownBug} {map to cp1252} + utf-8 9A tcl8 \u0161 -1 {knownBug} {map to cp1252} + utf-8 9B tcl8 \u203A -1 {knownBug} {map to cp1252} + utf-8 9C tcl8 \u0153 -1 {knownBug} {map to cp1252} + utf-8 9D tcl8 \u009D -1 {knownBug} {map to cp1252} + utf-8 9E tcl8 \u017E -1 {knownBug} {map to cp1252} + utf-8 9F tcl8 \u0178 -1 {knownBug} {map to cp1252} + utf-8 C0 tcl8 \u00C0 -1 {} {C0 is invalid anywhere} utf-8 C0 strict {} 0 {} {C0 is invalid anywhere} utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere} - utf-8 C080 default \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} utf-8 C080 
tcl8 \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} utf-8 C080 strict {} 0 {} {C080 -> invalid} utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char} - utf-8 C1 default \u00C1 -1 {} {C1 is invalid everywhere} utf-8 C1 tcl8 \u00C1 -1 {} {C1 is invalid everywhere} utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere} utf-8 C1 strict {} 0 {} {C1 is invalid everywhere} - utf-8 C1 default \u00C1 -1 {} {Require valid trail byte} - utf-8 C1 tcl8 \u00C1 -1 {} {Require valid trail byte} - utf-8 C1 replace \uFFFD -1 {} {Require valid trail byte} - utf-8 C1 strict {} 0 {} {Require valid trail byte} - + utf-8 C2 tcl8 \u00C2 -1 {} {Missing trail byte} + utf-8 C2 replace \uFFFD -1 {} {Missing trail byte} + utf-8 C2 strict {} 0 {} {Missing trail byte} + utf-8 C27F tcl8 \u00C2\x7F -1 {} {Trail byte must be 80:BF} + utf-8 C27F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} + utf-8 C27F strict {} 0 {} {Trail byte must be 80:BF} + utf-8 DF tcl8 \u00DF -1 {} {Missing trail byte} + utf-8 DF replace \uFFFD -1 {} {Missing trail byte} + utf-8 DF strict {} 0 {} {Missing trail byte} + utf-8 DF7F tcl8 \u00DF\x7F -1 {} {Trail byte must be 80:BF} + utf-8 DF7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} + utf-8 DF7F strict {} 0 {} {Trail byte must be 80:BF} + utf-8 DFE0A080 tcl8 \u00DF\u0800 -1 {} {Invalid trail byte is start of valid sequence} + utf-8 DFE0A080 replace \uFFFD\u0800 -1 {} {Invalid trail byte is start of valid sequence} + utf-8 DFE0A080 strict {} 0 {} {Invalid trail byte is start of valid sequence} + + utf-8 E0 tcl8 \u00E0 -1 {} {Missing trail byte} + utf-8 E0 replace \uFFFD -1 {} {Missing trail byte} + utf-8 E0 strict {} 0 {} {Missing trail byte} + utf-8 E080 tcl8 \u00E0\u20AC -1 {knownBug} {First trail byte must be A0:BF} + utf-8 E080 replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} + utf-8 E080 strict {} 0 {} {First trail byte must be A0:BF} + utf-8 E09F tcl8 \u00E0\u0178 -1 {knownBug} {First trail byte must be A0:BF} + utf-8 
E09F replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} + utf-8 E09F strict {} 0 {} {First trail byte must be A0:BF} + utf-8 E0A07F tcl8 \u00E0\u00A0\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 E0A07F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 E0A07F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 E0BF7F tcl8 \u00E0\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 E0BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 E0BF7F strict {} 0 {} {Second trail byte must be 80:BF} + + utf-8 E1 tcl8 \u00E1 -1 {} {Missing trail byte} + utf-8 E1 replace \uFFFD -1 {} {Missing trail byte} + utf-8 E1 strict {} 0 {} {Missing trail byte} + utf-8 E17F tcl8 \u00E1\x7F -1 {} {Trail byte must be 80:BF} + utf-8 E17F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} + utf-8 E17F strict {} 0 {} {Trail byte must be 80:BF} + utf-8 E1807F tcl8 \u00E1\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 E1807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 E1807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 E1BF7F tcl8 \u00E1\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 E1BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 E1BF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EC tcl8 \u00EC -1 {} {Missing trail byte} + utf-8 EC replace \uFFFD -1 {} {Missing trail byte} + utf-8 EC strict {} 0 {} {Missing trail byte} + utf-8 EC7F tcl8 \u00EC\x7F -1 {} {Trail byte must be 80:BF} + utf-8 EC7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} + utf-8 EC7F strict {} 0 {} {Trail byte must be 80:BF} + utf-8 EC807F tcl8 \u00EC\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 EC807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EC807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 ECBF7F tcl8 \u00EC\u00BF\x7F -1 {} {Second trail byte must be 80:BF} 
+ utf-8 ECBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 ECBF7F strict {} 0 {} {Second trail byte must be 80:BF} + + utf-8 ED tcl8 \u00ED -1 {} {Missing trail byte} + utf-8 ED replace \uFFFD -1 {} {Missing trail byte} + utf-8 ED strict {} 0 {} {Missing trail byte} + utf-8 ED7F tcl8 \u00ED\u7F -1 {knownBug} {First trail byte must be 80:9F} + utf-8 ED7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:9F} + utf-8 ED7F strict {} 0 {} {First trail byte must be 80:9F} + utf-8 EDA0 tcl8 \u00ED\u00A0 -1 {knownBug} {First trail byte must be 80:9F} + utf-8 EDA0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:9F} + utf-8 EDA0 strict {} 0 {} {First trail byte must be 80:9F} + utf-8 ED807F tcl8 \u00ED\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 ED807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 ED807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 ED9F7F tcl8 \u00ED\u0178\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 ED9F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 ED9F7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EDA080 tcl8 \uD800 -1 {} {High surrogate} + utf-8 EDA080 replace \uFFFD -1 {} {High surrogate} + utf-8 EDA080 strict {} 0 {} {High surrogate} + utf-8 EDAFBF tcl8 \uDBFF -1 {} {High surrogate} + utf-8 EDAFBF replace \uFFFD -1 {} {High surrogate} + utf-8 EDAFBF strict {} 0 {} {High surrogate} + utf-8 EDB080 tcl8 \uDC00 -1 {} {Low surrogate} + utf-8 EDB080 replace \uFFFD -1 {} {Low surrogate} + utf-8 EDB080 strict {} 0 {} {Low surrogate} + utf-8 EDBFBF tcl8 \uDFFF -1 {} {Low surrogate} + utf-8 EDBFBF replace \uFFFD -1 {} {Low surrogate} + utf-8 EDBFBF strict {} 0 {} {Low surrogate} + utf-8 EDA080EDB080 tcl8 \U00010000 -1 {} {High low surrogate pair} + utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {} {High low surrogate pair} + utf-8 EDA080EDB080 strict {} 0 {} {High low surrogate pair} + utf-8 EDAFBFEDBFBF 
tcl8 \U0010FFFF -1 {} {High low surrogate pair} + utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {} {High low surrogate pair} + utf-8 EDAFBFEDBFBF strict {} 0 {} {High low surrogate pair} - utf-8 F5 default \u00F5 -1 {} {F5:FF are invalid everywhere} utf-8 F5 tcl8 \u00F5 -1 {} {F5:FF are invalid everywhere} utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere} utf-8 F5 strict {} 0 {} {F5:FF are invalid everywhere} - utf-8 FF default \u00FF -1 {} {F5:FF are invalid everywhere} utf-8 FF tcl8 \u00FF -1 {} {F5:FF are invalid everywhere} utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere} utf-8 FF strict {} 0 {} {F5:FF are invalid everywhere} utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8} - utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownBug} {Unicode Table 3-9} + utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3-9} utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10} - utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownBug} {Unicode Table 3.11} + utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3.11} } set xxencInvalidBytes { - utf-8 \x41\x80\x42 default A\u0080B -1 80 utf-8 \x41\x80\x42 tcl8 A\u0080B -1 80 utf-8 \x41\x80\x42 replace A\uFFFDB -1 80 utf-8 \x41\x80\x42 strict A 1 80 - utf-8 \x41\xC0\x80\x42 default A\u0000B -1 C080 utf-8 \x41\xC0\x80\x42 tcl8 A\u0000B -1 C080 utf-8 \x41\xC0\x80\x42 strict A 1 C080 - utf-8 \x41\xC1\x42 default A\u00C1B -1 C1 utf-8 \x41\xC1\x42 tcl8 A\u00C1B -1 C1 utf-8 \x41\xC1\x42 replace A\uFFFDB -1 C1 utf-8 \x41\xC1\x42 strict A 1 C1 - utf-8 \x41\xC2\x42 default A\u00C2B -1 C2-nontrail utf-8 \x41\xC2\x42 tcl8 A\u00C2B -1 C2-nontrail utf-8 \x41\xC2\x42 replace A\uFFFDB -1 C2-nontrail utf-8 \x41\xC2\x42 strict A 1 C2-nontrail - 
utf-8 \x41\xC2 default A\u00C2 -1 C2-incomplete utf-8 \x41\xC2 tcl8 A\u00C2 -1 C2-incomplete utf-8 \x41\xC2 replace A\uFFFD -1 C2-incomplete utf-8 \x41\xC2 strict A 1 C2-incomplete - utf-8 A\xed\xa0\x80B default A\uD800B -1 High-surrogate utf-8 A\xed\xa0\x80B tcl8 A\uD800B -1 High-surrogate utf-8 A\xed\xa0\x80B strict A 1 High-surrogate - utf-8 A\xed\xb0\x80B default A\uDC00B -1 Low-surrogate utf-8 A\xed\xb0\x80B tcl8 A\uDC00B -1 Low-surrogate utf-8 A\xed\xb0\x80B strict A 1 Low-surrogate - utf-8 \xed\xa0\x80\xed\xb0\x80 default \U00010000 -1 High-low-surrogate utf-8 \xed\xa0\x80\xed\xb0\x80 tcl8 \U00010000 -1 High-low-surrogate utf-8 \xed\xa0\x80\xed\xb0\x80 strict {} 0 High-low-surrogate } set utf32-le-TODO { - utf-32le \x00\xD8\x00\x00 default \uD800 -1 {High-surrogate} utf-32le \x00\xD8\x00\x00 tcl8 \uD800 -1 {High-surrogate} utf-32le \x00\xD8\x00\x00 strict "" 0 {High-surrogate} - utf-32le \x00\xDC\x00\x00 default \uDC00 -1 {Low-surrogate} utf-32le \x00\xDC\x00\x00 tcl8 \uDC00 -1 {Low-surrogate} utf-32le \x00\xDC\x00\x00 strict "" 0 {Low-surrogate} - utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 default \uD800\uDC00 -1 {High-low-surrogate} utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 tcl8 \uD800\uDC00 -1 {High-low-surrogate} utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 strict "" 0 {High-low-surrogate} - utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 default \uDC00\uD800 -1 {High-low-surrogate} utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 tcl8 \uDC00\uD800 -1 {High-low-surrogate} utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 strict "" 0 {High-low-surrogate} - utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 default A\uD800B -1 {High-surrogate-middle} utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 tcl8 A\uD800B -1 {High-surrogate-middle} utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 strict A 4 {High-surrogate-middle} } @@ -416,18 +482,14 @@ set utf32-le-TODO { # TODO - other encodings and test cases # TODO - out of range code point (note cannot be 
generated by \U notation) set encUnencodableStrings { - ascii \u00e0 default 3f -1 {} {unencodable} ascii \u00e0 tcl8 3f -1 {} {unencodable} ascii \u00e0 strict {} 0 {} {unencodable} - iso8859-1 \u0141 default 3f -1 {} unencodable iso8859-1 \u0141 tcl8 3f -1 {} unencodable iso8859-1 \u0141 strict {} 0 {} unencodable - utf-8 \uD800 default eda080 -1 {} High-surrogate utf-8 \uD800 tcl8 eda080 -1 {} High-surrogate utf-8 \uD800 strict {} 0 {} High-surrogate - utf-8 \uDC00 default edb080 -1 {} High-surrogate utf-8 \uDC00 tcl8 edb080 -1 {} High-surrogate utf-8 \uDC00 strict {} 0 {} High-surrogate } @@ -453,6 +515,24 @@ proc endianUtf {enc} { return "" } +# Map arbitrary strings to printable form in ASCII. +proc printable {s} { + set print "" + foreach c [split $s ""] { + set i [scan $c %c] + if {[string is print $c] && ($i <= 127)} { + append print $c + } elseif {$i <= 0xff} { + append print \\x[format %02X $i] + } elseif {$i <= 0xffff} { + append print \\u[format %04X $i] + } else { + append print \\U[format %08X $i] + } + } + return $print +} + # # Check errors for invalid number of arguments proc badnumargs {id cmd cmdargs} { @@ -501,36 +581,45 @@ proc testconvert {id body result args} { {*}$args } +# Wrapper to verify encoding convert{to,from} ?-profile? +# Generates tests for compiled and uncompiled implementation. 
+# Also generates utf-{16,32} tests if passed encoding is utf-{16,32}{le,be} +# The enc and profile are appended to id to generate the test id proc testprofile {id converter enc profile data result args} { - if {$profile eq "default"} { - testconvert $id.$enc.$profile [list encoding $converter $enc $data] $result {*}$args - if {[set enc [endianUtf $enc]] ne ""} { - # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc.$profile [list encoding $converter $enc $data] $result {*}$args - } - } else { - testconvert $id.$enc.$profile [list encoding $converter -profile $profile $enc $data] $result {*}$args - if {[set enc [endianUtf $enc]] ne ""} { + testconvert $id.$enc.$profile [list encoding $converter -profile $profile $enc $data] $result {*}$args + if {[set enc2 [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc2.$profile [list encoding $converter -profile $profile $enc2 $data] $result {*}$args + } + + # If this is the default profile, generate a test without specifying profile + if {$profile eq $::encDefaultProfile} { + testconvert $id.$enc.default [list encoding $converter $enc $data] $result {*}$args + if {[set enc2 [endianUtf $enc]] ne ""} { # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc.$profile [list encoding $converter -profile $profile $enc $data] $result {*}$args + testconvert $id.$enc2.default [list encoding $converter $enc2 $data] $result {*}$args } } } -# Wrapper for verifying -failindex +# Wrapper to verify encoding convert{to,from} -failindex ?-profile? +# Generates tests for compiled and uncompiled implementation. 
+# Also generates utf-{16,32} tests if passed encoding is utf-{16,32}{le,be} +# The enc and profile are appended to id to generate the test id proc testfailindex {id converter enc data result {profile default}} { - if {$profile eq "default"} { - testconvert $id.$enc.$profile "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result - if {[set enc [endianUtf $enc]] ne ""} { - # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc.$profile "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result - } - } else { - testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result - if {[set enc [endianUtf $enc]] ne ""} { + testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + if {[set enc2 [endianUtf $enc]] ne ""} { + # If utf{16,32}-{le,be}, also do utf{16,32} + testconvert $id.$enc2.$profile "list \[encoding $converter -profile $profile -failindex idx $enc2 $data] \[set idx]" $result + } + + # If this is the default profile, generate a test without specifying profile + if {$profile eq $::encDefaultProfile} { + testconvert $id.$enc.default "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + if {[set enc2 [endianUtf $enc]] ne ""} { # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result + testconvert $id.$enc2.default "list \[encoding $converter -failindex idx $enc2 $data] \[set idx]" $result } } } @@ -590,9 +679,7 @@ testconvert cmdAH-4.3.12 { encoding system $system } -# convertfrom, convertfrom -profile - -# convertfrom ?-profile? : All valid byte sequences should be accepted by all profiles +# convertfrom ?-profile? 
: valid byte sequences foreach {enc str hex} $encValidStrings { set bytes [binary decode hex $hex] set prefix A @@ -612,7 +699,9 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set bytes [binary format H* $hex] set prefix A set suffix B - set prefixLen [string length [encoding convertto $enc $prefix]] + set prefix_bytes [encoding convertto $enc $prefix] + set suffix_bytes [encoding convertto $enc $suffix] + set prefixLen [string length $prefix_bytes] set result [list $str] # TODO - if the bad byte is unprintable, tcltest errors out when printing a mismatch # so glob it out in error message pattern for now. @@ -624,7 +713,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } else { set result $errorWithoutPrefix } - testprofile cmdAH-4.3.15.$hex.solo convertfrom $enc $profile $bytes {*}$result + testprofile cmdAH-4.3.13.$hex.solo convertfrom $enc $profile $bytes {*}$result } if {$ctrl eq {} || "lead" in $ctrl} { if {$failidx == -1} { @@ -632,7 +721,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } else { set result $errorWithoutPrefix } - testprofile cmdAH-4.3.15.$hex.lead convertfrom $enc $profile $bytes$suffix {*}$result + testprofile cmdAH-4.3.13.$hex.lead convertfrom $enc $profile $bytes$suffix_bytes {*}$result } if {$ctrl eq {} || "tail" in $ctrl} { if {$failidx == -1} { @@ -640,7 +729,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } else { set result $errorWithPrefix } - testprofile cmdAH-4.3.15.$hex.tail convertfrom $enc $profile $prefix$bytes {*}$result + testprofile cmdAH-4.3.13.$hex.tail convertfrom $enc $profile $prefix_bytes$bytes {*}$result } if {$ctrl eq {} || "middle" in $ctrl} { if {$failidx == -1} { @@ -648,28 +737,11 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } else { set result $errorWithPrefix } - testprofile cmdAH-4.3.15.$hex.middle convertfrom $enc $profile $prefix$bytes$suffix {*}$result + testprofile 
cmdAH-4.3.13.$hex.middle convertfrom $enc $profile $prefix_bytes$bytes$suffix_bytes {*}$result } } -proc printable {s} { - set print "" - foreach c [split $s ""] { - set i [scan $c %c] - if {[string is print $c] && ($i <= 127)} { - append print $c - } elseif {$i <= 0xff} { - append print \\x[format %02X $i] - } elseif {$i <= 0xffff} { - append print \\u[format %04X $i] - } else { - append print \\U[format %08X $i] - } - } - return $print -} - -# convertfrom -failindex - valid data +# convertfrom -failindex ?-profile? - valid data foreach {enc str hex} $encValidStrings { set bytes [binary decode hex $hex] set prefix A @@ -677,15 +749,14 @@ foreach {enc str hex} $encValidStrings { set prefix_bytes [encoding convertto $enc A] set suffix_bytes [encoding convertto $enc B] foreach profile $encProfiles { - testfailindex cmdAH-4.3.13.$hex.solo convertfrom $enc $bytes [list $str -1] $profile - testfailindex cmdAH-4.3.13.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile - testfailindex cmdAH-4.3.13.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile - testfailindex cmdAH-4.3.13.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile + testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str -1] $profile + testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile + testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile + testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile } } - -# convertfrom -failindex, convertfrom -failindex -profile, invalid data +# convertfrom -failindex ?-profile? 
- invalid data foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { # There are multiple test cases based on location of invalid bytes set bytes [binary decode hex $hex] @@ -765,19 +836,96 @@ testconvert cmdAH-4.4.12 { encoding system $system } -# -failindex - valid data -foreach {enc string bytes} $encValidStrings { - testfailindex cmdAH-4.4.13.$enc convertto $enc $string [list $bytes -1] +# convertto ?-profile? : valid byte sequences + +foreach {enc str hex} $encValidStrings { + set bytes [binary decode hex $hex] + set printable [printable $str] + set prefix A + set suffix B + set prefix_bytes [encoding convertto $enc A] + set suffix_bytes [encoding convertto $enc B] + foreach profile $encProfiles { + testprofile cmdAH-4.4.13.$printable.solo convertto $enc $profile $str $bytes + testprofile cmdAH-4.4.13.$printable.lead convertto $enc $profile $str$suffix $bytes$suffix_bytes + testprofile cmdAH-4.4.13.$printable.tail convertto $enc $profile $prefix$str $prefix_bytes$bytes + testprofile cmdAH-4.4.13.$printable.middle convertto $enc $profile $prefix$str$suffix $prefix_bytes$bytes$suffix_bytes + } +} + +# convertto ?-profile? : invalid byte sequences +foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { + set bytes [binary decode hex $hex] + set printable [printable $str] + set prefix A + set suffix B + set prefix_bytes [encoding convertto $enc $prefix] + set suffix_bytes [encoding convertto $enc $suffix] + set prefixLen [string length $prefix_bytes] + set result [list $bytes] + # TODO - if the bad byte is unprintable, tcltest errors out when printing a mismatch + # so glob it out in error message pattern for now. 
+ set errorWithoutPrefix [list "unexpected character at index $failidx: *" -returnCodes error -match glob] + set errorWithPrefix [list "unexpected character at index [expr {$failidx+$prefixLen}]: *" -returnCodes error -match glob] + if {$ctrl eq {} || "solo" in $ctrl} { + if {$failidx == -1} { + set result [list $bytes] + } else { + set result $errorWithoutPrefix + } + testprofile cmdAH-4.4.13.$printable.solo convertto $enc $profile $str {*}$result + } + if {$ctrl eq {} || "lead" in $ctrl} { + if {$failidx == -1} { + set result [list $bytes$suffix_bytes] + } else { + set result $errorWithoutPrefix + } + testprofile cmdAH-4.4.13.$printable.lead convertto $enc $profile $str$suffix {*}$result + } + if {$ctrl eq {} || "tail" in $ctrl} { + if {$failidx == -1} { + set result [list $prefix_bytes$bytes] + } else { + set result $errorWithPrefix + } + testprofile cmdAH-4.4.13.$printable.tail convertto $enc $profile $prefix$str {*}$result + } + if {$ctrl eq {} || "middle" in $ctrl} { + if {$failidx == -1} { + set result [list $prefix_bytes$bytes$suffix_bytes] + } else { + set result $errorWithPrefix + } + testprofile cmdAH-4.4.13.$printable.middle convertto $enc $profile $prefix$str$suffix {*}$result + } } -# -failindex - invalid data -foreach {enc string profile hex failidx ctrl comment} $encUnencodableStrings { +# convertto -failindex ?-profile? 
- valid data +foreach {enc str hex} $encValidStrings { set bytes [binary decode hex $hex] + set printable [printable $str] + set prefix A + set suffix B + set prefix_bytes [encoding convertto $enc A] + set suffix_bytes [encoding convertto $enc B] + foreach profile $encProfiles { + testfailindex cmdAH-4.4.14.$enc.$printable.solo convertto $enc $str [list $bytes -1] $profile + testfailindex cmdAH-4.4.14.$enc.$printable.lead convertto $enc $str$suffix [list $bytes$suffix_bytes -1] $profile + testfailindex cmdAH-4.4.14.$enc.$printable.tail convertto $enc $prefix$str [list $prefix_bytes$bytes -1] $profile + testfailindex cmdAH-4.4.14.$enc.$printable.middle convertto $enc $prefix$str$suffix [list $prefix_bytes$bytes$suffix_bytes -1] $profile + } +} + +# convertto -failindex ?-profile? - invalid data +foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { + set bytes [binary decode hex $hex] + set printable [printable $str] set prefix A set suffix B set prefixLen [string length [encoding convertto $enc $prefix]] if {$ctrl eq {} || "solo" in $ctrl} { - testfailindex cmdAH-4.4.14.$string.solo convertto $enc $string [list $bytes $failidx] $profile + testfailindex cmdAH-4.4.14.$printable.solo convertto $enc $str [list $bytes $failidx] $profile } if {$ctrl eq {} || "lead" in $ctrl} { if {$failidx == -1} { @@ -787,7 +935,7 @@ foreach {enc string profile hex failidx ctrl comment} $encUnencodableStrings { # Failure expected set result "" } - testfailindex cmdAH-4.4.14.$string.lead convertto $enc $string$suffix [list $result $failidx] $profile + testfailindex cmdAH-4.4.14.$printable.lead convertto $enc $str$suffix [list $result $failidx] $profile } if {$ctrl eq {} || "tail" in $ctrl} { set expected_failidx $failidx @@ -799,7 +947,7 @@ foreach {enc string profile hex failidx ctrl comment} $encUnencodableStrings { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.4.14.$string.tail convertto $enc $prefix$string [list $result 
$expected_failidx] $profile + testfailindex cmdAH-4.4.14.$printable.tail convertto $enc $prefix$str [list $result $expected_failidx] $profile } if {$ctrl eq {} || "middle" in $ctrl} { set expected_failidx $failidx @@ -811,46 +959,7 @@ foreach {enc string profile hex failidx ctrl comment} $encUnencodableStrings { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.4.14.$string.middle convertto $enc $prefix$string$suffix [list $result $expected_failidx] $profile - } -} - -# convertto -profile - -# All valid byte sequences should be accepted by all profiles -foreach profile $encProfiles { - set i 0 - foreach {enc string bytes} $encValidStrings { - testprofile cmdAH-4.4.15 convertto $enc $profile $string $bytes - } -} - -# Cycle through the various combinations of encodings and profiles -# for invalid byte sequences -foreach {enc string profile hex failidx ctrl comment} $encUnencodableStrings { - set bytes [binary decode hex $hex] - if {$failidx eq -1} { - set result [list $bytes] - } else { - # TODO - if the bad char is unprintable, tcltest errors out when printing a mismatch - # so glob it out for now. - set result [list "unexpected character at index $failidx: *" -returnCodes error -match glob] - } - #testprofile xx convertto $enc $profile $string {*}$result - if {$profile eq "default"} { - # testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. - set enc [string range $enc 0 5] - # xxtestconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto $enc $string] {*}$result - } - } else { - # testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result - if {"utf-16$endian" eq $enc} { - # utf-16le ->utf-16, utf-32be -> utf32 etc. 
- set enc [string range $enc 0 5] - # testconvert cmdAH-4.4.15.$enc.$profile.$tag [list encoding convertto -profile $profile $enc $string] {*}$result - } + testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix [list $result $expected_failidx] $profile } } -- cgit v0.12 From 3d2dc708451191d04cca00561cbed0295a407b11 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sat, 18 Feb 2023 16:25:57 +0000 Subject: Done with invalid utf-8 table --- tests/cmdAH.test | 278 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 241 insertions(+), 37 deletions(-) diff --git a/tests/cmdAH.test b/tests/cmdAH.test index df28b2e..ad315d2 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -344,11 +344,17 @@ lappend encInvalidBytes {*}{ utf-8 E0 replace \uFFFD -1 {} {Missing trail byte} utf-8 E0 strict {} 0 {} {Missing trail byte} utf-8 E080 tcl8 \u00E0\u20AC -1 {knownBug} {First trail byte must be A0:BF} - utf-8 E080 replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} - utf-8 E080 strict {} 0 {} {First trail byte must be A0:BF} + utf-8 E080 replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} + utf-8 E080 strict {} 0 {} {First trail byte must be A0:BF} utf-8 E09F tcl8 \u00E0\u0178 -1 {knownBug} {First trail byte must be A0:BF} - utf-8 E09F replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} - utf-8 E09F strict {} 0 {} {First trail byte must be A0:BF} + utf-8 E09F replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} + utf-8 E09F strict {} 0 {} {First trail byte must be A0:BF} + utf-8 E0A0 tcl8 \u00E0\u00A0 -1 {} {Missing second trail byte} + utf-8 E0A0 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 E0A0 strict {} 0 {} {Missing second trail byte} + utf-8 E0BF tcl8 \u00E0\u00BF -1 {} {Missing second trail byte} + utf-8 E0BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 E0BF strict {} 0 {} {Missing second trail byte} utf-8 E0A07F tcl8 \u00E0\u00A0\x7F -1 {} {Second trail byte 
must be 80:BF} utf-8 E0A07F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 E0A07F strict {} 0 {} {Second trail byte must be 80:BF} @@ -362,6 +368,12 @@ lappend encInvalidBytes {*}{ utf-8 E17F tcl8 \u00E1\x7F -1 {} {Trail byte must be 80:BF} utf-8 E17F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} utf-8 E17F strict {} 0 {} {Trail byte must be 80:BF} + utf-8 E181 tcl8 \u00E1\u0081 -1 {} {Missing second trail byte} + utf-8 E181 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 E181 strict {} 0 {} {Missing second trail byte} + utf-8 E1BF tcl8 \u00E1\u00BF -1 {} {Missing second trail byte} + utf-8 E1BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 E1BF strict {} 0 {} {Missing second trail byte} utf-8 E1807F tcl8 \u00E1\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} utf-8 E1807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 E1807F strict {} 0 {} {Second trail byte must be 80:BF} @@ -374,6 +386,12 @@ lappend encInvalidBytes {*}{ utf-8 EC7F tcl8 \u00EC\x7F -1 {} {Trail byte must be 80:BF} utf-8 EC7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} utf-8 EC7F strict {} 0 {} {Trail byte must be 80:BF} + utf-8 EC81 tcl8 \u00EC\u0081 -1 {} {Missing second trail byte} + utf-8 EC81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EC81 strict {} 0 {} {Missing second trail byte} + utf-8 ECBF tcl8 \u00EC\u00BF -1 {} {Missing second trail byte} + utf-8 ECBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 ECBF strict {} 0 {} {Missing second trail byte} utf-8 EC807F tcl8 \u00EC\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} utf-8 EC807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 EC807F strict {} 0 {} {Second trail byte must be 80:BF} @@ -381,39 +399,225 @@ lappend encInvalidBytes {*}{ utf-8 ECBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 ECBF7F strict {} 0 {} {Second trail 
byte must be 80:BF} - utf-8 ED tcl8 \u00ED -1 {} {Missing trail byte} - utf-8 ED replace \uFFFD -1 {} {Missing trail byte} - utf-8 ED strict {} 0 {} {Missing trail byte} - utf-8 ED7F tcl8 \u00ED\u7F -1 {knownBug} {First trail byte must be 80:9F} - utf-8 ED7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:9F} - utf-8 ED7F strict {} 0 {} {First trail byte must be 80:9F} - utf-8 EDA0 tcl8 \u00ED\u00A0 -1 {knownBug} {First trail byte must be 80:9F} - utf-8 EDA0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:9F} - utf-8 EDA0 strict {} 0 {} {First trail byte must be 80:9F} - utf-8 ED807F tcl8 \u00ED\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} - utf-8 ED807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 ED807F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 ED9F7F tcl8 \u00ED\u0178\x7F -1 {knownBug} {Second trail byte must be 80:BF} - utf-8 ED9F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 ED9F7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 EDA080 tcl8 \uD800 -1 {} {High surrogate} - utf-8 EDA080 replace \uFFFD -1 {} {High surrogate} - utf-8 EDA080 strict {} 0 {} {High surrogate} - utf-8 EDAFBF tcl8 \uDBFF -1 {} {High surrogate} - utf-8 EDAFBF replace \uFFFD -1 {} {High surrogate} - utf-8 EDAFBF strict {} 0 {} {High surrogate} - utf-8 EDB080 tcl8 \uDC00 -1 {} {Low surrogate} - utf-8 EDB080 replace \uFFFD -1 {} {Low surrogate} - utf-8 EDB080 strict {} 0 {} {Low surrogate} - utf-8 EDBFBF tcl8 \uDFFF -1 {} {Low surrogate} - utf-8 EDBFBF replace \uFFFD -1 {} {Low surrogate} - utf-8 EDBFBF strict {} 0 {} {Low surrogate} - utf-8 EDA080EDB080 tcl8 \U00010000 -1 {} {High low surrogate pair} - utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {} {High low surrogate pair} - utf-8 EDA080EDB080 strict {} 0 {} {High low surrogate pair} - utf-8 EDAFBFEDBFBF tcl8 \U0010FFFF -1 {} {High low surrogate pair} - utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {} {High low surrogate pair} - 
utf-8 EDAFBFEDBFBF strict {} 0 {} {High low surrogate pair} + utf-8 ED tcl8 \u00ED -1 {} {Missing trail byte} + utf-8 ED replace \uFFFD -1 {} {Missing trail byte} + utf-8 ED strict {} 0 {} {Missing trail byte} + utf-8 ED7F tcl8 \u00ED\u7F -1 {knownBug} {First trail byte must be 80:9F} + utf-8 ED7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:9F} + utf-8 ED7F strict {} 0 {} {First trail byte must be 80:9F} + utf-8 EDA0 tcl8 \u00ED\u00A0 -1 {knownBug} {First trail byte must be 80:9F} + utf-8 EDA0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:9F} + utf-8 EDA0 strict {} 0 {} {First trail byte must be 80:9F} + utf-8 ED81 tcl8 \u00ED\u0081 -1 {} {Missing second trail byte} + utf-8 ED81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 ED81 strict {} 0 {} {Missing second trail byte} + utf-8 EDBF tcl8 \u00ED\u00BF -1 {} {Missing second trail byte} + utf-8 EDBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EDBF strict {} 0 {} {Missing second trail byte} + utf-8 ED807F tcl8 \u00ED\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 ED807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 ED807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 ED9F7F tcl8 \u00ED\u0178\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 ED9F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 ED9F7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EDA080 tcl8 \uD800 -1 {} {High surrogate} + utf-8 EDA080 replace \uFFFD -1 {} {High surrogate} + utf-8 EDA080 strict {} 0 {} {High surrogate} + utf-8 EDAFBF tcl8 \uDBFF -1 {} {High surrogate} + utf-8 EDAFBF replace \uFFFD -1 {} {High surrogate} + utf-8 EDAFBF strict {} 0 {} {High surrogate} + utf-8 EDB080 tcl8 \uDC00 -1 {} {Low surrogate} + utf-8 EDB080 replace \uFFFD -1 {} {Low surrogate} + utf-8 EDB080 strict {} 0 {} {Low surrogate} + utf-8 EDBFBF tcl8 \uDFFF -1 {} {Low surrogate} + utf-8 EDBFBF replace \uFFFD 
-1 {} {Low surrogate} + utf-8 EDBFBF strict {} 0 {} {Low surrogate} + utf-8 EDA080EDB080 tcl8 \U00010000 -1 {} {High low surrogate pair} + utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {} {High low surrogate pair} + utf-8 EDA080EDB080 strict {} 0 {} {High low surrogate pair} + utf-8 EDAFBFEDBFBF tcl8 \U0010FFFF -1 {} {High low surrogate pair} + utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {} {High low surrogate pair} + utf-8 EDAFBFEDBFBF strict {} 0 {} {High low surrogate pair} + + utf-8 EE tcl8 \u00EE -1 {} {Missing trail byte} + utf-8 EE replace \uFFFD -1 {} {Missing trail byte} + utf-8 EE strict {} 0 {} {Missing trail byte} + utf-8 EE7F tcl8 \u00EE\u7F -1 {knownBug} {First trail byte must be 80:BF} + utf-8 EE7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:BF} + utf-8 EE7F strict {} 0 {} {First trail byte must be 80:BF} + utf-8 EED0 tcl8 \u00EE\u00D0 -1 {knownBug} {First trail byte must be 80:BF} + utf-8 EED0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} + utf-8 EED0 strict {} 0 {} {First trail byte must be 80:BF} + utf-8 EE81 tcl8 \u00EE\u0081 -1 {} {Missing second trail byte} + utf-8 EE81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EE81 strict {} 0 {} {Missing second trail byte} + utf-8 EEBF tcl8 \u00EE\u00BF -1 {} {Missing second trail byte} + utf-8 EEBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EEBF strict {} 0 {} {Missing second trail byte} + utf-8 EE807F tcl8 \u00EE\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 EE807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EE807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EEBF7F tcl8 \u00EE\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 EEBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EEBF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EF tcl8 \u00EF -1 {} {Missing trail byte} + utf-8 EF replace \uFFFD -1 {} {Missing trail byte} + utf-8 EF 
strict {} 0 {} {Missing trail byte} + utf-8 EF7F tcl8 \u00EF\u7F -1 {knownBug} {First trail byte must be 80:BF} + utf-8 EF7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:BF} + utf-8 EF7F strict {} 0 {} {First trail byte must be 80:BF} + utf-8 EFD0 tcl8 \u00EF\u00D0 -1 {knownBug} {First trail byte must be 80:BF} + utf-8 EFD0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} + utf-8 EFD0 strict {} 0 {} {First trail byte must be 80:BF} + utf-8 EF81 tcl8 \u00EF\u0081 -1 {} {Missing second trail byte} + utf-8 EF81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EF81 strict {} 0 {} {Missing second trail byte} + utf-8 EFBF tcl8 \u00EF\u00BF -1 {} {Missing second trail byte} + utf-8 EFBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EFBF strict {} 0 {} {Missing second trail byte} + utf-8 EF807F tcl8 \u00EF\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 EF807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EF807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EFBF7F tcl8 \u00EF\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 EFBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EFBF7F strict {} 0 {} {Second trail byte must be 80:BF} + + utf-8 F0 tcl8 \u00F0 -1 {} {Missing trail byte} + utf-8 F0 replace \uFFFD -1 {} {Missing trail byte} + utf-8 F0 strict {} 0 {} {Missing trail byte} + utf-8 F08F tcl8 \u00F0\u8F -1 {knownBug} {First trail byte must be 90:BF} + utf-8 F08F replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} + utf-8 F08F strict {} 0 {} {First trail byte must be 90:BF} + utf-8 F0D0 tcl8 \u00F0\u00D0 -1 {knownBug} {First trail byte must be 90:BF} + utf-8 F0D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 90:BF} + utf-8 F0D0 strict {} 0 {} {First trail byte must be 90:BF} + utf-8 F090 tcl8 \u00F0\u0090 -1 {} {Missing second trail byte} + utf-8 F090 replace \uFFFD -1 {knownW3C} {Missing second trail 
byte} + utf-8 F090 strict {} 0 {} {Missing second trail byte} + utf-8 F0BF tcl8 \u00F0\u00BF -1 {} {Missing second trail byte} + utf-8 F0BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F0BF strict {} 0 {} {Missing second trail byte} + utf-8 F0907F tcl8 \u00F0\u0090\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 F0907F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F0907F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F0BF7F tcl8 \u00F0\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F0BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F0BF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F090BF tcl8 \u00F0\u0090\u00BF -1 {} {Missing third trail byte} + utf-8 F090BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F090BF strict {} 0 {} {Missing third trail byte} + utf-8 F0BF81 tcl8 \u00F0\u00BF\u0081 -1 {} {Missing third trail byte} + utf-8 F0BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F0BF81 strict {} 0 {} {Missing third trail byte} + utf-8 F0BF807F tcl8 \u00F0\u00BF\u20AC\x7F -1 {knownBug} {Third trail byte must be 80:BF} + utf-8 F0BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F0BF817F strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F090BFD0 tcl8 \u00F0\u0090\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} + utf-8 F090BFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F090BFD0 strict {} 0 {} {Third trail byte must be 80:BF} + + utf-8 F1 tcl8 \u00F1 -1 {} {Missing trail byte} + utf-8 F1 replace \uFFFD -1 {} {Missing trail byte} + utf-8 F1 strict {} 0 {} {Missing trail byte} + utf-8 F17F tcl8 \u00F1\u8F -1 {knownBug} {First trail byte must be 80:BF} + utf-8 F17F replace \uFFFD -1 {knownW3C} {First trail byte must be 80:BF} + utf-8 F17F strict {} 0 {} {First trail byte must be 80:BF} + utf-8 F1D0 tcl8 \u00F1\u00D0 -1 {knownBug} {First trail 
byte must be 80:BF} + utf-8 F1D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} + utf-8 F1D0 strict {} 0 {} {First trail byte must be 80:BF} + utf-8 F180 tcl8 \u00F1\u0080 -1 {} {Missing second trail byte} + utf-8 F180 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F180 strict {} 0 {} {Missing second trail byte} + utf-8 F1BF tcl8 \u00F1\u00BF -1 {} {Missing second trail byte} + utf-8 F1BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F1BF strict {} 0 {} {Missing second trail byte} + utf-8 F1807F tcl8 \u00F1\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 F1807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F1807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F1BF7F tcl8 \u00F1\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F1BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F1BF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F180BF tcl8 \u00F1\u20AC\u00BF -1 {knownBug} {Missing third trail byte} + utf-8 F180BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F180BF strict {} 0 {} {Missing third trail byte} + utf-8 F1BF81 tcl8 \u00F1\u00BF\u0081 -1 {} {Missing third trail byte} + utf-8 F1BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F1BF81 strict {} 0 {} {Missing third trail byte} + utf-8 F1BF807F tcl8 \u00F1\u00BF\u20AC\x7F -1 {knownBug} {Third trail byte must be 80:BF} + utf-8 F1BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F1BF817F strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F180BFD0 tcl8 \u00F1\u20AC\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} + utf-8 F180BFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F180BFD0 strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F3 tcl8 \u00F3 -1 {} {Missing trail byte} + utf-8 F3 replace \uFFFD -1 {} {Missing trail byte} + utf-8 F3 strict {} 0 {} 
{Missing trail byte} + utf-8 F37F tcl8 \u00F3\u8F -1 {knownBug} {First trail byte must be 80:BF} + utf-8 F37F replace \uFFFD -1 {knownW3C} {First trail byte must be 80:BF} + utf-8 F37F strict {} 0 {} {First trail byte must be 80:BF} + utf-8 F3D0 tcl8 \u00F3\u00D0 -1 {knownBug} {First trail byte must be 80:BF} + utf-8 F3D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} + utf-8 F3D0 strict {} 0 {} {First trail byte must be 80:BF} + utf-8 F380 tcl8 \u00F3\u0080 -1 {} {Missing second trail byte} + utf-8 F380 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F380 strict {} 0 {} {Missing second trail byte} + utf-8 F3BF tcl8 \u00F3\u00BF -1 {} {Missing second trail byte} + utf-8 F3BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F3BF strict {} 0 {} {Missing second trail byte} + utf-8 F3807F tcl8 \u00F3\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 F3807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F3807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F3BF7F tcl8 \u00F3\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F3BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F3BF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F380BF tcl8 \u00F3\u20AC\u00BF -1 {knownBug} {Missing third trail byte} + utf-8 F380BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F380BF strict {} 0 {} {Missing third trail byte} + utf-8 F3BF81 tcl8 \u00F3\u00BF\u0081 -1 {} {Missing third trail byte} + utf-8 F3BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F3BF81 strict {} 0 {} {Missing third trail byte} + utf-8 F3BF807F tcl8 \u00F3\u00BF\u20AC\x7F -1 {knownBug} {Third trail byte must be 80:BF} + utf-8 F3BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F3BF817F strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F380BFD0 tcl8 \u00F3\u20AC\u00BF\u00D0 -1 {} {Third trail byte 
must be 80:BF} + utf-8 F380BFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F380BFD0 strict {} 0 {} {Third trail byte must be 80:BF} + + utf-8 F4 tcl8 \u00F4 -1 {} {Missing trail byte} + utf-8 F4 replace \uFFFD -1 {} {Missing trail byte} + utf-8 F4 strict {} 0 {} {Missing trail byte} + utf-8 F47F tcl8 \u00F4\u7F -1 {knownBug} {First trail byte must be 80:8F} + utf-8 F47F replace \uFFFD\u7F -1 {knownW3C} {First trail byte must be 80:8F} + utf-8 F47F strict {} 0 {} {First trail byte must be 80:8F} + utf-8 F490 tcl8 \u00F4\u0090 -1 {knownBug} {First trail byte must be 80:8F} + utf-8 F490 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:8F} + utf-8 F490 strict {} 0 {} {First trail byte must be 80:8F} + utf-8 F480 tcl8 \u00F4\u0080 -1 {} {Missing second trail byte} + utf-8 F480 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F480 strict {} 0 {} {Missing second trail byte} + utf-8 F48F tcl8 \u00F4\u008F -1 {} {Missing second trail byte} + utf-8 F48F replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F48F strict {} 0 {} {Missing second trail byte} + utf-8 F4807F tcl8 \u00F4\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 F4807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F4807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F48F7F tcl8 \u00F4\u008F\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F48F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F48F7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F48081 tcl8 \u00F4\u20AC\u0081 -1 {knownBug} {Missing third trail byte} + utf-8 F48081 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F48081 strict {} 0 {} {Missing third trail byte} + utf-8 F48F81 tcl8 \u00F4\u008F\u0081 -1 {} {Missing third trail byte} + utf-8 F48F81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F48F81 strict {} 0 {} {Missing third trail byte} + utf-8 F481817F 
tcl8 \u00F4\u0081\u0081\x7F -1 {knownBug} {Third trail byte must be 80:BF} + utf-8 F480817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F480817F strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F48FBFD0 tcl8 \u00F4\u008F\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} + utf-8 F48FBFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F48FBFD0 strict {} 0 {} {Third trail byte must be 80:BF} + + utf-8 F5 tcl8 \u00F5 -1 {} {F5:FF are invalid everywhere} utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere} -- cgit v0.12 From 41c5d1cd91756ac3614489931ebe22a4095a6cf9 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sat, 18 Feb 2023 17:41:44 +0000 Subject: Minor refactoring/fixes after merge --- generic/tclEncoding.c | 42 ++++++++++-------------------------------- tests/encoding.test | 4 ++-- 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 2095b4c..7e5ec22 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2452,38 +2452,16 @@ UtfToUtfProc( : TCL_CONVERT_SYNTAX; break; } - if (PROFILE_REPLACE(profile)) { - ch = UNICODE_REPLACE_CHAR; - ++src; - } else { - /* TCL_ENCODING_PROFILE_TCL8 */ - ch = UCHAR(*src); - char chbuf[2]; - chbuf[0] = UCHAR(*src++); chbuf[1] = 0; - TclUtfToUCS4(chbuf, &ch); - } - } - else { - /* - * Incomplete bytes for real UTF-8 target. - * TODO - no profile check here because did not have any - * checks in the pre-profile code. Why? Is it because on - * output a valid internal utf-8 stream is assumed? - */ - char chbuf[2]; - /* - * TODO - this code seems broken to me. - * - it does not check profiles - * - generates invalid output for real UTF-8 target - * (consider \xC2) - * A possible explanation is this behavior matches the - * Tcl8 decoding behavior of mapping invalid bytes to the same - * code point value. Still, at least strictness checks should - * be made. 
- */ - chbuf[0] = UCHAR(*src++); chbuf[1] = 0; - TclUtfToUCS4(chbuf, &ch); - } + } + if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + ++src; + } else { + /* TCL_ENCODING_PROFILE_TCL8 */ + char chbuf[2]; + chbuf[0] = UCHAR(*src++); chbuf[1] = 0; + TclUtfToUCS4(chbuf, &ch); + } dst += Tcl_UniCharToUtf(ch, dst); } else { diff --git a/tests/encoding.test b/tests/encoding.test index 36728d1..7199138 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -848,10 +848,10 @@ test encoding-24.41 {Parse invalid utf-8 with -profile strict} -body { encoding convertfrom -profile strict utf-8 \xED\xA0\x80\xED\xB0\x80 } -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xED'} test encoding-24.42 {Parse invalid utf-8, fallback to cp1252 [885c86a9a0]} -body { - encoding convertfrom -nocomplain utf-8 \xF0\x80\x80\x80 + encoding convertfrom -profile tcl8 utf-8 \xF0\x80\x80\x80 } -result \xF0\u20AC\u20AC\u20AC test encoding-24.43 {Parse invalid utf-8, fallback to cp1252 [885c86a9a0]} -body { - encoding convertfrom -nocomplain utf-8 \x80 + encoding convertfrom -profile tcl8 utf-8 \x80 } -result \u20AC file delete [file join [temporaryDirectory] iso2022.txt] -- cgit v0.12 From 9f595d2fa36d13395f1bfb16559f7519c08e873f Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sun, 19 Feb 2023 07:40:29 +0000 Subject: Remove knownBug test constraints now that fix has been merged from core-8-branch --- tests/cmdAH.test | 131 +++++++++++++++++++++++++++---------------------------- 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 11a8188..faa604a 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -278,41 +278,40 @@ lappend encInvalidBytes {*}{ # (UtfToUtfProc). # Note C0, C1, F5:FF are invalid bytes ANYWHERE. 
Exception is C080 lappend encInvalidBytes {*}{ - utf-8 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} - utf-8 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} + utf-8 80 tcl8 \u20AC -1 {} {map to cp1252} utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte} utf-8 80 strict {} 0 {} {Smallest invalid byte} - utf-8 81 tcl8 \u0081 -1 {knownBug} {map to cp1252} - utf-8 82 tcl8 \u201A -1 {knownBug} {map to cp1252} - utf-8 83 tcl8 \u0192 -1 {knownBug} {map to cp1252} - utf-8 84 tcl8 \u201E -1 {knownBug} {map to cp1252} - utf-8 85 tcl8 \u2026 -1 {knownBug} {map to cp1252} - utf-8 86 tcl8 \u2020 -1 {knownBug} {map to cp1252} - utf-8 87 tcl8 \u2021 -1 {knownBug} {map to cp1252} - utf-8 88 tcl8 \u0276 -1 {knownBug} {map to cp1252} - utf-8 89 tcl8 \u2030 -1 {knownBug} {map to cp1252} - utf-8 8A tcl8 \u0160 -1 {knownBug} {map to cp1252} - utf-8 8B tcl8 \u2039 -1 {knownBug} {map to cp1252} - utf-8 8C tcl8 \u0152 -1 {knownBug} {map to cp1252} - utf-8 8D tcl8 \u008D -1 {knownBug} {map to cp1252} - utf-8 8E tcl8 \u017D -1 {knownBug} {map to cp1252} - utf-8 8F tcl8 \u008F -1 {knownBug} {map to cp1252} - utf-8 90 tcl8 \u0090 -1 {knownBug} {map to cp1252} - utf-8 91 tcl8 \u2018 -1 {knownBug} {map to cp1252} - utf-8 92 tcl8 \u2019 -1 {knownBug} {map to cp1252} - utf-8 93 tcl8 \u201C -1 {knownBug} {map to cp1252} - utf-8 94 tcl8 \u201D -1 {knownBug} {map to cp1252} - utf-8 95 tcl8 \u2022 -1 {knownBug} {map to cp1252} - utf-8 96 tcl8 \u2013 -1 {knownBug} {map to cp1252} - utf-8 97 tcl8 \u2014 -1 {knownBug} {map to cp1252} - utf-8 98 tcl8 \u02DC -1 {knownBug} {map to cp1252} - utf-8 99 tcl8 \u2122 -1 {knownBug} {map to cp1252} - utf-8 9A tcl8 \u0161 -1 {knownBug} {map to cp1252} - utf-8 9B tcl8 \u203A -1 {knownBug} {map to cp1252} - utf-8 9C tcl8 \u0153 -1 {knownBug} {map to cp1252} - utf-8 9D tcl8 \u009D -1 {knownBug} {map to cp1252} - utf-8 9E tcl8 \u017E -1 {knownBug} {map to cp1252} - utf-8 9F tcl8 \u0178 -1 {knownBug} {map to cp1252} + utf-8 81 tcl8 \u0081 -1 {} {map to cp1252} + utf-8 82 
tcl8 \u201A -1 {} {map to cp1252} + utf-8 83 tcl8 \u0192 -1 {} {map to cp1252} + utf-8 84 tcl8 \u201E -1 {} {map to cp1252} + utf-8 85 tcl8 \u2026 -1 {} {map to cp1252} + utf-8 86 tcl8 \u2020 -1 {} {map to cp1252} + utf-8 87 tcl8 \u2021 -1 {} {map to cp1252} + utf-8 88 tcl8 \u02C6 -1 {} {map to cp1252} + utf-8 89 tcl8 \u2030 -1 {} {map to cp1252} + utf-8 8A tcl8 \u0160 -1 {} {map to cp1252} + utf-8 8B tcl8 \u2039 -1 {} {map to cp1252} + utf-8 8C tcl8 \u0152 -1 {} {map to cp1252} + utf-8 8D tcl8 \u008D -1 {} {map to cp1252} + utf-8 8E tcl8 \u017D -1 {} {map to cp1252} + utf-8 8F tcl8 \u008F -1 {} {map to cp1252} + utf-8 90 tcl8 \u0090 -1 {} {map to cp1252} + utf-8 91 tcl8 \u2018 -1 {} {map to cp1252} + utf-8 92 tcl8 \u2019 -1 {} {map to cp1252} + utf-8 93 tcl8 \u201C -1 {} {map to cp1252} + utf-8 94 tcl8 \u201D -1 {} {map to cp1252} + utf-8 95 tcl8 \u2022 -1 {} {map to cp1252} + utf-8 96 tcl8 \u2013 -1 {} {map to cp1252} + utf-8 97 tcl8 \u2014 -1 {} {map to cp1252} + utf-8 98 tcl8 \u02DC -1 {} {map to cp1252} + utf-8 99 tcl8 \u2122 -1 {} {map to cp1252} + utf-8 9A tcl8 \u0161 -1 {} {map to cp1252} + utf-8 9B tcl8 \u203A -1 {} {map to cp1252} + utf-8 9C tcl8 \u0153 -1 {} {map to cp1252} + utf-8 9D tcl8 \u009D -1 {} {map to cp1252} + utf-8 9E tcl8 \u017E -1 {} {map to cp1252} + utf-8 9F tcl8 \u0178 -1 {} {map to cp1252} utf-8 C0 tcl8 \u00C0 -1 {} {C0 is invalid anywhere} utf-8 C0 strict {} 0 {} {C0 is invalid anywhere} @@ -343,10 +342,10 @@ lappend encInvalidBytes {*}{ utf-8 E0 tcl8 \u00E0 -1 {} {Missing trail byte} utf-8 E0 replace \uFFFD -1 {} {Missing trail byte} utf-8 E0 strict {} 0 {} {Missing trail byte} - utf-8 E080 tcl8 \u00E0\u20AC -1 {knownBug} {First trail byte must be A0:BF} + utf-8 E080 tcl8 \u00E0\u20AC -1 {} {First trail byte must be A0:BF} utf-8 E080 replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} utf-8 E080 strict {} 0 {} {First trail byte must be A0:BF} - utf-8 E09F tcl8 \u00E0\u0178 -1 {knownBug} {First trail byte must be A0:BF} + utf-8 
E09F tcl8 \u00E0\u0178 -1 {} {First trail byte must be A0:BF} utf-8 E09F replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} utf-8 E09F strict {} 0 {} {First trail byte must be A0:BF} utf-8 E0A0 tcl8 \u00E0\u00A0 -1 {} {Missing second trail byte} @@ -374,7 +373,7 @@ lappend encInvalidBytes {*}{ utf-8 E1BF tcl8 \u00E1\u00BF -1 {} {Missing second trail byte} utf-8 E1BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} utf-8 E1BF strict {} 0 {} {Missing second trail byte} - utf-8 E1807F tcl8 \u00E1\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 E1807F tcl8 \u00E1\u20AC\x7F -1 {} {Second trail byte must be 80:BF} utf-8 E1807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 E1807F strict {} 0 {} {Second trail byte must be 80:BF} utf-8 E1BF7F tcl8 \u00E1\u00BF\x7F -1 {} {Second trail byte must be 80:BF} @@ -392,7 +391,7 @@ lappend encInvalidBytes {*}{ utf-8 ECBF tcl8 \u00EC\u00BF -1 {} {Missing second trail byte} utf-8 ECBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} utf-8 ECBF strict {} 0 {} {Missing second trail byte} - utf-8 EC807F tcl8 \u00EC\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 EC807F tcl8 \u00EC\u20AC\x7F -1 {} {Second trail byte must be 80:BF} utf-8 EC807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 EC807F strict {} 0 {} {Second trail byte must be 80:BF} utf-8 ECBF7F tcl8 \u00EC\u00BF\x7F -1 {} {Second trail byte must be 80:BF} @@ -402,10 +401,10 @@ lappend encInvalidBytes {*}{ utf-8 ED tcl8 \u00ED -1 {} {Missing trail byte} utf-8 ED replace \uFFFD -1 {} {Missing trail byte} utf-8 ED strict {} 0 {} {Missing trail byte} - utf-8 ED7F tcl8 \u00ED\u7F -1 {knownBug} {First trail byte must be 80:9F} + utf-8 ED7F tcl8 \u00ED\u7F -1 {} {First trail byte must be 80:9F} utf-8 ED7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:9F} utf-8 ED7F strict {} 0 {} {First trail byte must be 80:9F} - utf-8 EDA0 tcl8 \u00ED\u00A0 -1 {knownBug} {First 
trail byte must be 80:9F} + utf-8 EDA0 tcl8 \u00ED\u00A0 -1 {} {First trail byte must be 80:9F} utf-8 EDA0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:9F} utf-8 EDA0 strict {} 0 {} {First trail byte must be 80:9F} utf-8 ED81 tcl8 \u00ED\u0081 -1 {} {Missing second trail byte} @@ -414,10 +413,10 @@ lappend encInvalidBytes {*}{ utf-8 EDBF tcl8 \u00ED\u00BF -1 {} {Missing second trail byte} utf-8 EDBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} utf-8 EDBF strict {} 0 {} {Missing second trail byte} - utf-8 ED807F tcl8 \u00ED\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 ED807F tcl8 \u00ED\u20AC\x7F -1 {} {Second trail byte must be 80:BF} utf-8 ED807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 ED807F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 ED9F7F tcl8 \u00ED\u0178\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 ED9F7F tcl8 \u00ED\u0178\x7F -1 {} {Second trail byte must be 80:BF} utf-8 ED9F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 ED9F7F strict {} 0 {} {Second trail byte must be 80:BF} utf-8 EDA080 tcl8 \uD800 -1 {} {High surrogate} @@ -442,10 +441,10 @@ lappend encInvalidBytes {*}{ utf-8 EE tcl8 \u00EE -1 {} {Missing trail byte} utf-8 EE replace \uFFFD -1 {} {Missing trail byte} utf-8 EE strict {} 0 {} {Missing trail byte} - utf-8 EE7F tcl8 \u00EE\u7F -1 {knownBug} {First trail byte must be 80:BF} + utf-8 EE7F tcl8 \u00EE\u7F -1 {} {First trail byte must be 80:BF} utf-8 EE7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:BF} utf-8 EE7F strict {} 0 {} {First trail byte must be 80:BF} - utf-8 EED0 tcl8 \u00EE\u00D0 -1 {knownBug} {First trail byte must be 80:BF} + utf-8 EED0 tcl8 \u00EE\u00D0 -1 {} {First trail byte must be 80:BF} utf-8 EED0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} utf-8 EED0 strict {} 0 {} {First trail byte must be 80:BF} utf-8 EE81 tcl8 \u00EE\u0081 -1 {} {Missing second trail byte} @@ -454,7 
+453,7 @@ lappend encInvalidBytes {*}{ utf-8 EEBF tcl8 \u00EE\u00BF -1 {} {Missing second trail byte} utf-8 EEBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} utf-8 EEBF strict {} 0 {} {Missing second trail byte} - utf-8 EE807F tcl8 \u00EE\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 EE807F tcl8 \u00EE\u20AC\x7F -1 {} {Second trail byte must be 80:BF} utf-8 EE807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 EE807F strict {} 0 {} {Second trail byte must be 80:BF} utf-8 EEBF7F tcl8 \u00EE\u00BF\x7F -1 {} {Second trail byte must be 80:BF} @@ -463,10 +462,10 @@ lappend encInvalidBytes {*}{ utf-8 EF tcl8 \u00EF -1 {} {Missing trail byte} utf-8 EF replace \uFFFD -1 {} {Missing trail byte} utf-8 EF strict {} 0 {} {Missing trail byte} - utf-8 EF7F tcl8 \u00EF\u7F -1 {knownBug} {First trail byte must be 80:BF} + utf-8 EF7F tcl8 \u00EF\u7F -1 {} {First trail byte must be 80:BF} utf-8 EF7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:BF} utf-8 EF7F strict {} 0 {} {First trail byte must be 80:BF} - utf-8 EFD0 tcl8 \u00EF\u00D0 -1 {knownBug} {First trail byte must be 80:BF} + utf-8 EFD0 tcl8 \u00EF\u00D0 -1 {} {First trail byte must be 80:BF} utf-8 EFD0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} utf-8 EFD0 strict {} 0 {} {First trail byte must be 80:BF} utf-8 EF81 tcl8 \u00EF\u0081 -1 {} {Missing second trail byte} @@ -475,7 +474,7 @@ lappend encInvalidBytes {*}{ utf-8 EFBF tcl8 \u00EF\u00BF -1 {} {Missing second trail byte} utf-8 EFBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} utf-8 EFBF strict {} 0 {} {Missing second trail byte} - utf-8 EF807F tcl8 \u00EF\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 EF807F tcl8 \u00EF\u20AC\x7F -1 {} {Second trail byte must be 80:BF} utf-8 EF807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 EF807F strict {} 0 {} {Second trail byte must be 80:BF} utf-8 EFBF7F tcl8 \u00EF\u00BF\x7F -1 {} {Second 
trail byte must be 80:BF} @@ -485,10 +484,10 @@ lappend encInvalidBytes {*}{ utf-8 F0 tcl8 \u00F0 -1 {} {Missing trail byte} utf-8 F0 replace \uFFFD -1 {} {Missing trail byte} utf-8 F0 strict {} 0 {} {Missing trail byte} - utf-8 F08F tcl8 \u00F0\u8F -1 {knownBug} {First trail byte must be 90:BF} + utf-8 F08F tcl8 \u00F0\u8F -1 {} {First trail byte must be 90:BF} utf-8 F08F replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} utf-8 F08F strict {} 0 {} {First trail byte must be 90:BF} - utf-8 F0D0 tcl8 \u00F0\u00D0 -1 {knownBug} {First trail byte must be 90:BF} + utf-8 F0D0 tcl8 \u00F0\u00D0 -1 {} {First trail byte must be 90:BF} utf-8 F0D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 90:BF} utf-8 F0D0 strict {} 0 {} {First trail byte must be 90:BF} utf-8 F090 tcl8 \u00F0\u0090 -1 {} {Missing second trail byte} @@ -497,7 +496,7 @@ lappend encInvalidBytes {*}{ utf-8 F0BF tcl8 \u00F0\u00BF -1 {} {Missing second trail byte} utf-8 F0BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} utf-8 F0BF strict {} 0 {} {Missing second trail byte} - utf-8 F0907F tcl8 \u00F0\u0090\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 F0907F tcl8 \u00F0\u0090\x7F -1 {} {Second trail byte must be 80:BF} utf-8 F0907F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 F0907F strict {} 0 {} {Second trail byte must be 80:BF} utf-8 F0BF7F tcl8 \u00F0\u00BF\x7F -1 {} {Second trail byte must be 80:BF} @@ -509,7 +508,7 @@ lappend encInvalidBytes {*}{ utf-8 F0BF81 tcl8 \u00F0\u00BF\u0081 -1 {} {Missing third trail byte} utf-8 F0BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} utf-8 F0BF81 strict {} 0 {} {Missing third trail byte} - utf-8 F0BF807F tcl8 \u00F0\u00BF\u20AC\x7F -1 {knownBug} {Third trail byte must be 80:BF} + utf-8 F0BF807F tcl8 \u00F0\u00BF\u20AC\x7F -1 {} {Third trail byte must be 80:BF} utf-8 F0BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} utf-8 F0BF817F strict {} 0 {} {Third trail byte 
must be 80:BF} utf-8 F090BFD0 tcl8 \u00F0\u0090\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} @@ -519,10 +518,10 @@ lappend encInvalidBytes {*}{ utf-8 F1 tcl8 \u00F1 -1 {} {Missing trail byte} utf-8 F1 replace \uFFFD -1 {} {Missing trail byte} utf-8 F1 strict {} 0 {} {Missing trail byte} - utf-8 F17F tcl8 \u00F1\u8F -1 {knownBug} {First trail byte must be 80:BF} + utf-8 F17F tcl8 \u00F1\u7F -1 {} {First trail byte must be 80:BF} utf-8 F17F replace \uFFFD -1 {knownW3C} {First trail byte must be 80:BF} utf-8 F17F strict {} 0 {} {First trail byte must be 80:BF} - utf-8 F1D0 tcl8 \u00F1\u00D0 -1 {knownBug} {First trail byte must be 80:BF} + utf-8 F1D0 tcl8 \u00F1\u00D0 -1 {} {First trail byte must be 80:BF} utf-8 F1D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} utf-8 F1D0 strict {} 0 {} {First trail byte must be 80:BF} utf-8 F180 tcl8 \u00F1\u20AC -1 {} {Missing second trail byte} @@ -531,19 +530,19 @@ lappend encInvalidBytes {*}{ utf-8 F1BF tcl8 \u00F1\u00BF -1 {} {Missing second trail byte} utf-8 F1BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} utf-8 F1BF strict {} 0 {} {Missing second trail byte} - utf-8 F1807F tcl8 \u00F1\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 F1807F tcl8 \u00F1\u20AC\x7F -1 {} {Second trail byte must be 80:BF} utf-8 F1807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 F1807F strict {} 0 {} {Second trail byte must be 80:BF} utf-8 F1BF7F tcl8 \u00F1\u00BF\x7F -1 {} {Second trail byte must be 80:BF} utf-8 F1BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 F1BF7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F180BF tcl8 \u00F1\u20AC\u00BF -1 {knownBug} {Missing third trail byte} + utf-8 F180BF tcl8 \u00F1\u20AC\u00BF -1 {} {Missing third trail byte} utf-8 F180BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} utf-8 F180BF strict {} 0 {} {Missing third trail byte} utf-8 F1BF81 tcl8 \u00F1\u00BF\u0081 -1 {} {Missing 
third trail byte} utf-8 F1BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} utf-8 F1BF81 strict {} 0 {} {Missing third trail byte} - utf-8 F1BF807F tcl8 \u00F1\u00BF\u20AC\x7F -1 {knownBug} {Third trail byte must be 80:BF} + utf-8 F1BF807F tcl8 \u00F1\u00BF\u20AC\x7F -1 {} {Third trail byte must be 80:BF} utf-8 F1BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} utf-8 F1BF817F strict {} 0 {} {Third trail byte must be 80:BF} utf-8 F180BFD0 tcl8 \u00F1\u20AC\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} @@ -552,10 +551,10 @@ lappend encInvalidBytes {*}{ utf-8 F3 tcl8 \u00F3 -1 {} {Missing trail byte} utf-8 F3 replace \uFFFD -1 {} {Missing trail byte} utf-8 F3 strict {} 0 {} {Missing trail byte} - utf-8 F37F tcl8 \u00F3\u8F -1 {knownBug} {First trail byte must be 80:BF} + utf-8 F37F tcl8 \u00F3\x7F -1 {} {First trail byte must be 80:BF} utf-8 F37F replace \uFFFD -1 {knownW3C} {First trail byte must be 80:BF} utf-8 F37F strict {} 0 {} {First trail byte must be 80:BF} - utf-8 F3D0 tcl8 \u00F3\u00D0 -1 {knownBug} {First trail byte must be 80:BF} + utf-8 F3D0 tcl8 \u00F3\u00D0 -1 {} {First trail byte must be 80:BF} utf-8 F3D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} utf-8 F3D0 strict {} 0 {} {First trail byte must be 80:BF} utf-8 F380 tcl8 \u00F3\u20AC -1 {} {Missing second trail byte} @@ -564,19 +563,19 @@ lappend encInvalidBytes {*}{ utf-8 F3BF tcl8 \u00F3\u00BF -1 {} {Missing second trail byte} utf-8 F3BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} utf-8 F3BF strict {} 0 {} {Missing second trail byte} - utf-8 F3807F tcl8 \u00F3\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 F3807F tcl8 \u00F3\u20AC\x7F -1 {} {Second trail byte must be 80:BF} utf-8 F3807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 F3807F strict {} 0 {} {Second trail byte must be 80:BF} utf-8 F3BF7F tcl8 \u00F3\u00BF\x7F -1 {} {Second trail byte must be 80:BF} utf-8 F3BF7F replace 
\uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 F3BF7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F380BF tcl8 \u00F3\u20AC\u00BF -1 {knownBug} {Missing third trail byte} + utf-8 F380BF tcl8 \u00F3\u20AC\u00BF -1 {} {Missing third trail byte} utf-8 F380BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} utf-8 F380BF strict {} 0 {} {Missing third trail byte} utf-8 F3BF81 tcl8 \u00F3\u00BF\u0081 -1 {} {Missing third trail byte} utf-8 F3BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} utf-8 F3BF81 strict {} 0 {} {Missing third trail byte} - utf-8 F3BF807F tcl8 \u00F3\u00BF\u20AC\x7F -1 {knownBug} {Third trail byte must be 80:BF} + utf-8 F3BF807F tcl8 \u00F3\u00BF\u20AC\x7F -1 {} {Third trail byte must be 80:BF} utf-8 F3BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} utf-8 F3BF817F strict {} 0 {} {Third trail byte must be 80:BF} utf-8 F380BFD0 tcl8 \u00F3\u20AC\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} @@ -586,10 +585,10 @@ lappend encInvalidBytes {*}{ utf-8 F4 tcl8 \u00F4 -1 {} {Missing trail byte} utf-8 F4 replace \uFFFD -1 {} {Missing trail byte} utf-8 F4 strict {} 0 {} {Missing trail byte} - utf-8 F47F tcl8 \u00F4\u7F -1 {knownBug} {First trail byte must be 80:8F} + utf-8 F47F tcl8 \u00F4\u7F -1 {} {First trail byte must be 80:8F} utf-8 F47F replace \uFFFD\u7F -1 {knownW3C} {First trail byte must be 80:8F} utf-8 F47F strict {} 0 {} {First trail byte must be 80:8F} - utf-8 F490 tcl8 \u00F4\u0090 -1 {knownBug} {First trail byte must be 80:8F} + utf-8 F490 tcl8 \u00F4\u0090 -1 {} {First trail byte must be 80:8F} utf-8 F490 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:8F} utf-8 F490 strict {} 0 {} {First trail byte must be 80:8F} utf-8 F480 tcl8 \u00F4\u20AC -1 {} {Missing second trail byte} @@ -598,19 +597,19 @@ lappend encInvalidBytes {*}{ utf-8 F48F tcl8 \u00F4\u008F -1 {} {Missing second trail byte} utf-8 F48F replace \uFFFD -1 {knownW3C} {Missing second trail byte} utf-8 
F48F strict {} 0 {} {Missing second trail byte} - utf-8 F4807F tcl8 \u00F4\u20AC\x7F -1 {knownBug} {Second trail byte must be 80:BF} + utf-8 F4807F tcl8 \u00F4\u20AC\x7F -1 {} {Second trail byte must be 80:BF} utf-8 F4807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 F4807F strict {} 0 {} {Second trail byte must be 80:BF} utf-8 F48F7F tcl8 \u00F4\u008F\x7F -1 {} {Second trail byte must be 80:BF} utf-8 F48F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} utf-8 F48F7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F48081 tcl8 \u00F4\u20AC\u0081 -1 {knownBug} {Missing third trail byte} + utf-8 F48081 tcl8 \u00F4\u20AC\u0081 -1 {} {Missing third trail byte} utf-8 F48081 replace \uFFFD -1 {knownW3C} {Missing third trail byte} utf-8 F48081 strict {} 0 {} {Missing third trail byte} utf-8 F48F81 tcl8 \u00F4\u008F\u0081 -1 {} {Missing third trail byte} utf-8 F48F81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} utf-8 F48F81 strict {} 0 {} {Missing third trail byte} - utf-8 F481817F tcl8 \u00F4\u0081\u0081\x7F -1 {knownBug} {Third trail byte must be 80:BF} + utf-8 F481817F tcl8 \u00F4\u0081\u0081\x7F -1 {} {Third trail byte must be 80:BF} utf-8 F480817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} utf-8 F480817F strict {} 0 {} {Third trail byte must be 80:BF} utf-8 F48FBFD0 tcl8 \u00F4\u008F\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} -- cgit v0.12 From 41af9f9e84d0b6cee2116ff08e297db05786e6ce Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Mon, 20 Feb 2023 15:08:58 +0000 Subject: Add UTF16 and UTF32 tests --- tests/cmdAH.test | 193 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 137 insertions(+), 56 deletions(-) diff --git a/tests/cmdAH.test b/tests/cmdAH.test index faa604a..1fbe6d2 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -185,15 +185,58 @@ set encDefaultProfile tcl8; # Should reflect the default from implementation # TODO - valid 
sequences for different encodings - shiftjis etc. # Note utf-16, utf-32 missing because they are automatically -# generated based on le/be versions. Also add all ranges from Unicode standard -# Table 3.7 +# generated based on le/be versions. set encValidStrings { - ascii ABC 414243 - utf-8 A\u0000\u03A9\u8A9E\U00010384 4100CEA9E8AA9EF0908E84 - utf-16le A\u0000\u03A9\u8A9E\U00010384 41000000A9039E8A00D884DF - utf-16be A\u0000\u03A9\u8A9E\U00010384 0041000003A98A9ED800DF84 - utf-32le A\u0000\u03A9\u8A9E\U00010384 4100000000000000A90300009E8A000084030100 - utf-32be A\u0000\u03A9\u8A9E\U00010384 0000004100000000000003A900008A9E00010384 + ascii \u0000 00 {} {Lowest ASCII} + ascii \u007F 7F knownBug {Highest ASCII} + + utf-8 \u0000 00 {} {Unicode Table 3.7 Row 1} + utf-8 \u007F 7F {} {Unicode Table 3.7 Row 1} + utf-8 \u0080 C280 {} {Unicode Table 3.7 Row 2} + utf-8 \u07FF DFBF {} {Unicode Table 3.7 Row 2} + utf-8 \u0800 E0A080 {} {Unicode Table 3.7 Row 3} + utf-8 \u0FFF E0BFBF {} {Unicode Table 3.7 Row 3} + utf-8 \u1000 E18080 {} {Unicode Table 3.7 Row 4} + utf-8 \uCFFF ECBFBF {} {Unicode Table 3.7 Row 4} + utf-8 \uD000 ED8080 {} {Unicode Table 3.7 Row 5} + utf-8 \uD7FF ED9FBF {} {Unicode Table 3.7 Row 5} + utf-8 \uE000 EE8080 {} {Unicode Table 3.7 Row 6} + utf-8 \uFFFF EFBFBF {} {Unicode Table 3.7 Row 6} + utf-8 \U10000 F0908080 {} {Unicode Table 3.7 Row 7} + utf-8 \U3FFFF F0BFBFBF {} {Unicode Table 3.7 Row 7} + utf-8 \U40000 F1808080 {} {Unicode Table 3.7 Row 8} + utf-8 \UFFFFF F3BFBFBF {} {Unicode Table 3.7 Row 8} + utf-8 \U100000 F4808080 {} {Unicode Table 3.7 Row 9} + utf-8 \U10FFFF F48FBFBF {} {Unicode Table 3.7 Row 9} + utf-8 A\u03A9\u8A9E\U00010384 41CEA9E8AA9EF0908E84 {} {Unicode 2.5} + + utf-16le \u0000 0000 {} {Lowest code unit} + utf-16le \uD7FF FFD7 {} {Below high surrogate range} + utf-16le \uE000 00E0 {} {Above low surrogate range} + utf-16le \uFFFF FFFF {} {Highest code unit} + utf-16le \U010000 00D800DC {} {First surrogate pair} + utf-16le \U10FFFF 
FFDBFFDF {} {First surrogate pair} + utf-16le A\u03A9\u8A9E\U00010384 4100A9039E8A00D884DF {} {Unicode 2.5} + + utf-16be \u0000 0000 {} {Lowest code unit} + utf-16be \uD7FF D7FF {} {Below high surrogate range} + utf-16be \uE000 E000 {} {Above low surrogate range} + utf-16be \uFFFF FFFF {} {Highest code unit} + utf-16be \U010000 D800DC00 {} {First surrogate pair} + utf-16be \U10FFFF DBFFDFFF {} {First surrogate pair} + utf-16be A\u03A9\u8A9E\U00010384 004103A98A9ED800DF84 {} {Unicode 2.5} + + utf-32le \u0000 00000000 {} {Lowest code unit} + utf-32le \uFFFF FFFF0000 {} {Highest BMP} + utf-32le \U010000 00000100 {} {First supplementary} + utf-32le \U10FFFF ffff1000 {} {Last supplementary} + utf-32le A\u03A9\u8A9E\U00010384 41000000A90300009E8A000084030100 {} {Unicode 2.5} + + utf-32be \u0000 00000000 {} {Lowest code unit} + utf-32be \uFFFF 0000FFFF {} {Highest BMP} + utf-32be \U010000 00010000 {} {First supplementary} + utf-32be \U10FFFF 0010FFFF {} {Last supplementary} + utf-32be A\u03A9\u8A9E\U00010384 00000041000003A900008A9E00010384 {} {Unicode 2.5} } # Invalid byte sequences. These are driven from a table with format @@ -211,8 +254,7 @@ set encValidStrings { # If the ctrl field is empty it is treated as all of the above # Note if there is any other value by itself, it will cause the test to # be skipped. This is intentional to skip known bugs. 
- -# TODO - other encodings and test cases +# TODO - non-UTF encodings # ascii - Any byte above 127 is invalid and is mapped # to the same numeric code point except for the range @@ -616,8 +658,6 @@ lappend encInvalidBytes {*}{ utf-8 F48FBFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} utf-8 F48FBFD0 strict {} 0 {} {Third trail byte must be 80:BF} - - utf-8 F5 tcl8 \u00F5 -1 {} {F5:FF are invalid everywhere} utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere} utf-8 F5 strict {} 0 {} {F5:FF are invalid everywhere} @@ -631,42 +671,73 @@ lappend encInvalidBytes {*}{ utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3.11} } -set xxencInvalidBytes { - - utf-8 \x41\x80\x42 tcl8 A\u0080B -1 80 - utf-8 \x41\x80\x42 replace A\uFFFDB -1 80 - utf-8 \x41\x80\x42 strict A 1 80 - utf-8 \x41\xC0\x80\x42 tcl8 A\u0000B -1 C080 - utf-8 \x41\xC0\x80\x42 strict A 1 C080 - utf-8 \x41\xC1\x42 tcl8 A\u00C1B -1 C1 - utf-8 \x41\xC1\x42 replace A\uFFFDB -1 C1 - utf-8 \x41\xC1\x42 strict A 1 C1 - utf-8 \x41\xC2\x42 tcl8 A\u00C2B -1 C2-nontrail - utf-8 \x41\xC2\x42 replace A\uFFFDB -1 C2-nontrail - utf-8 \x41\xC2\x42 strict A 1 C2-nontrail - utf-8 \x41\xC2 tcl8 A\u00C2 -1 C2-incomplete - utf-8 \x41\xC2 replace A\uFFFD -1 C2-incomplete - utf-8 \x41\xC2 strict A 1 C2-incomplete - utf-8 A\xed\xa0\x80B tcl8 A\uD800B -1 High-surrogate - utf-8 A\xed\xa0\x80B strict A 1 High-surrogate - utf-8 A\xed\xb0\x80B tcl8 A\uDC00B -1 Low-surrogate - utf-8 A\xed\xb0\x80B strict A 1 Low-surrogate - utf-8 \xed\xa0\x80\xed\xb0\x80 tcl8 \U00010000 -1 High-low-surrogate - utf-8 \xed\xa0\x80\xed\xb0\x80 strict {} 0 High-low-surrogate +# utf16-le and utf16-be test cases. Note utf16 cases are automatically generated +# based on these depending on platform endianness. Note truncated tests can only +# happen when the sequence is at the end (including by itself) Thus {solo tail} +# in some cases. 
+lappend encInvalidBytes {*}{ + utf-16le 41 tcl8 {} -1 {solo tail} {Truncated} + utf-16le 41 replace \uFFFD -1 {solo tail} {Truncated} + utf-16le 41 strict {} 0 {solo tail} {Truncated} + utf-16le 00D8 tcl8 \uD800 -1 {} {Missing low surrogate} + utf-16le 00D8 replace \uFFFD -1 {knownBug} {Missing low surrogate} + utf-16le 00D8 strict {} 0 {knownBug} {Missing low surrogate} + utf-16le 00DC tcl8 \uDC00 -1 {} {Missing high surrogate} + utf-16le 00DC replace \uFFFD -1 {knownBug} {Missing high surrogate} + utf-16le 00DC strict {} 0 {knownBug} {Missing high surrogate} } -set utf32-le-TODO { - utf-32le \x00\xD8\x00\x00 tcl8 \uD800 -1 {High-surrogate} - utf-32le \x00\xD8\x00\x00 strict "" 0 {High-surrogate} - utf-32le \x00\xDC\x00\x00 tcl8 \uDC00 -1 {Low-surrogate} - utf-32le \x00\xDC\x00\x00 strict "" 0 {Low-surrogate} - utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 tcl8 \uD800\uDC00 -1 {High-low-surrogate} - utf-32le \x00\xD8\x00\x00\x00\xDC\x00\x00 strict "" 0 {High-low-surrogate} - utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 tcl8 \uDC00\uD800 -1 {High-low-surrogate} - utf-32le \x00\xDC\x00\x00\x00\xD8\x00\x00 strict "" 0 {High-low-surrogate} - utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 tcl8 A\uD800B -1 {High-surrogate-middle} - utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00 strict A 4 {High-surrogate-middle} + +# utf32-le and utf32-be test cases. Note utf32 cases are automatically generated +# based on these depending on platform endianness. Note truncated tests can only +# happen when the sequence is at the end (including by itself) Thus {solo tail} +# in some cases. 
+lappend encInvalidBytes {*}{ + utf-32le 41 tcl8 {} -1 {solo tail} {Truncated} + utf-32le 41 replace \uFFFD -1 {solo} {Truncated} + utf-32le 41 strict {} 0 {solo tail} {Truncated} + utf-32le 4100 tcl8 {} -1 {solo tail} {Truncated} + utf-32le 4100 replace \uFFFD -1 {solo} {Truncated} + utf-32le 4100 strict {} 0 {solo tail} {Truncated} + utf-32le 410000 tcl8 {} -1 {solo tail} {Truncated} + utf-32le 410000 replace \uFFFD -1 {solo} {Truncated} + utf-32le 410000 strict {} 0 {solo tail} {Truncated} + utf-32le 00D80000 tcl8 \uD800 -1 {} {High-surrogate} + utf-32le 00D80000 replace \uFFFD -1 {} {High-surrogate} + utf-32le 00D80000 strict {} 0 {} {High-surrogate} + utf-32le 00DC0000 tcl8 \uDC00 -1 {} {Low-surrogate} + utf-32le 00DC0000 replace \uFFFD -1 {} {Low-surrogate} + utf-32le 00DC0000 strict {} 0 {} {Low-surrogate} + utf-32le 00D8000000DC0000 tcl8 \uD800\uDC00 -1 {} {High-low-surrogate-pair} + utf-32le 00D8000000DC0000 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair} + utf-32le 00D8000000DC0000 strict {} 0 {} {High-low-surrogate-pair} + utf-32le 00001100 tcl8 \UFFFD -1 {} {Out of range} + utf-32le 00001100 replace \UFFFD -1 {} {Out of range} + utf-32le 00001100 strict {} 0 {} {Out of range} + utf-32le FFFFFFFF tcl8 \UFFFD -1 {} {Out of range} + utf-32le FFFFFFFF replace \UFFFD -1 {} {Out of range} + utf-32le FFFFFFFF strict {} 0 {} {Out of range} + + utf-32be 41 tcl8 {} -1 {solo tail} {Truncated} + utf-32be 0041 tcl8 {} -1 {solo tail} {Truncated} + utf-32be 000041 tcl8 {} -1 {solo tail} {Truncated} + utf-32be 0000D800 tcl8 \uD800 -1 {} {High-surrogate} + utf-32be 0000D800 replace \uFFFD -1 {} {High-surrogate} + utf-32be 0000D800 strict {} 0 {} {High-surrogate} + utf-32be 0000DC00 tcl8 \uDC00 -1 {} {Low-surrogate} + utf-32be 0000DC00 replace \uFFFD -1 {} {Low-surrogate} + utf-32be 0000DC00 strict {} 0 {} {Low-surrogate} + utf-32be 0000D8000000DC00 tcl8 \uD800\uDC00 -1 {} {High-low-surrogate-pair} + utf-32be 0000D8000000DC00 replace \uFFFD\uFFFD -1 {} 
{High-low-surrogate-pair} + utf-32be 0000D8000000DC00 strict {} 0 {} {High-low-surrogate-pair} + utf-32be 00110000 tcl8 \UFFFD -1 {} {Out of range} + utf-32be 00110000 replace \UFFFD -1 {} {Out of range} + utf-32be 00110000 strict {} 0 {} {Out of range} + utf-32be FFFFFFFF tcl8 \UFFFD -1 {} {Out of range} + utf-32be FFFFFFFF replace \UFFFD -1 {} {Out of range} + utf-32be FFFFFFFF strict {} 0 {} {Out of range} } + # Strings that cannot be encoded for specific encoding / profiles # {encoding string profile exptedresult expectedfailindex ctrl comment} # should be unique for test ids to be unique. @@ -682,7 +753,7 @@ set utf32-le-TODO { # If the ctrl field is empty it is treated as all of the above # Note if there is any other value by itself, it will cause the test to # be skipped. This is intentional to skip known bugs. -# TODO - other encodings and test cases +# TODO - other encodings # TODO - out of range code point (note cannot be generated by \U notation) set encUnencodableStrings { ascii \u00e0 tcl8 3f -1 {} {unencodable} @@ -883,7 +954,8 @@ testconvert cmdAH-4.3.12 { } # convertfrom ?-profile? : valid byte sequences -foreach {enc str hex} $encValidStrings { +foreach {enc str hex ctrl comment} $encValidStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set prefix A set suffix B @@ -899,6 +971,7 @@ foreach {enc str hex} $encValidStrings { # convertfrom ?-profile? : invalid byte sequences foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { + if {"knownBug" in $ctrl} continue set bytes [binary format H* $hex] set prefix A set suffix B @@ -945,12 +1018,13 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { } # convertfrom -failindex ?-profile? 
- valid data -foreach {enc str hex} $encValidStrings { +foreach {enc str hex ctrl comment} $encValidStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set prefix A set suffix B - set prefix_bytes [encoding convertto $enc A] - set suffix_bytes [encoding convertto $enc B] + set prefix_bytes [encoding convertto $enc $prefix] + set suffix_bytes [encoding convertto $enc $suffix] foreach profile $encProfiles { testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str -1] $profile testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile @@ -961,11 +1035,14 @@ foreach {enc str hex} $encValidStrings { # convertfrom -failindex ?-profile? - invalid data foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { + if {"knownBug" in $ctrl} continue # There are multiple test cases based on location of invalid bytes set bytes [binary decode hex $hex] set prefix A set suffix B - set prefixLen [string length [encoding convertto $enc $prefix]] + set prefix_bytes [encoding convertto $enc $prefix] + set suffix_bytes [encoding convertto $enc $suffix] + set prefixLen [string length $prefix_bytes] if {$ctrl eq {} || "solo" in $ctrl} { testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile } @@ -977,7 +1054,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { # Failure expected set result "" } - testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix [list $result $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $result $failidx] $profile } if {$ctrl eq {} || "tail" in $ctrl} { set expected_failidx $failidx @@ -989,7 +1066,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix$bytes [list $result $expected_failidx] $profile + 
testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $result $expected_failidx] $profile } if {$ctrl eq {} || "middle" in $ctrl} { set expected_failidx $failidx @@ -1001,7 +1078,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix$bytes$suffix [list $result $expected_failidx] $profile + testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $result $expected_failidx] $profile } } @@ -1041,7 +1118,8 @@ testconvert cmdAH-4.4.12 { # convertto ?-profile? : valid byte sequences -foreach {enc str hex} $encValidStrings { +foreach {enc str hex ctrl comment} $encValidStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set printable [printable $str] set prefix A @@ -1058,6 +1136,7 @@ foreach {enc str hex} $encValidStrings { # convertto ?-profile? : invalid byte sequences foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set printable [printable $str] set prefix A @@ -1105,7 +1184,8 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { } # convertto -failindex ?-profile? - valid data -foreach {enc str hex} $encValidStrings { +foreach {enc str hex ctrl comment} $encValidStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set printable [printable $str] set prefix A @@ -1122,6 +1202,7 @@ foreach {enc str hex} $encValidStrings { # convertto -failindex ?-profile? 
- invalid data foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { + if {"knownBug" in $ctrl} continue set bytes [binary decode hex $hex] set printable [printable $str] set prefix A -- cgit v0.12 From fa9ac8a850701b20b6c178fdbf30b705148ffd6b Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Mon, 20 Feb 2023 15:41:15 +0000 Subject: Fix replace profile handling of truncated surrogates --- generic/tclCmdAH.c | 9 +++++---- generic/tclEncoding.c | 42 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 692c75b..4dfb541 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -695,7 +695,8 @@ EncodingConvertfromObjCmd( } result = Tcl_ExternalToUtfDStringEx(encoding, bytesPtr, length, flags, &ds); - if (result != TCL_INDEX_NONE) { + if (result != TCL_INDEX_NONE && + TCL_ENCODING_PROFILE_GET(flags) != TCL_ENCODING_PROFILE_TCL8) { if (failVarObj != NULL) { if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == NULL) { return TCL_ERROR; @@ -776,7 +777,8 @@ EncodingConverttoObjCmd( stringPtr = TclGetStringFromObj(data, &length); result = Tcl_UtfToExternalDStringEx(encoding, stringPtr, length, flags, &ds); - if (result != TCL_INDEX_NONE) { + if (result != TCL_INDEX_NONE && + TCL_ENCODING_PROFILE_GET(flags) != TCL_ENCODING_PROFILE_TCL8) { if (failVarObj != NULL) { /* I hope, wide int will cover size_t data type */ if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == NULL) { @@ -795,8 +797,7 @@ EncodingConverttoObjCmd( Tcl_DStringFree(&ds); return TCL_ERROR; } - } - else if (failVarObj != NULL) { + } else if (failVarObj != NULL) { if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewIntObj(-1), TCL_LEAVE_ERR_MSG) == NULL) { return TCL_ERROR; } diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 7e5ec22..024570a 100644 --- a/generic/tclEncoding.c +++ 
b/generic/tclEncoding.c @@ -2594,7 +2594,7 @@ Utf32ToUtfProc( { const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; - int result, numChars, charLimit = INT_MAX; + int result, extra, numChars, charLimit = INT_MAX; int ch = 0; flags |= PTR2INT(clientData); @@ -2606,8 +2606,9 @@ Utf32ToUtfProc( /* * Check alignment with utf-32 (4 == sizeof(UTF-32)) */ - - if ((srcLen % 4) != 0) { + extra = srcLen % 4; + if (extra != 0) { + /* We have a truncated code unit */ result = TCL_CONVERT_MULTIBYTE; srcLen &= -4; } @@ -2669,13 +2670,27 @@ Utf32ToUtfProc( } else { dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned int); + src += 4; } if ((ch & ~0x3FF) == 0xD800) { /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } + /* + * If we had a truncated code unit at the end AND this is the last + * fragment AND profile is "replace", stick FFFD in its place. + */ + if (extra && (flags & TCL_ENCODING_END) && PROFILE_REPLACE(flags)) { + src += extra; /* Go past truncated code unit */ + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + } else { + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + result = TCL_OK; + } + } + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; @@ -2822,7 +2837,7 @@ Utf16ToUtfProc( { const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; - int result, numChars, charLimit = INT_MAX; + int result, extra, numChars, charLimit = INT_MAX; unsigned short ch = 0; flags |= PTR2INT(clientData); @@ -2835,7 +2850,8 @@ Utf16ToUtfProc( * Check alignment with utf-16 (2 == sizeof(UTF-16)) */ - if ((srcLen % 2) != 0) { + extra = srcLen % 2; + if (extra != 0) { result = TCL_CONVERT_MULTIBYTE; srcLen--; } @@ -2891,6 +2907,20 @@ Utf16ToUtfProc( /* Bug [10c2c17c32]. 
If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } + /* + * If we had a truncated code unit at the end AND this is the last + * fragment AND profile is "replace", stick FFFD in its place. + */ + if (extra && (flags & TCL_ENCODING_END) && PROFILE_REPLACE(flags)) { + ++src;/* Go past the truncated code unit */ + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + } else { + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + result = TCL_OK; + } + } + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; -- cgit v0.12 From 4d644dfb73457eb3615b30550dd31d1b48bfa7d4 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 21 Feb 2023 16:03:18 +0000 Subject: Generate test data from ICU UCM data files. SBCS only for now --- tools/ucm2tests.tcl | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 tools/ucm2tests.tcl diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl new file mode 100644 index 0000000..22ae529 --- /dev/null +++ b/tools/ucm2tests.tcl @@ -0,0 +1,185 @@ +# ucm2tests.tcl +# +# Parses given ucm files (from ICU) to generate test data +# for encodings. The generated scripts are written to stdout. +# +# tclsh ucm2tests.tcl PATH_TO_ICU_UCM_DIRECTORY +# + +namespace eval ucm { + # No means to change these currently but ...
+ variable outputChan stdout + variable errorChan stderr + variable verbose 0 + + # Map Tcl encoding name to ICU UCM file name + variable encNameMap + array set encNameMap { + cp1250 glibc-CP1250-2.1.2 + cp1251 glibc-CP1251-2.1.2 + cp1252 glibc-CP1252-2.1.2 + cp1253 glibc-CP1253-2.1.2 + cp1254 glibc-CP1254-2.1.2 + cp1255 glibc-CP1255-2.1.2 + cp1256 glibc-CP1256-2.1.2 + cp1257 glibc-CP1257-2.1.2 + cp1258 glibc-CP1258-2.1.2 + iso8859-1 glibc-ISO_8859_1-2.1.2 + iso8859-2 glibc-ISO_8859_2-2.1.2 + iso8859-3 glibc-ISO_8859_3-2.1.2 + iso8859-4 glibc-ISO_8859_4-2.1.2 + iso8859-5 glibc-ISO_8859_5-2.1.2 + iso8859-6 glibc-ISO_8859_6-2.1.2 + iso8859-7 glibc-ISO_8859_7-2.3.3 + iso8859-8 glibc-ISO_8859_8-2.3.3 + iso8859-9 glibc-ISO_8859_9-2.1.2 + iso8859-10 glibc-ISO_8859_10-2.1.2 + iso8859-11 glibc-ISO_8859_11-2.1.2 + iso8859-13 glibc-ISO_8859_13-2.1.2 + iso8859-14 glibc-ISO_8859_14-2.1.2 + iso8859-15 glibc-ISO_8859_15-2.1.2 + iso8859-16 glibc-ISO_8859_16-2.3.3 + } + + # Dictionary Character map for Tcl encoding + variable charMap +} + +proc ucm::abort {msg} { + variable errorChan + puts $errorChan $msg + exit 1 +} +proc ucm::warn {msg} { + variable errorChan + puts $errorChan $msg +} +proc ucm::log {msg} { + variable verbose + if {$verbose} { + variable errorChan + puts $errorChan $msg + } +} +proc ucm::print {s} { + variable outputChan + puts $outputChan $s +} + +proc ucm::parse_SBCS {fd} { + set result {} + while {[gets $fd line] >= 0} { + if {[string match #* $line]} { + continue + } + if {[string equal "END CHARMAP" [string trim $line]]} { + break + } + if {![regexp {^\s*<U([[:xdigit:]]{4})>\s*((\\x[[:xdigit:]]{2})+)\s*(\|(0|1|2|3|4))} $line -> unichar bytes - - precision]} { + error "Unexpected line parsing SBCS: $line" + } + set bytes [string map {\\x {}} $bytes]; # \xNN -> NN + if {$precision eq "" || $precision eq "0"} { + lappend result $unichar $bytes + } else { + # It is a fallback mapping - ignore + } + } + return $result +} + +proc ucm::generate_tests {} { + variable encNameMap
variable charMap + + array set tclNames {} + foreach encName [encoding names] { + set tclNames($encName) "" + } + foreach encName [lsort [array names encNameMap]] { + if {![info exists charMap($encName)]} { + warn "No character map read for $encName" + continue + } + unset tclNames($encName) + print "\n# $encName (generated from $encNameMap($encName))" + print "lappend encValidStrings {*}{" + foreach {unich hex} $charMap($encName) { + print " $encName \\u$unich $hex {} {}" + } + print "}; # $encName" + } + if {[array size tclNames]} { + warn "Missing encoding: [lsort [array names tclNames]]" + } +} + +proc ucm::parse_file {encName ucmPath} { + variable charMap + set fd [open $ucmPath] + try { + # Parse the metadata + unset -nocomplain state + while {[gets $fd line] >= 0} { + if {[regexp {<(code_set_name|mb_cur_max|mb_cur_min|uconv_class|subchar)>\s+(\S+)} $line -> key val]} { + set state($key) $val + } elseif {[regexp {^\s*CHARMAP\s*$} $line]} { + set state(charmap) "" + break + } else { + # Skip all else + } + } + if {![info exists state(charmap)]} { + abort "Error: $path has No CHARMAP line." + } + foreach key {code_set_name uconv_class} { + if {[info exists state($key)]} { + set state($key) [string trim $state($key) {"}] + } + } + if {[info exists charMap($encName)]} { + abort "Duplicate file for $encName ($path)" + } + if {![info exists state(uconv_class)]} { + abort "Error: $path has no uconv_class definition." + } + switch -exact -- $state(uconv_class) { + SBCS { + if {[catch { + set charMap($encName) [parse_SBCS $fd] + } result]} { + abort "Could not process $path. $result" + } + } + default { + log "Skipping $path -- not SBCS encoding." 
+ return + } + } + } finally { + close $fd + } +} + +proc ucm::expand_paths {patterns} { + set expanded {} + foreach pat $patterns { + # The file join is for \ -> / + lappend expanded {*}[glob -nocomplain [file join $pat]] + } + return $expanded +} + +proc ucm::run {} { + variable encNameMap + if {[llength $::argv] != 1} { + abort "Usage: [info nameofexecutable] $::argv0 PATHTOUCMFILES" + } + foreach {encName fname} [array get encNameMap] { + ucm::parse_file $encName [file join [lindex $::argv 0] ${fname}.ucm] + } + generate_tests +} + +ucm::run -- cgit v0.12 From 9b8fa27457c97577817b8f86b0b658a04867d7c7 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 21 Feb 2023 17:27:16 +0000 Subject: Rework ICU tests to check validity of whole charmap in one test, else too many tests. --- tests/cmdAH.test | 87 +++++++++++++++++++++++++++----------------- tools/ucm2tests.tcl | 101 ++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 144 insertions(+), 44 deletions(-) diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 1fbe6d2..3be2f14 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -186,9 +186,11 @@ set encDefaultProfile tcl8; # Should reflect the default from implementation # TODO - valid sequences for different encodings - shiftjis etc. # Note utf-16, utf-32 missing because they are automatically # generated based on le/be versions. 
-set encValidStrings { +lappend encValidStrings {*}{ ascii \u0000 00 {} {Lowest ASCII} ascii \u007F 7F knownBug {Highest ASCII} + ascii \u007D 7D {} {Brace - just to verify test scripts are escaped correctly} + ascii \u007B 7B {} {Terminating brace - just to verify test scripts are escaped correctly} utf-8 \u0000 00 {} {Unicode Table 3.7 Row 1} utf-8 \u007F 7F {} {Unicode Table 3.7 Row 1} @@ -361,9 +363,28 @@ lappend encInvalidBytes {*}{ utf-8 C080 tcl8 \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} utf-8 C080 strict {} 0 {} {C080 -> invalid} utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char} + utf-8 C0A2 tcl8 \u00C0\u00A2 -1 {} {websec.github.io - A} + utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A} + utf-8 C0A2 strict {} 0 {} {websec.github.io - A} + utf-8 C0A7 tcl8 \u00C0\u00A7 -1 {} {websec.github.io - double quote} + utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote} + utf-8 C0A7 strict {} 0 {} {websec.github.io - double quote} + utf-8 C0AE tcl8 \u00C0\u00AE -1 {} {websec.github.io - full stop} + utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop} + utf-8 C0AE strict {} 0 {} {websec.github.io - full stop} + utf-8 C0AF tcl8 \u00C0\u00AF -1 {} {websec.github.io - solidus} + utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus} + utf-8 C0AF strict {} 0 {} {websec.github.io - solidus} + utf-8 C1 tcl8 \u00C1 -1 {} {C1 is invalid everywhere} utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere} utf-8 C1 strict {} 0 {} {C1 is invalid everywhere} + utf-8 C181 tcl8 \u00C1\u0081 -1 {} {websec.github.io - base test (A)} + utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)} + utf-8 C181 strict {} 0 {} {websec.github.io - base test (A)} + utf-8 C19C tcl8 \u00C1\u0153 -1 {} {websec.github.io - reverse solidus} + utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus} + utf-8 C19C strict {} 0 {} {websec.github.io - reverse solidus} utf-8 C2 
tcl8 \u00C2 -1 {} {Missing trail byte} utf-8 C2 replace \uFFFD -1 {} {Missing trail byte} @@ -387,6 +408,9 @@ lappend encInvalidBytes {*}{ utf-8 E080 tcl8 \u00E0\u20AC -1 {} {First trail byte must be A0:BF} utf-8 E080 replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} utf-8 E080 strict {} 0 {} {First trail byte must be A0:BF} + utf-8 E0819C tcl8 \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus} + utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus} + utf-8 E0819C strict {} 0 {} {websec.github.io - reverse solidus} utf-8 E09F tcl8 \u00E0\u0178 -1 {} {First trail byte must be A0:BF} utf-8 E09F replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} utf-8 E09F strict {} 0 {} {First trail byte must be A0:BF} @@ -526,6 +550,9 @@ lappend encInvalidBytes {*}{ utf-8 F0 tcl8 \u00F0 -1 {} {Missing trail byte} utf-8 F0 replace \uFFFD -1 {} {Missing trail byte} utf-8 F0 strict {} 0 {} {Missing trail byte} + utf-8 F080 tcl8 \u00F0\u20AC -1 {} {First trail byte must be 90:BF} + utf-8 F080 replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} + utf-8 F080 strict {} 0 {} {First trail byte must be 90:BF} utf-8 F08F tcl8 \u00F0\u8F -1 {} {First trail byte must be 90:BF} utf-8 F08F replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} utf-8 F08F strict {} 0 {} {First trail byte must be 90:BF} @@ -755,7 +782,7 @@ lappend encInvalidBytes {*}{ # be skipped. This is intentional to skip known bugs. # TODO - other encodings # TODO - out of range code point (note cannot be generated by \U notation) -set encUnencodableStrings { +lappend encUnencodableStrings {*}{ ascii \u00e0 tcl8 3f -1 {} {unencodable} ascii \u00e0 strict {} 0 {} {unencodable} @@ -768,12 +795,6 @@ set encUnencodableStrings { utf-8 \uDC00 strict {} 0 {} High-surrogate } -if {$::tcl_platform(byteOrder) eq "littleEndian"} { - set endian le -} else { - set endian be -} - # Maps utf-{16,32}{le,be} to utf-16, utf-32 and # others to "". 
Used to test utf-16, utf-32 based # on system endianness @@ -881,19 +902,19 @@ proc testprofile {id converter enc profile data result args} { # Generates tests for compiled and uncompiled implementation. # Also generates utf-{16,32} tests if passed encoding is utf-{16,32}{le,be} # The enc and profile are appended to id to generate the test id -proc testfailindex {id converter enc data result {profile default}} { - testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result +proc testfailindex {id converter enc data result failidx {profile default}} { + testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc [list $data]\] \[set idx\]" [list $result $failidx] if {[set enc2 [endianUtf $enc]] ne ""} { # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc2.$profile "list \[encoding $converter -profile $profile -failindex idx $enc2 $data] \[set idx]" $result + testconvert $id.$enc2.$profile "list \[encoding $converter -profile $profile -failindex idx $enc2 [list $data]\] \[set idx]" [list $result $failidx] } # If this is the default profile, generate a test without specifying profile if {$profile eq $::encDefaultProfile} { - testconvert $id.$enc.default "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + testconvert $id.$enc.default "list \[encoding $converter -failindex idx $enc [list $data]\] \[set idx]" [list $result $failidx] if {[set enc2 [endianUtf $enc]] ne ""} { # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc2.default "list \[encoding $converter -failindex idx $enc2 $data] \[set idx]" $result + testconvert $id.$enc2.default "list \[encoding $converter -failindex idx $enc2 [list $data]\] \[set idx]" [list $result $failidx] } } } @@ -962,10 +983,10 @@ foreach {enc str hex ctrl comment} $encValidStrings { set prefix_bytes [encoding convertto $enc A] set suffix_bytes [encoding convertto $enc B] foreach 
profile $encProfiles { - testfailindex cmdAH-4.3.13.$hex.solo convertfrom $enc $bytes [list $str -1] $profile - testfailindex cmdAH-4.3.13.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile - testfailindex cmdAH-4.3.13.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile - testfailindex cmdAH-4.3.13.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile + testprofile cmdAH-4.3.13.$hex.solo convertfrom $enc $profile $bytes $str + testprofile cmdAH-4.3.13.$hex.lead convertfrom $enc $profile $bytes$suffix_bytes $str$suffix + testprofile cmdAH-4.3.13.$hex.tail convertfrom $enc $profile $prefix_bytes$bytes $prefix$str + testprofile cmdAH-4.3.13.$hex.middle convertfrom $enc $profile $prefix_bytes$bytes$suffix_bytes $prefix$str$suffix } } @@ -1026,10 +1047,10 @@ foreach {enc str hex ctrl comment} $encValidStrings { set prefix_bytes [encoding convertto $enc $prefix] set suffix_bytes [encoding convertto $enc $suffix] foreach profile $encProfiles { - testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str -1] $profile - testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile - testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile - testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile + testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes $str -1 $profile + testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes $str$suffix -1 $profile + testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes $prefix$str -1 $profile + testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes $prefix$str$suffix -1 $profile } } @@ -1044,7 +1065,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set suffix_bytes 
[encoding convertto $enc $suffix] set prefixLen [string length $prefix_bytes] if {$ctrl eq {} || "solo" in $ctrl} { - testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes $str $failidx $profile } if {$ctrl eq {} || "lead" in $ctrl} { if {$failidx == -1} { @@ -1054,7 +1075,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { # Failure expected set result "" } - testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $result $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes $result $failidx $profile } if {$ctrl eq {} || "tail" in $ctrl} { set expected_failidx $failidx @@ -1066,7 +1087,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $result $expected_failidx] $profile + testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes $result $expected_failidx $profile } if {$ctrl eq {} || "middle" in $ctrl} { set expected_failidx $failidx @@ -1078,7 +1099,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $result $expected_failidx] $profile + testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes $result $expected_failidx $profile } } @@ -1193,10 +1214,10 @@ foreach {enc str hex ctrl comment} $encValidStrings { set prefix_bytes [encoding convertto $enc A] set suffix_bytes [encoding convertto $enc B] foreach profile $encProfiles { - testfailindex cmdAH-4.4.14.$enc.$printable.solo convertto $enc $str [list $bytes -1] $profile - testfailindex cmdAH-4.4.14.$enc.$printable.lead convertto $enc $str$suffix [list 
$bytes$suffix_bytes -1] $profile - testfailindex cmdAH-4.4.14.$enc.$printable.tail convertto $enc $prefix$str [list $prefix_bytes$bytes -1] $profile - testfailindex cmdAH-4.4.14.$enc.$printable.middle convertto $enc $prefix$str$suffix [list $prefix_bytes$bytes$suffix_bytes -1] $profile + testfailindex cmdAH-4.4.14.$enc.$printable.solo convertto $enc $str $bytes -1 $profile + testfailindex cmdAH-4.4.14.$enc.$printable.lead convertto $enc $str$suffix $bytes$suffix_bytes -1 $profile + testfailindex cmdAH-4.4.14.$enc.$printable.tail convertto $enc $prefix$str $prefix_bytes$bytes -1 $profile + testfailindex cmdAH-4.4.14.$enc.$printable.middle convertto $enc $prefix$str$suffix $prefix_bytes$bytes$suffix_bytes -1 $profile } } @@ -1209,7 +1230,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { set suffix B set prefixLen [string length [encoding convertto $enc $prefix]] if {$ctrl eq {} || "solo" in $ctrl} { - testfailindex cmdAH-4.4.14.$printable.solo convertto $enc $str [list $bytes $failidx] $profile + testfailindex cmdAH-4.4.14.$printable.solo convertto $enc $str $bytes $failidx $profile } if {$ctrl eq {} || "lead" in $ctrl} { if {$failidx == -1} { @@ -1219,7 +1240,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { # Failure expected set result "" } - testfailindex cmdAH-4.4.14.$printable.lead convertto $enc $str$suffix [list $result $failidx] $profile + testfailindex cmdAH-4.4.14.$printable.lead convertto $enc $str$suffix $result $failidx $profile } if {$ctrl eq {} || "tail" in $ctrl} { set expected_failidx $failidx @@ -1231,7 +1252,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.4.14.$printable.tail convertto $enc $prefix$str [list $result $expected_failidx] $profile + testfailindex cmdAH-4.4.14.$printable.tail convertto $enc $prefix$str $result $expected_failidx $profile } if {$ctrl eq {} || "middle" 
in $ctrl} { set expected_failidx $failidx @@ -1243,7 +1264,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix [list $result $expected_failidx] $profile + testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix $result $expected_failidx $profile } } diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl index 22ae529..e971631 100644 --- a/tools/ucm2tests.tcl +++ b/tools/ucm2tests.tcl @@ -1,14 +1,15 @@ # ucm2tests.tcl # # Parses given ucm files (from ICU) to generate test data -# for encodings. The generated scripts are written to stdout. +# for encodings. # -# tclsh ucmtotests.tcl PATH_TO_ICU_UCM_DIRECTORY +# tclsh ucm2tests.tcl PATH_TO_ICU_UCM_DIRECTORY ?OUTPUTPATH? # namespace eval ucm { # No means to change these currently but ... - variable outputChan stdout + variable outputPath + variable outputChan variable errorChan stderr variable verbose 0 @@ -24,6 +25,7 @@ namespace eval ucm { cp1256 glibc-CP1256-2.1.2 cp1257 glibc-CP1257-2.1.2 cp1258 glibc-CP1258-2.1.2 + gb1988 glibc-GB_1988_80-2.3.3 iso8859-1 glibc-ISO_8859_1-2.1.2 iso8859-2 glibc-ISO_8859_2-2.1.2 iso8859-3 glibc-ISO_8859_3-2.1.2 @@ -91,27 +93,99 @@ proc ucm::parse_SBCS {fd} { proc ucm::generate_tests {} { variable encNameMap variable charMap + variable outputPath + variable outputChan + + if {[info exists outputPath]} { + set outputChan [open $outputPath w] + } else { + set outputChan stdout + } array set tclNames {} foreach encName [encoding names] { set tclNames($encName) "" } - foreach encName [lsort [array names encNameMap]] { + + # Common procedures + print { +# This file is automatically generated by ucm2tests.tcl. +# Edits will be overwritten on next generation. +# +# Generates tests comparing Tcl encodings to ICU. +# The generated file is NOT standalone. It should be sourced into a test script. 
+ +proc ucmConvertfromMismatches {enc map} { + set mismatches {} + foreach {unihex hex} $map { + set unich [subst "\\U$unihex"] + if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} { + lappend mismatches "<[printable $unich],$hex>" + } + } + return $mismatches +} +proc ucmConverttoMismatches {enc map} { + set mismatches {} + foreach {unihex hex} $map { + set unich [subst "\\U$unihex"] + if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} { + lappend mismatches "<[printable $unich],$hex>" + } + } + return $mismatches +} +if {[info commands printable] eq ""} { + proc printable {s} { + set print "" + foreach c [split $s ""] { + set i [scan $c %c] + if {[string is print $c] && ($i <= 127)} { + append print $c + } elseif {$i <= 0xff} { + append print \\x[format %02X $i] + } elseif {$i <= 0xffff} { + append print \\u[format %04X $i] + } else { + append print \\U[format %08X $i] + } + } + return $print + } +} + } + foreach encName [lsort -dictionary [array names encNameMap]] { if {![info exists charMap($encName)]} { warn "No character map read for $encName" continue } unset tclNames($encName) - print "\n# $encName (generated from $encNameMap($encName))" - print "lappend encValidStrings {*}{" - foreach {unich hex} $charMap($encName) { - print " $encName \\u$unich $hex {} {}" + + print "\n#\n# $encName (generated from $encNameMap($encName))" + print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{" + print " ucmConvertfromMismatches $encName {$charMap($encName)}" + print "\} -result {}" + print "\ntest encoding-convertto-ucmCompare-$encName {Compare against ICU UCM} -body \{" + print " ucmConverttoMismatches $encName {$charMap($encName)}" + print "\} -result {}" + if {0} { + # This will generate individual tests for every char + # and test in lead, tail, middle, solo configurations + # but takes considerable time + print "lappend encValidStrings {*}{" + foreach {unich hex} 
$charMap($encName) { + print " $encName \\u$unich $hex {} {}" + } + print "}; # $encName" } - print "}; # $encName" } if {[array size tclNames]} { warn "Missing encoding: [lsort [array names tclNames]]" } + if {[info exists outputPath]} { + close $outputChan + unset outputChan + } } proc ucm::parse_file {encName ucmPath} { @@ -173,8 +247,13 @@ proc ucm::expand_paths {patterns} { proc ucm::run {} { variable encNameMap - if {[llength $::argv] != 1} { - abort "Usage: [info nameofexecutable] $::argv0 PATHTOUCMFILES" + variable outputPath + switch [llength $::argv] { + 2 {set outputPath [lindex $::argv 1]} + 1 {} + default { + abort "Usage: [info nameofexecutable] $::argv0 path/to/icu/ucm/data ?outputfile?" + } } foreach {encName fname} [array get encNameMap] { ucm::parse_file $encName [file join [lindex $::argv 0] ${fname}.ucm] -- cgit v0.12 From 293504812606130380d7240fddbbdc573b9dae8c Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Wed, 22 Feb 2023 13:42:55 +0000 Subject: Add ICU tests for unmapped characters. --- tests/cmdAH.test | 4 + tests/icuUcmTests.tcl | 1891 +++++++++++++++++++++++++++++++++++++++++++++++++ tools/ucm2tests.tcl | 156 +++- 3 files changed, 2017 insertions(+), 34 deletions(-) create mode 100644 tests/icuUcmTests.tcl diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 3be2f14..cfde678 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -795,6 +795,10 @@ lappend encUnencodableStrings {*}{ utf-8 \uDC00 strict {} 0 {} High-surrogate } +# Generated tests comparing against ICU +# TODO - commented out for now as generating a lot of mismatches. +# source [file join [file dirname [info script]] icuUcmTests.tcl] + # Maps utf-{16,32}{le,be} to utf-16, utf-32 and # others to "". 
Used to test utf-16, utf-32 based # on system endianness diff --git a/tests/icuUcmTests.tcl b/tests/icuUcmTests.tcl new file mode 100644 index 0000000..0c4071f --- /dev/null +++ b/tests/icuUcmTests.tcl @@ -0,0 +1,1891 @@ + +# This file is automatically generated by ucm2tests.tcl. +# Edits will be overwritten on next generation. +# +# Generates tests comparing Tcl encodings to ICU. +# The generated file is NOT standalone. It should be sourced into a test script. + +proc ucmConvertfromMismatches {enc map} { + set mismatches {} + foreach {unihex hex} $map { + set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits + set unich [subst "\\U$unihex"] + if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} { + lappend mismatches "<[printable $unich],$hex>" + } + } + return $mismatches +} +proc ucmConverttoMismatches {enc map} { + set mismatches {} + foreach {unihex hex} $map { + set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits + set unich [subst "\\U$unihex"] + if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} { + lappend mismatches "<[printable $unich],$hex>" + } + } + return $mismatches +} +if {[info commands printable] eq ""} { + proc printable {s} { + set print "" + foreach c [split $s ""] { + set i [scan $c %c] + if {[string is print $c] && ($i <= 127)} { + append print $c + } elseif {$i <= 0xff} { + append print \\x[format %02X $i] + } elseif {$i <= 0xffff} { + append print \\u[format %04X $i] + } else { + append print \\U[format %08X $i] + } + } + return $print + } +} + + +# +# cp1250 (generated from glibc-CP1250-2.1.2) + +test encoding-convertfrom-ucmCompare-cp1250 {Compare against ICU UCM} -body { + ucmConvertfromMismatches cp1250 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 
001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A4 A4 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00B0 B0 00B1 B1 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00BB BB 00C1 C1 00C2 C2 00C4 C4 00C7 C7 00C9 C9 00CB CB 00CD CD 00CE CE 00D3 D3 00D4 D4 00D6 D6 00D7 D7 00DA DA 00DC DC 00DD DD 00DF DF 00E1 E1 00E2 E2 00E4 E4 00E7 E7 00E9 E9 00EB EB 00ED ED 00EE EE 00F3 F3 00F4 F4 00F6 F6 00F7 F7 00FA FA 00FC FC 00FD FD 0102 C3 0103 E3 0104 A5 0105 B9 0106 C6 0107 E6 010C C8 010D E8 010E CF 010F EF 0110 D0 0111 F0 0118 CA 0119 EA 011A CC 011B EC 0139 C5 013A E5 013D BC 013E BE 0141 A3 0142 B3 0143 D1 0144 F1 0147 D2 0148 F2 0150 D5 0151 F5 0154 C0 0155 E0 0158 D8 0159 F8 015A 8C 015B 9C 015E AA 015F BA 0160 8A 0161 9A 0162 DE 0163 FE 0164 8D 0165 9D 016E D9 016F F9 0170 DB 0171 FB 0179 8F 017A 9F 017B AF 017C BF 017D 8E 017E 9E 02C7 A1 02D8 A2 02D9 FF 02DB B2 02DD BD 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +test encoding-convertto-ucmCompare-cp1250 {Compare against ICU UCM} -body { + ucmConverttoMismatches cp1250 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 
000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A4 A4 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00B0 B0 00B1 B1 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00BB BB 00C1 C1 00C2 C2 00C4 C4 00C7 C7 00C9 C9 00CB CB 00CD CD 00CE CE 00D3 D3 00D4 D4 00D6 D6 00D7 D7 00DA DA 00DC DC 00DD DD 00DF DF 00E1 E1 00E2 E2 00E4 E4 00E7 E7 00E9 E9 00EB EB 00ED ED 00EE EE 00F3 F3 00F4 F4 00F6 F6 00F7 F7 00FA FA 00FC FC 00FD FD 0102 C3 0103 E3 0104 A5 0105 B9 0106 C6 0107 E6 010C C8 010D E8 010E CF 010F EF 0110 D0 0111 F0 0118 CA 0119 EA 011A CC 011B EC 0139 C5 013A E5 013D BC 013E BE 0141 A3 0142 B3 0143 D1 0144 F1 0147 D2 0148 F2 0150 D5 0151 F5 0154 C0 0155 E0 0158 D8 0159 F8 015A 8C 015B 9C 015E AA 015F BA 0160 8A 0161 9A 0162 DE 0163 FE 0164 8D 0165 9D 016E D9 016F F9 0170 DB 0171 FB 0179 8F 017A 9F 017B AF 017C BF 017D 8E 017E 9E 02C7 A1 02D8 A2 02D9 FF 02DB B2 02DD BD 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +# cp1250 - invalid byte sequences +lappend encInvalidBytes {*}{ + cp1250 81 
tcl8 \U00000081 -1 {} {} + cp1250 81 replace \uFFFD -1 {} {} + cp1250 81 strict {} 0 {} {} + cp1250 83 tcl8 \U00000083 -1 {} {} + cp1250 83 replace \uFFFD -1 {} {} + cp1250 83 strict {} 0 {} {} + cp1250 88 tcl8 \U00000088 -1 {} {} + cp1250 88 replace \uFFFD -1 {} {} + cp1250 88 strict {} 0 {} {} + cp1250 90 tcl8 \U00000090 -1 {} {} + cp1250 90 replace \uFFFD -1 {} {} + cp1250 90 strict {} 0 {} {} + cp1250 98 tcl8 \U00000098 -1 {} {} + cp1250 98 replace \uFFFD -1 {} {} + cp1250 98 strict {} 0 {} {} +}; # cp1250 + +# cp1250 - invalid byte sequences +lappend encUnencodableStrings {*}{ + cp1250 \U00000080 tcl8 1A -1 {} {} + cp1250 \U00000080 replace 1A -1 {} {} + cp1250 \U00000080 strict {} 0 {} {} + cp1250 \U00000400 tcl8 1A -1 {} {} + cp1250 \U00000400 replace 1A -1 {} {} + cp1250 \U00000400 strict {} 0 {} {} + cp1250 \U0000D800 tcl8 1A -1 {} {} + cp1250 \U0000D800 replace 1A -1 {} {} + cp1250 \U0000D800 strict {} 0 {} {} + cp1250 \U0000DC00 tcl8 1A -1 {} {} + cp1250 \U0000DC00 replace 1A -1 {} {} + cp1250 \U0000DC00 strict {} 0 {} {} + cp1250 \U00010000 tcl8 1A -1 {} {} + cp1250 \U00010000 replace 1A -1 {} {} + cp1250 \U00010000 strict {} 0 {} {} + cp1250 \U0010FFFF tcl8 1A -1 {} {} + cp1250 \U0010FFFF replace 1A -1 {} {} + cp1250 \U0010FFFF strict {} 0 {} {} +}; # cp1250 + +# +# cp1251 (generated from glibc-CP1251-2.1.2) + +test encoding-convertfrom-ucmCompare-cp1251 {Compare against ICU UCM} -body { + ucmConvertfromMismatches cp1251 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 
43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A4 A4 00A6 A6 00A7 A7 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00B0 B0 00B1 B1 00B5 B5 00B6 B6 00B7 B7 00BB BB 0401 A8 0402 80 0403 81 0404 AA 0405 BD 0406 B2 0407 AF 0408 A3 0409 8A 040A 8C 040B 8E 040C 8D 040E A1 040F 8F 0410 C0 0411 C1 0412 C2 0413 C3 0414 C4 0415 C5 0416 C6 0417 C7 0418 C8 0419 C9 041A CA 041B CB 041C CC 041D CD 041E CE 041F CF 0420 D0 0421 D1 0422 D2 0423 D3 0424 D4 0425 D5 0426 D6 0427 D7 0428 D8 0429 D9 042A DA 042B DB 042C DC 042D DD 042E DE 042F DF 0430 E0 0431 E1 0432 E2 0433 E3 0434 E4 0435 E5 0436 E6 0437 E7 0438 E8 0439 E9 043A EA 043B EB 043C EC 043D ED 043E EE 043F EF 0440 F0 0441 F1 0442 F2 0443 F3 0444 F4 0445 F5 0446 F6 0447 F7 0448 F8 0449 F9 044A FA 044B FB 044C FC 044D FD 044E FE 044F FF 0451 B8 0452 90 0453 83 0454 BA 0455 BE 0456 B3 0457 BF 0458 BC 0459 9A 045A 9C 045B 9E 045C 9D 045E A2 045F 9F 0490 A5 0491 B4 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 88 2116 B9 2122 99} +} -result {} + +test encoding-convertto-ucmCompare-cp1251 {Compare against ICU UCM} -body { + ucmConverttoMismatches cp1251 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 
2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A4 A4 00A6 A6 00A7 A7 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00B0 B0 00B1 B1 00B5 B5 00B6 B6 00B7 B7 00BB BB 0401 A8 0402 80 0403 81 0404 AA 0405 BD 0406 B2 0407 AF 0408 A3 0409 8A 040A 8C 040B 8E 040C 8D 040E A1 040F 8F 0410 C0 0411 C1 0412 C2 0413 C3 0414 C4 0415 C5 0416 C6 0417 C7 0418 C8 0419 C9 041A CA 041B CB 041C CC 041D CD 041E CE 041F CF 0420 D0 0421 D1 0422 D2 0423 D3 0424 D4 0425 D5 0426 D6 0427 D7 0428 D8 0429 D9 042A DA 042B DB 042C DC 042D DD 042E DE 042F DF 0430 E0 0431 E1 0432 E2 0433 E3 0434 E4 0435 E5 0436 E6 0437 E7 0438 E8 0439 E9 043A EA 043B EB 043C EC 043D ED 043E EE 043F EF 0440 F0 0441 F1 0442 F2 0443 F3 0444 F4 0445 F5 0446 F6 0447 F7 0448 F8 0449 F9 044A FA 044B FB 044C FC 044D FD 044E FE 044F FF 0451 B8 0452 90 0453 83 0454 BA 0455 BE 0456 B3 0457 BF 0458 BC 0459 9A 045A 9C 045B 9E 045C 9D 045E A2 045F 9F 0490 A5 0491 B4 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 88 2116 B9 2122 99} +} -result {} + +# cp1251 - invalid byte sequences +lappend encInvalidBytes {*}{ + cp1251 98 tcl8 \U00000098 -1 {} {} + cp1251 98 replace \uFFFD -1 {} {} + cp1251 98 strict {} 0 {} {} +}; # cp1251 + +# cp1251 - invalid byte sequences +lappend encUnencodableStrings {*}{ + cp1251 \U00000080 tcl8 1A -1 {} {} + cp1251 \U00000080 
replace 1A -1 {} {} + cp1251 \U00000080 strict {} 0 {} {} + cp1251 \U00000400 tcl8 1A -1 {} {} + cp1251 \U00000400 replace 1A -1 {} {} + cp1251 \U00000400 strict {} 0 {} {} + cp1251 \U0000D800 tcl8 1A -1 {} {} + cp1251 \U0000D800 replace 1A -1 {} {} + cp1251 \U0000D800 strict {} 0 {} {} + cp1251 \U0000DC00 tcl8 1A -1 {} {} + cp1251 \U0000DC00 replace 1A -1 {} {} + cp1251 \U0000DC00 strict {} 0 {} {} + cp1251 \U00010000 tcl8 1A -1 {} {} + cp1251 \U00010000 replace 1A -1 {} {} + cp1251 \U00010000 strict {} 0 {} {} + cp1251 \U0010FFFF tcl8 1A -1 {} {} + cp1251 \U0010FFFF replace 1A -1 {} {} + cp1251 \U0010FFFF strict {} 0 {} {} +}; # cp1251 + +# +# cp1252 (generated from glibc-CP1252-2.1.2) + +test encoding-convertfrom-ucmCompare-cp1252 {Compare against ICU UCM} -body { + ucmConvertfromMismatches cp1252 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 
00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D0 D0 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DD DD 00DE DE 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F0 F0 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FD FD 00FE FE 00FF FF 0152 8C 0153 9C 0160 8A 0161 9A 0178 9F 017D 8E 017E 9E 0192 83 02C6 88 02DC 98 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +test encoding-convertto-ucmCompare-cp1252 {Compare against ICU UCM} -body { + ucmConverttoMismatches cp1252 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A1 A1 00A2 A2 
00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D0 D0 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DD DD 00DE DE 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F0 F0 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FD FD 00FE FE 00FF FF 0152 8C 0153 9C 0160 8A 0161 9A 0178 9F 017D 8E 017E 9E 0192 83 02C6 88 02DC 98 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +# cp1252 - invalid byte sequences +lappend encInvalidBytes {*}{ + cp1252 81 tcl8 \U00000081 -1 {} {} + cp1252 81 replace \uFFFD -1 {} {} + cp1252 81 strict {} 0 {} {} + cp1252 8D tcl8 \U0000008D -1 {} {} + cp1252 8D replace \uFFFD -1 {} {} + cp1252 8D strict {} 0 {} {} + cp1252 8F tcl8 \U0000008F -1 {} {} + cp1252 8F replace \uFFFD -1 {} {} + cp1252 8F strict {} 0 {} {} + cp1252 90 tcl8 \U00000090 -1 {} {} + cp1252 90 replace \uFFFD -1 {} {} + cp1252 90 strict {} 0 {} {} + cp1252 9D tcl8 \U0000009D -1 {} {} + cp1252 9D replace \uFFFD -1 {} {} + cp1252 9D strict {} 0 {} {} +}; # cp1252 + +# cp1252 - invalid byte sequences +lappend encUnencodableStrings {*}{ + cp1252 \U00000080 tcl8 1A -1 {} {} + cp1252 \U00000080 replace 1A -1 {} {} + cp1252 \U00000080 strict {} 0 {} {} + cp1252 \U00000400 tcl8 1A -1 {} {} + cp1252 \U00000400 replace 1A -1 {} {} + cp1252 \U00000400 strict {} 0 {} {} + cp1252 \U0000D800 tcl8 1A -1 {} {} + cp1252 \U0000D800 replace 1A -1 {} {} + cp1252 \U0000D800 strict {} 0 {} {} + cp1252 
\U0000DC00 tcl8 1A -1 {} {} + cp1252 \U0000DC00 replace 1A -1 {} {} + cp1252 \U0000DC00 strict {} 0 {} {} + cp1252 \U00010000 tcl8 1A -1 {} {} + cp1252 \U00010000 replace 1A -1 {} {} + cp1252 \U00010000 strict {} 0 {} {} + cp1252 \U0010FFFF tcl8 1A -1 {} {} + cp1252 \U0010FFFF replace 1A -1 {} {} + cp1252 \U0010FFFF strict {} 0 {} {} +}; # cp1252 + +# +# cp1253 (generated from glibc-CP1253-2.1.2) + +test encoding-convertfrom-ucmCompare-cp1253 {Compare against ICU UCM} -body { + ucmConvertfromMismatches cp1253 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B5 B5 00B6 B6 00B7 B7 00BB BB 00BD BD 0192 83 0384 B4 0385 A1 0386 A2 0388 B8 0389 B9 038A BA 038C BC 038E BE 038F BF 0390 C0 0391 C1 0392 C2 0393 C3 0394 C4 0395 C5 0396 C6 0397 C7 0398 C8 0399 C9 039A CA 039B CB 039C CC 039D CD 039E CE 039F CF 03A0 D0 03A1 D1 03A3 D3 03A4 D4 03A5 D5 03A6 D6 03A7 D7 03A8 D8 03A9 D9 03AA DA 
03AB DB 03AC DC 03AD DD 03AE DE 03AF DF 03B0 E0 03B1 E1 03B2 E2 03B3 E3 03B4 E4 03B5 E5 03B6 E6 03B7 E7 03B8 E8 03B9 E9 03BA EA 03BB EB 03BC EC 03BD ED 03BE EE 03BF EF 03C0 F0 03C1 F1 03C2 F2 03C3 F3 03C4 F4 03C5 F5 03C6 F6 03C7 F7 03C8 F8 03C9 F9 03CA FA 03CB FB 03CC FC 03CD FD 03CE FE 2013 96 2014 97 2015 AF 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +test encoding-convertto-ucmCompare-cp1253 {Compare against ICU UCM} -body { + ucmConverttoMismatches cp1253 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B5 B5 00B6 B6 00B7 B7 00BB BB 00BD BD 0192 83 0384 B4 0385 A1 0386 A2 0388 B8 0389 B9 038A BA 038C BC 038E BE 038F BF 0390 C0 0391 C1 0392 C2 0393 C3 0394 C4 0395 C5 0396 C6 0397 C7 0398 C8 0399 C9 039A CA 039B CB 039C CC 039D CD 039E CE 039F CF 03A0 D0 03A1 D1 03A3 D3 03A4 D4 03A5 D5 
03A6 D6 03A7 D7 03A8 D8 03A9 D9 03AA DA 03AB DB 03AC DC 03AD DD 03AE DE 03AF DF 03B0 E0 03B1 E1 03B2 E2 03B3 E3 03B4 E4 03B5 E5 03B6 E6 03B7 E7 03B8 E8 03B9 E9 03BA EA 03BB EB 03BC EC 03BD ED 03BE EE 03BF EF 03C0 F0 03C1 F1 03C2 F2 03C3 F3 03C4 F4 03C5 F5 03C6 F6 03C7 F7 03C8 F8 03C9 F9 03CA FA 03CB FB 03CC FC 03CD FD 03CE FE 2013 96 2014 97 2015 AF 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +# cp1253 - invalid byte sequences +lappend encInvalidBytes {*}{ + cp1253 81 tcl8 \U00000081 -1 {} {} + cp1253 81 replace \uFFFD -1 {} {} + cp1253 81 strict {} 0 {} {} + cp1253 88 tcl8 \U00000088 -1 {} {} + cp1253 88 replace \uFFFD -1 {} {} + cp1253 88 strict {} 0 {} {} + cp1253 8A tcl8 \U0000008A -1 {} {} + cp1253 8A replace \uFFFD -1 {} {} + cp1253 8A strict {} 0 {} {} + cp1253 8C tcl8 \U0000008C -1 {} {} + cp1253 8C replace \uFFFD -1 {} {} + cp1253 8C strict {} 0 {} {} + cp1253 8D tcl8 \U0000008D -1 {} {} + cp1253 8D replace \uFFFD -1 {} {} + cp1253 8D strict {} 0 {} {} + cp1253 8E tcl8 \U0000008E -1 {} {} + cp1253 8E replace \uFFFD -1 {} {} + cp1253 8E strict {} 0 {} {} + cp1253 8F tcl8 \U0000008F -1 {} {} + cp1253 8F replace \uFFFD -1 {} {} + cp1253 8F strict {} 0 {} {} + cp1253 90 tcl8 \U00000090 -1 {} {} + cp1253 90 replace \uFFFD -1 {} {} + cp1253 90 strict {} 0 {} {} + cp1253 98 tcl8 \U00000098 -1 {} {} + cp1253 98 replace \uFFFD -1 {} {} + cp1253 98 strict {} 0 {} {} + cp1253 9A tcl8 \U0000009A -1 {} {} + cp1253 9A replace \uFFFD -1 {} {} + cp1253 9A strict {} 0 {} {} + cp1253 9C tcl8 \U0000009C -1 {} {} + cp1253 9C replace \uFFFD -1 {} {} + cp1253 9C strict {} 0 {} {} + cp1253 9D tcl8 \U0000009D -1 {} {} + cp1253 9D replace \uFFFD -1 {} {} + cp1253 9D strict {} 0 {} {} + cp1253 9E tcl8 \U0000009E -1 {} {} + cp1253 9E replace \uFFFD -1 {} {} + cp1253 9E strict {} 0 {} {} + cp1253 9F tcl8 \U0000009F -1 {} {} + cp1253 9F replace \uFFFD -1 {} {} + cp1253 9F strict {} 0 {} {} + 
cp1253 AA tcl8 \U000000AA -1 {} {} + cp1253 AA replace \uFFFD -1 {} {} + cp1253 AA strict {} 0 {} {} + cp1253 D2 tcl8 \U000000D2 -1 {} {} + cp1253 D2 replace \uFFFD -1 {} {} + cp1253 D2 strict {} 0 {} {} + cp1253 FF tcl8 \U000000FF -1 {} {} + cp1253 FF replace \uFFFD -1 {} {} + cp1253 FF strict {} 0 {} {} +}; # cp1253 + +# cp1253 - invalid byte sequences +lappend encUnencodableStrings {*}{ + cp1253 \U00000080 tcl8 1A -1 {} {} + cp1253 \U00000080 replace 1A -1 {} {} + cp1253 \U00000080 strict {} 0 {} {} + cp1253 \U00000400 tcl8 1A -1 {} {} + cp1253 \U00000400 replace 1A -1 {} {} + cp1253 \U00000400 strict {} 0 {} {} + cp1253 \U0000D800 tcl8 1A -1 {} {} + cp1253 \U0000D800 replace 1A -1 {} {} + cp1253 \U0000D800 strict {} 0 {} {} + cp1253 \U0000DC00 tcl8 1A -1 {} {} + cp1253 \U0000DC00 replace 1A -1 {} {} + cp1253 \U0000DC00 strict {} 0 {} {} + cp1253 \U00010000 tcl8 1A -1 {} {} + cp1253 \U00010000 replace 1A -1 {} {} + cp1253 \U00010000 strict {} 0 {} {} + cp1253 \U0010FFFF tcl8 1A -1 {} {} + cp1253 \U0010FFFF replace 1A -1 {} {} + cp1253 \U0010FFFF strict {} 0 {} {} +}; # cp1253 + +# +# cp1254 (generated from glibc-CP1254-2.1.2) + +test encoding-convertfrom-ucmCompare-cp1254 {Compare against ICU UCM} -body { + ucmConvertfromMismatches cp1254 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 
005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FF FF 011E D0 011F F0 0130 DD 0131 FD 0152 8C 0153 9C 015E DE 015F FE 0160 8A 0161 9A 0178 9F 0192 83 02C6 88 02DC 98 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +test encoding-convertto-ucmCompare-cp1254 {Compare against ICU UCM} -body { + ucmConverttoMismatches cp1254 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 
004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FF FF 011E D0 011F F0 0130 DD 0131 FD 0152 8C 0153 9C 015E DE 015F FE 0160 8A 0161 9A 0178 9F 0192 83 02C6 88 02DC 98 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +# cp1254 - invalid byte sequences +lappend encInvalidBytes {*}{ + cp1254 81 tcl8 \U00000081 -1 {} {} + cp1254 81 replace \uFFFD -1 {} {} + cp1254 81 strict {} 0 {} {} + cp1254 8D tcl8 \U0000008D -1 {} {} + cp1254 8D replace \uFFFD -1 {} {} + cp1254 8D strict {} 0 {} {} + cp1254 8E tcl8 \U0000008E -1 {} {} + cp1254 8E replace \uFFFD -1 {} {} + cp1254 8E strict {} 0 {} {} + cp1254 8F tcl8 \U0000008F -1 {} {} + cp1254 8F replace \uFFFD -1 {} {} + cp1254 8F strict {} 0 {} {} + cp1254 90 tcl8 \U00000090 -1 {} {} + cp1254 90 replace \uFFFD -1 {} {} + cp1254 90 strict {} 0 {} {} + cp1254 9D tcl8 
\U0000009D -1 {} {} + cp1254 9D replace \uFFFD -1 {} {} + cp1254 9D strict {} 0 {} {} + cp1254 9E tcl8 \U0000009E -1 {} {} + cp1254 9E replace \uFFFD -1 {} {} + cp1254 9E strict {} 0 {} {} +}; # cp1254 + +# cp1254 - invalid byte sequences +lappend encUnencodableStrings {*}{ + cp1254 \U00000080 tcl8 1A -1 {} {} + cp1254 \U00000080 replace 1A -1 {} {} + cp1254 \U00000080 strict {} 0 {} {} + cp1254 \U00000400 tcl8 1A -1 {} {} + cp1254 \U00000400 replace 1A -1 {} {} + cp1254 \U00000400 strict {} 0 {} {} + cp1254 \U0000D800 tcl8 1A -1 {} {} + cp1254 \U0000D800 replace 1A -1 {} {} + cp1254 \U0000D800 strict {} 0 {} {} + cp1254 \U0000DC00 tcl8 1A -1 {} {} + cp1254 \U0000DC00 replace 1A -1 {} {} + cp1254 \U0000DC00 strict {} 0 {} {} + cp1254 \U00010000 tcl8 1A -1 {} {} + cp1254 \U00010000 replace 1A -1 {} {} + cp1254 \U00010000 strict {} 0 {} {} + cp1254 \U0010FFFF tcl8 1A -1 {} {} + cp1254 \U0010FFFF replace 1A -1 {} {} + cp1254 \U0010FFFF strict {} 0 {} {} +}; # cp1254 + +# +# cp1255 (generated from glibc-CP1255-2.1.2) + +test encoding-convertfrom-ucmCompare-cp1255 {Compare against ICU UCM} -body { + ucmConvertfromMismatches cp1255 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 
6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00D7 AA 00F7 BA 0192 83 02C6 88 02DC 98 05B0 C0 05B1 C1 05B2 C2 05B3 C3 05B4 C4 05B5 C5 05B6 C6 05B7 C7 05B8 C8 05B9 C9 05BB CB 05BC CC 05BD CD 05BE CE 05BF CF 05C0 D0 05C1 D1 05C2 D2 05C3 D3 05D0 E0 05D1 E1 05D2 E2 05D3 E3 05D4 E4 05D5 E5 05D6 E6 05D7 E7 05D8 E8 05D9 E9 05DA EA 05DB EB 05DC EC 05DD ED 05DE EE 05DF EF 05E0 F0 05E1 F1 05E2 F2 05E3 F3 05E4 F4 05E5 F5 05E6 F6 05E7 F7 05E8 F8 05E9 F9 05EA FA 05F0 D4 05F1 D5 05F2 D6 05F3 D7 05F4 D8 200E FD 200F FE 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AA A4 20AC 80 2122 99} +} -result {} + +test encoding-convertto-ucmCompare-cp1255 {Compare against ICU UCM} -body { + ucmConverttoMismatches cp1255 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 
6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00D7 AA 00F7 BA 0192 83 02C6 88 02DC 98 05B0 C0 05B1 C1 05B2 C2 05B3 C3 05B4 C4 05B5 C5 05B6 C6 05B7 C7 05B8 C8 05B9 C9 05BB CB 05BC CC 05BD CD 05BE CE 05BF CF 05C0 D0 05C1 D1 05C2 D2 05C3 D3 05D0 E0 05D1 E1 05D2 E2 05D3 E3 05D4 E4 05D5 E5 05D6 E6 05D7 E7 05D8 E8 05D9 E9 05DA EA 05DB EB 05DC EC 05DD ED 05DE EE 05DF EF 05E0 F0 05E1 F1 05E2 F2 05E3 F3 05E4 F4 05E5 F5 05E6 F6 05E7 F7 05E8 F8 05E9 F9 05EA FA 05F0 D4 05F1 D5 05F2 D6 05F3 D7 05F4 D8 200E FD 200F FE 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AA A4 20AC 80 2122 99} +} -result {} + +# cp1255 - invalid byte sequences +lappend encInvalidBytes {*}{ + cp1255 81 tcl8 \U00000081 -1 {} {} + cp1255 81 replace \uFFFD -1 {} {} + cp1255 81 strict {} 0 {} {} + cp1255 8A tcl8 \U0000008A -1 {} {} + cp1255 8A replace \uFFFD -1 {} {} + cp1255 8A strict {} 0 {} {} + cp1255 8C tcl8 \U0000008C -1 {} {} + cp1255 8C replace \uFFFD -1 {} {} + cp1255 8C strict {} 0 {} {} + cp1255 8D tcl8 \U0000008D -1 {} {} + cp1255 8D replace \uFFFD -1 {} {} + cp1255 8D strict {} 0 {} {} + cp1255 8E tcl8 \U0000008E -1 {} {} + cp1255 8E replace \uFFFD -1 {} {} + cp1255 8E strict {} 0 {} {} + cp1255 8F tcl8 \U0000008F -1 {} {} + cp1255 8F replace \uFFFD -1 {} {} + cp1255 8F strict {} 0 {} {} + cp1255 90 tcl8 \U00000090 -1 {} {} + cp1255 90 replace \uFFFD -1 {} {} + cp1255 90 strict {} 0 {} {} + cp1255 9A tcl8 \U0000009A -1 {} {} + cp1255 9A replace \uFFFD -1 {} {} + cp1255 9A strict {} 0 {} {} + cp1255 9C tcl8 \U0000009C -1 {} {} + cp1255 9C replace \uFFFD -1 {} {} + cp1255 9C 
strict {} 0 {} {} + cp1255 9D tcl8 \U0000009D -1 {} {} + cp1255 9D replace \uFFFD -1 {} {} + cp1255 9D strict {} 0 {} {} + cp1255 9E tcl8 \U0000009E -1 {} {} + cp1255 9E replace \uFFFD -1 {} {} + cp1255 9E strict {} 0 {} {} + cp1255 9F tcl8 \U0000009F -1 {} {} + cp1255 9F replace \uFFFD -1 {} {} + cp1255 9F strict {} 0 {} {} + cp1255 CA tcl8 \U000000CA -1 {} {} + cp1255 CA replace \uFFFD -1 {} {} + cp1255 CA strict {} 0 {} {} + cp1255 D9 tcl8 \U000000D9 -1 {} {} + cp1255 D9 replace \uFFFD -1 {} {} + cp1255 D9 strict {} 0 {} {} + cp1255 DA tcl8 \U000000DA -1 {} {} + cp1255 DA replace \uFFFD -1 {} {} + cp1255 DA strict {} 0 {} {} + cp1255 DB tcl8 \U000000DB -1 {} {} + cp1255 DB replace \uFFFD -1 {} {} + cp1255 DB strict {} 0 {} {} + cp1255 DC tcl8 \U000000DC -1 {} {} + cp1255 DC replace \uFFFD -1 {} {} + cp1255 DC strict {} 0 {} {} + cp1255 DD tcl8 \U000000DD -1 {} {} + cp1255 DD replace \uFFFD -1 {} {} + cp1255 DD strict {} 0 {} {} + cp1255 DE tcl8 \U000000DE -1 {} {} + cp1255 DE replace \uFFFD -1 {} {} + cp1255 DE strict {} 0 {} {} + cp1255 DF tcl8 \U000000DF -1 {} {} + cp1255 DF replace \uFFFD -1 {} {} + cp1255 DF strict {} 0 {} {} + cp1255 FB tcl8 \U000000FB -1 {} {} + cp1255 FB replace \uFFFD -1 {} {} + cp1255 FB strict {} 0 {} {} + cp1255 FC tcl8 \U000000FC -1 {} {} + cp1255 FC replace \uFFFD -1 {} {} + cp1255 FC strict {} 0 {} {} + cp1255 FF tcl8 \U000000FF -1 {} {} + cp1255 FF replace \uFFFD -1 {} {} + cp1255 FF strict {} 0 {} {} +}; # cp1255 + +# cp1255 - invalid byte sequences +lappend encUnencodableStrings {*}{ + cp1255 \U00000080 tcl8 1A -1 {} {} + cp1255 \U00000080 replace 1A -1 {} {} + cp1255 \U00000080 strict {} 0 {} {} + cp1255 \U00000400 tcl8 1A -1 {} {} + cp1255 \U00000400 replace 1A -1 {} {} + cp1255 \U00000400 strict {} 0 {} {} + cp1255 \U0000D800 tcl8 1A -1 {} {} + cp1255 \U0000D800 replace 1A -1 {} {} + cp1255 \U0000D800 strict {} 0 {} {} + cp1255 \U0000DC00 tcl8 1A -1 {} {} + cp1255 \U0000DC00 replace 1A -1 {} {} + cp1255 \U0000DC00 strict {} 0 
{} {} + cp1255 \U00010000 tcl8 1A -1 {} {} + cp1255 \U00010000 replace 1A -1 {} {} + cp1255 \U00010000 strict {} 0 {} {} + cp1255 \U0010FFFF tcl8 1A -1 {} {} + cp1255 \U0010FFFF replace 1A -1 {} {} + cp1255 \U0010FFFF strict {} 0 {} {} +}; # cp1255 + +# +# cp1256 (generated from glibc-CP1256-2.1.2) + +test encoding-convertfrom-ucmCompare-cp1256 {Compare against ICU UCM} -body { + ucmConvertfromMismatches cp1256 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00D7 D7 00E0 E0 00E2 E2 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EE EE 00EF EF 00F4 F4 00F7 F7 00F9 F9 00FB FB 00FC FC 0152 8C 0153 9C 0192 83 02C6 88 060C A1 061B BA 061F BF 0621 C1 0622 C2 0623 C3 0624 C4 0625 C5 0626 C6 0627 C7 0628 C8 0629 C9 062A CA 062B CB 062C CC 062D CD 062E CE 062F CF 0630 D0 0631 D1 0632 D2 0633 D3 0634 D4 
0635 D5 0636 D6 0637 D8 0638 D9 0639 DA 063A DB 0640 DC 0641 DD 0642 DE 0643 DF 0644 E1 0645 E3 0646 E4 0647 E5 0648 E6 0649 EC 064A ED 064B F0 064C F1 064D F2 064E F3 064F F5 0650 F6 0651 F8 0652 FA 0679 8A 067E 81 0686 8D 0688 8F 0691 9A 0698 8E 06A9 98 06AF 90 06BA 9F 06BE AA 06C1 C0 06D2 FF 200C 9D 200D 9E 200E FD 200F FE 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +test encoding-convertto-ucmCompare-cp1256 {Compare against ICU UCM} -body { + ucmConverttoMismatches cp1256 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00D7 D7 00E0 E0 00E2 E2 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EE EE 00EF EF 00F4 F4 00F7 F7 00F9 F9 00FB FB 00FC FC 0152 8C 0153 9C 0192 83 02C6 88 060C A1 
061B BA 061F BF 0621 C1 0622 C2 0623 C3 0624 C4 0625 C5 0626 C6 0627 C7 0628 C8 0629 C9 062A CA 062B CB 062C CC 062D CD 062E CE 062F CF 0630 D0 0631 D1 0632 D2 0633 D3 0634 D4 0635 D5 0636 D6 0637 D8 0638 D9 0639 DA 063A DB 0640 DC 0641 DD 0642 DE 0643 DF 0644 E1 0645 E3 0646 E4 0647 E5 0648 E6 0649 EC 064A ED 064B F0 064C F1 064D F2 064E F3 064F F5 0650 F6 0651 F8 0652 FA 0679 8A 067E 81 0686 8D 0688 8F 0691 9A 0698 8E 06A9 98 06AF 90 06BA 9F 06BE AA 06C1 C0 06D2 FF 200C 9D 200D 9E 200E FD 200F FE 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +# cp1256 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # cp1256 + +# cp1256 - invalid byte sequences +lappend encUnencodableStrings {*}{ + cp1256 \U00000080 tcl8 1A -1 {} {} + cp1256 \U00000080 replace 1A -1 {} {} + cp1256 \U00000080 strict {} 0 {} {} + cp1256 \U00000400 tcl8 1A -1 {} {} + cp1256 \U00000400 replace 1A -1 {} {} + cp1256 \U00000400 strict {} 0 {} {} + cp1256 \U0000D800 tcl8 1A -1 {} {} + cp1256 \U0000D800 replace 1A -1 {} {} + cp1256 \U0000D800 strict {} 0 {} {} + cp1256 \U0000DC00 tcl8 1A -1 {} {} + cp1256 \U0000DC00 replace 1A -1 {} {} + cp1256 \U0000DC00 strict {} 0 {} {} + cp1256 \U00010000 tcl8 1A -1 {} {} + cp1256 \U00010000 replace 1A -1 {} {} + cp1256 \U00010000 strict {} 0 {} {} + cp1256 \U0010FFFF tcl8 1A -1 {} {} + cp1256 \U0010FFFF replace 1A -1 {} {} + cp1256 \U0010FFFF strict {} 0 {} {} +}; # cp1256 + +# +# cp1257 (generated from glibc-CP1257-2.1.2) + +test encoding-convertfrom-ucmCompare-cp1257 {Compare against ICU UCM} -body { + ucmConvertfromMismatches cp1257 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 
0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A2 A2 00A3 A3 00A4 A4 00A6 A6 00A7 A7 00A8 8D 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00AF 9D 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 8F 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00C4 C4 00C5 C5 00C6 AF 00C9 C9 00D3 D3 00D5 D5 00D6 D6 00D7 D7 00D8 A8 00DC DC 00DF DF 00E4 E4 00E5 E5 00E6 BF 00E9 E9 00F3 F3 00F5 F5 00F6 F6 00F7 F7 00F8 B8 00FC FC 0100 C2 0101 E2 0104 C0 0105 E0 0106 C3 0107 E3 010C C8 010D E8 0112 C7 0113 E7 0116 CB 0117 EB 0118 C6 0119 E6 0122 CC 0123 EC 012A CE 012B EE 012E C1 012F E1 0136 CD 0137 ED 013B CF 013C EF 0141 D9 0142 F9 0143 D1 0144 F1 0145 D2 0146 F2 014C D4 014D F4 0156 AA 0157 BA 015A DA 015B FA 0160 D0 0161 F0 016A DB 016B FB 0172 D8 0173 F8 0179 CA 017A EA 017B DD 017C FD 017D DE 017E FE 02C7 8E 02D9 FF 02DB 9E 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +test encoding-convertto-ucmCompare-cp1257 {Compare against ICU UCM} -body { + ucmConverttoMismatches cp1257 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 
001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A2 A2 00A3 A3 00A4 A4 00A6 A6 00A7 A7 00A8 8D 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00AF 9D 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 8F 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00C4 C4 00C5 C5 00C6 AF 00C9 C9 00D3 D3 00D5 D5 00D6 D6 00D7 D7 00D8 A8 00DC DC 00DF DF 00E4 E4 00E5 E5 00E6 BF 00E9 E9 00F3 F3 00F5 F5 00F6 F6 00F7 F7 00F8 B8 00FC FC 0100 C2 0101 E2 0104 C0 0105 E0 0106 C3 0107 E3 010C C8 010D E8 0112 C7 0113 E7 0116 CB 0117 EB 0118 C6 0119 E6 0122 CC 0123 EC 012A CE 012B EE 012E C1 012F E1 0136 CD 0137 ED 013B CF 013C EF 0141 D9 0142 F9 0143 D1 0144 F1 0145 D2 0146 F2 014C D4 014D F4 0156 AA 0157 BA 015A DA 015B FA 0160 D0 0161 F0 016A DB 016B FB 0172 D8 0173 F8 0179 CA 017A EA 017B DD 017C FD 017D DE 017E FE 02C7 8E 02D9 FF 02DB 9E 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AC 80 2122 99} +} -result {} + +# cp1257 - invalid byte sequences +lappend encInvalidBytes {*}{ + cp1257 81 tcl8 \U00000081 -1 {} {} + cp1257 81 replace \uFFFD -1 {} {} + cp1257 81 strict {} 0 {} {} + cp1257 83 tcl8 \U00000083 -1 {} {} + cp1257 83 replace \uFFFD -1 {} {} + cp1257 83 strict {} 0 {} {} 
+ cp1257 88 tcl8 \U00000088 -1 {} {} + cp1257 88 replace \uFFFD -1 {} {} + cp1257 88 strict {} 0 {} {} + cp1257 8A tcl8 \U0000008A -1 {} {} + cp1257 8A replace \uFFFD -1 {} {} + cp1257 8A strict {} 0 {} {} + cp1257 8C tcl8 \U0000008C -1 {} {} + cp1257 8C replace \uFFFD -1 {} {} + cp1257 8C strict {} 0 {} {} + cp1257 90 tcl8 \U00000090 -1 {} {} + cp1257 90 replace \uFFFD -1 {} {} + cp1257 90 strict {} 0 {} {} + cp1257 98 tcl8 \U00000098 -1 {} {} + cp1257 98 replace \uFFFD -1 {} {} + cp1257 98 strict {} 0 {} {} + cp1257 9A tcl8 \U0000009A -1 {} {} + cp1257 9A replace \uFFFD -1 {} {} + cp1257 9A strict {} 0 {} {} + cp1257 9C tcl8 \U0000009C -1 {} {} + cp1257 9C replace \uFFFD -1 {} {} + cp1257 9C strict {} 0 {} {} + cp1257 9F tcl8 \U0000009F -1 {} {} + cp1257 9F replace \uFFFD -1 {} {} + cp1257 9F strict {} 0 {} {} + cp1257 A1 tcl8 \U000000A1 -1 {} {} + cp1257 A1 replace \uFFFD -1 {} {} + cp1257 A1 strict {} 0 {} {} + cp1257 A5 tcl8 \U000000A5 -1 {} {} + cp1257 A5 replace \uFFFD -1 {} {} + cp1257 A5 strict {} 0 {} {} +}; # cp1257 + +# cp1257 - invalid byte sequences +lappend encUnencodableStrings {*}{ + cp1257 \U00000080 tcl8 1A -1 {} {} + cp1257 \U00000080 replace 1A -1 {} {} + cp1257 \U00000080 strict {} 0 {} {} + cp1257 \U00000400 tcl8 1A -1 {} {} + cp1257 \U00000400 replace 1A -1 {} {} + cp1257 \U00000400 strict {} 0 {} {} + cp1257 \U0000D800 tcl8 1A -1 {} {} + cp1257 \U0000D800 replace 1A -1 {} {} + cp1257 \U0000D800 strict {} 0 {} {} + cp1257 \U0000DC00 tcl8 1A -1 {} {} + cp1257 \U0000DC00 replace 1A -1 {} {} + cp1257 \U0000DC00 strict {} 0 {} {} + cp1257 \U00010000 tcl8 1A -1 {} {} + cp1257 \U00010000 replace 1A -1 {} {} + cp1257 \U00010000 strict {} 0 {} {} + cp1257 \U0010FFFF tcl8 1A -1 {} {} + cp1257 \U0010FFFF replace 1A -1 {} {} + cp1257 \U0010FFFF strict {} 0 {} {} +}; # cp1257 + +# +# cp1258 (generated from glibc-CP1258-2.1.2) + +test encoding-convertfrom-ucmCompare-cp1258 {Compare against ICU UCM} -body { + ucmConvertfromMismatches cp1258 {0000 00 0001 
01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CD CD 00CE CE 00CF CF 00D1 D1 00D3 D3 00D4 D4 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00ED ED 00EE EE 00EF EF 00F1 F1 00F3 F3 00F4 F4 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FF FF 0102 C3 0103 E3 0110 D0 0111 F0 0152 8C 0153 9C 0178 9F 0192 83 01A0 D5 01A1 F5 01AF DD 01B0 FD 02C6 88 02DC 98 0300 CC 0303 DE 0309 D2 0323 F2 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AB FE 20AC 80 2122 99} +} -result {} + +test 
encoding-convertto-ucmCompare-cp1258 {Compare against ICU UCM} -body { + ucmConverttoMismatches cp1258 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CD CD 00CE CE 00CF CF 00D1 D1 00D3 D3 00D4 D4 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00ED ED 00EE EE 00EF EF 00F1 F1 00F3 F3 00F4 F4 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FF FF 0102 C3 0103 E3 0110 D0 0111 F0 0152 8C 0153 9C 0178 9F 0192 83 01A0 D5 01A1 F5 01AF DD 01B0 FD 02C6 88 02DC 98 0300 CC 0303 DE 0309 D2 0323 F2 2013 96 2014 97 2018 91 2019 92 201A 82 201C 93 201D 94 201E 84 2020 86 
2021 87 2022 95 2026 85 2030 89 2039 8B 203A 9B 20AB FE 20AC 80 2122 99} +} -result {} + +# cp1258 - invalid byte sequences +lappend encInvalidBytes {*}{ + cp1258 81 tcl8 \U00000081 -1 {} {} + cp1258 81 replace \uFFFD -1 {} {} + cp1258 81 strict {} 0 {} {} + cp1258 8A tcl8 \U0000008A -1 {} {} + cp1258 8A replace \uFFFD -1 {} {} + cp1258 8A strict {} 0 {} {} + cp1258 8D tcl8 \U0000008D -1 {} {} + cp1258 8D replace \uFFFD -1 {} {} + cp1258 8D strict {} 0 {} {} + cp1258 8E tcl8 \U0000008E -1 {} {} + cp1258 8E replace \uFFFD -1 {} {} + cp1258 8E strict {} 0 {} {} + cp1258 8F tcl8 \U0000008F -1 {} {} + cp1258 8F replace \uFFFD -1 {} {} + cp1258 8F strict {} 0 {} {} + cp1258 90 tcl8 \U00000090 -1 {} {} + cp1258 90 replace \uFFFD -1 {} {} + cp1258 90 strict {} 0 {} {} + cp1258 9A tcl8 \U0000009A -1 {} {} + cp1258 9A replace \uFFFD -1 {} {} + cp1258 9A strict {} 0 {} {} + cp1258 9D tcl8 \U0000009D -1 {} {} + cp1258 9D replace \uFFFD -1 {} {} + cp1258 9D strict {} 0 {} {} + cp1258 9E tcl8 \U0000009E -1 {} {} + cp1258 9E replace \uFFFD -1 {} {} + cp1258 9E strict {} 0 {} {} + cp1258 EC tcl8 \U000000EC -1 {} {} + cp1258 EC replace \uFFFD -1 {} {} + cp1258 EC strict {} 0 {} {} +}; # cp1258 + +# cp1258 - invalid byte sequences +lappend encUnencodableStrings {*}{ + cp1258 \U00000080 tcl8 1A -1 {} {} + cp1258 \U00000080 replace 1A -1 {} {} + cp1258 \U00000080 strict {} 0 {} {} + cp1258 \U00000400 tcl8 1A -1 {} {} + cp1258 \U00000400 replace 1A -1 {} {} + cp1258 \U00000400 strict {} 0 {} {} + cp1258 \U0000D800 tcl8 1A -1 {} {} + cp1258 \U0000D800 replace 1A -1 {} {} + cp1258 \U0000D800 strict {} 0 {} {} + cp1258 \U0000DC00 tcl8 1A -1 {} {} + cp1258 \U0000DC00 replace 1A -1 {} {} + cp1258 \U0000DC00 strict {} 0 {} {} + cp1258 \U00010000 tcl8 1A -1 {} {} + cp1258 \U00010000 replace 1A -1 {} {} + cp1258 \U00010000 strict {} 0 {} {} + cp1258 \U0010FFFF tcl8 1A -1 {} {} + cp1258 \U0010FFFF replace 1A -1 {} {} + cp1258 \U0010FFFF strict {} 0 {} {} +}; # cp1258 + +# +# gb1988 (generated 
from glibc-GB_1988_80-2.3.3) + +test encoding-convertfrom-ucmCompare-gb1988 {Compare against ICU UCM} -body { + ucmConvertfromMismatches gb1988 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007F 7F 00A5 24 203E 7E} +} -result {} + +test encoding-convertto-ucmCompare-gb1988 {Compare against ICU UCM} -body { + ucmConverttoMismatches gb1988 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 
0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007F 7F 00A5 24 203E 7E} +} -result {} + +# gb1988 - invalid byte sequences +lappend encInvalidBytes {*}{ + gb1988 80 tcl8 \U00000080 -1 {} {} + gb1988 80 replace \uFFFD -1 {} {} + gb1988 80 strict {} 0 {} {} + gb1988 81 tcl8 \U00000081 -1 {} {} + gb1988 81 replace \uFFFD -1 {} {} + gb1988 81 strict {} 0 {} {} + gb1988 82 tcl8 \U00000082 -1 {} {} + gb1988 82 replace \uFFFD -1 {} {} + gb1988 82 strict {} 0 {} {} + gb1988 83 tcl8 \U00000083 -1 {} {} + gb1988 83 replace \uFFFD -1 {} {} + gb1988 83 strict {} 0 {} {} + gb1988 84 tcl8 \U00000084 -1 {} {} + gb1988 84 replace \uFFFD -1 {} {} + gb1988 84 strict {} 0 {} {} + gb1988 85 tcl8 \U00000085 -1 {} {} + gb1988 85 replace \uFFFD -1 {} {} + gb1988 85 strict {} 0 {} {} + gb1988 86 tcl8 \U00000086 -1 {} {} + gb1988 86 replace \uFFFD -1 {} {} + gb1988 86 strict {} 0 {} {} + gb1988 87 tcl8 \U00000087 -1 {} {} + gb1988 87 replace \uFFFD -1 {} {} + gb1988 87 strict {} 0 {} {} + gb1988 88 tcl8 \U00000088 -1 {} {} + gb1988 88 replace \uFFFD -1 {} {} + gb1988 88 strict {} 0 {} {} + gb1988 89 tcl8 \U00000089 -1 {} {} + gb1988 89 replace \uFFFD -1 {} {} + gb1988 89 strict {} 0 {} {} + gb1988 8A tcl8 \U0000008A -1 {} {} + gb1988 8A replace \uFFFD -1 {} {} + gb1988 8A strict {} 0 {} {} + gb1988 8B tcl8 \U0000008B -1 {} {} + gb1988 8B replace \uFFFD -1 {} {} + gb1988 8B strict {} 0 {} {} + gb1988 8C tcl8 \U0000008C -1 {} {} + gb1988 8C replace \uFFFD -1 {} {} + gb1988 8C strict {} 0 {} {} + gb1988 8D tcl8 \U0000008D -1 {} {} + gb1988 8D replace \uFFFD -1 {} {} + gb1988 8D strict {} 0 {} {} + gb1988 8E tcl8 \U0000008E -1 {} {} + gb1988 8E replace \uFFFD -1 {} {} + gb1988 8E strict {} 0 {} {} + gb1988 8F tcl8 \U0000008F -1 {} {} + gb1988 8F 
replace \uFFFD -1 {} {} + gb1988 8F strict {} 0 {} {} + gb1988 90 tcl8 \U00000090 -1 {} {} + gb1988 90 replace \uFFFD -1 {} {} + gb1988 90 strict {} 0 {} {} + gb1988 91 tcl8 \U00000091 -1 {} {} + gb1988 91 replace \uFFFD -1 {} {} + gb1988 91 strict {} 0 {} {} + gb1988 92 tcl8 \U00000092 -1 {} {} + gb1988 92 replace \uFFFD -1 {} {} + gb1988 92 strict {} 0 {} {} + gb1988 93 tcl8 \U00000093 -1 {} {} + gb1988 93 replace \uFFFD -1 {} {} + gb1988 93 strict {} 0 {} {} + gb1988 94 tcl8 \U00000094 -1 {} {} + gb1988 94 replace \uFFFD -1 {} {} + gb1988 94 strict {} 0 {} {} + gb1988 95 tcl8 \U00000095 -1 {} {} + gb1988 95 replace \uFFFD -1 {} {} + gb1988 95 strict {} 0 {} {} + gb1988 96 tcl8 \U00000096 -1 {} {} + gb1988 96 replace \uFFFD -1 {} {} + gb1988 96 strict {} 0 {} {} + gb1988 97 tcl8 \U00000097 -1 {} {} + gb1988 97 replace \uFFFD -1 {} {} + gb1988 97 strict {} 0 {} {} + gb1988 98 tcl8 \U00000098 -1 {} {} + gb1988 98 replace \uFFFD -1 {} {} + gb1988 98 strict {} 0 {} {} + gb1988 99 tcl8 \U00000099 -1 {} {} + gb1988 99 replace \uFFFD -1 {} {} + gb1988 99 strict {} 0 {} {} + gb1988 9A tcl8 \U0000009A -1 {} {} + gb1988 9A replace \uFFFD -1 {} {} + gb1988 9A strict {} 0 {} {} + gb1988 9B tcl8 \U0000009B -1 {} {} + gb1988 9B replace \uFFFD -1 {} {} + gb1988 9B strict {} 0 {} {} + gb1988 9C tcl8 \U0000009C -1 {} {} + gb1988 9C replace \uFFFD -1 {} {} + gb1988 9C strict {} 0 {} {} + gb1988 9D tcl8 \U0000009D -1 {} {} + gb1988 9D replace \uFFFD -1 {} {} + gb1988 9D strict {} 0 {} {} + gb1988 9E tcl8 \U0000009E -1 {} {} + gb1988 9E replace \uFFFD -1 {} {} + gb1988 9E strict {} 0 {} {} + gb1988 9F tcl8 \U0000009F -1 {} {} + gb1988 9F replace \uFFFD -1 {} {} + gb1988 9F strict {} 0 {} {} + gb1988 A0 tcl8 \U000000A0 -1 {} {} + gb1988 A0 replace \uFFFD -1 {} {} + gb1988 A0 strict {} 0 {} {} + gb1988 A1 tcl8 \U000000A1 -1 {} {} + gb1988 A1 replace \uFFFD -1 {} {} + gb1988 A1 strict {} 0 {} {} + gb1988 A2 tcl8 \U000000A2 -1 {} {} + gb1988 A2 replace \uFFFD -1 {} {} + gb1988 A2 strict 
{} 0 {} {} + gb1988 A3 tcl8 \U000000A3 -1 {} {} + gb1988 A3 replace \uFFFD -1 {} {} + gb1988 A3 strict {} 0 {} {} + gb1988 A4 tcl8 \U000000A4 -1 {} {} + gb1988 A4 replace \uFFFD -1 {} {} + gb1988 A4 strict {} 0 {} {} + gb1988 A5 tcl8 \U000000A5 -1 {} {} + gb1988 A5 replace \uFFFD -1 {} {} + gb1988 A5 strict {} 0 {} {} + gb1988 A6 tcl8 \U000000A6 -1 {} {} + gb1988 A6 replace \uFFFD -1 {} {} + gb1988 A6 strict {} 0 {} {} + gb1988 A7 tcl8 \U000000A7 -1 {} {} + gb1988 A7 replace \uFFFD -1 {} {} + gb1988 A7 strict {} 0 {} {} + gb1988 A8 tcl8 \U000000A8 -1 {} {} + gb1988 A8 replace \uFFFD -1 {} {} + gb1988 A8 strict {} 0 {} {} + gb1988 A9 tcl8 \U000000A9 -1 {} {} + gb1988 A9 replace \uFFFD -1 {} {} + gb1988 A9 strict {} 0 {} {} + gb1988 AA tcl8 \U000000AA -1 {} {} + gb1988 AA replace \uFFFD -1 {} {} + gb1988 AA strict {} 0 {} {} + gb1988 AB tcl8 \U000000AB -1 {} {} + gb1988 AB replace \uFFFD -1 {} {} + gb1988 AB strict {} 0 {} {} + gb1988 AC tcl8 \U000000AC -1 {} {} + gb1988 AC replace \uFFFD -1 {} {} + gb1988 AC strict {} 0 {} {} + gb1988 AD tcl8 \U000000AD -1 {} {} + gb1988 AD replace \uFFFD -1 {} {} + gb1988 AD strict {} 0 {} {} + gb1988 AE tcl8 \U000000AE -1 {} {} + gb1988 AE replace \uFFFD -1 {} {} + gb1988 AE strict {} 0 {} {} + gb1988 AF tcl8 \U000000AF -1 {} {} + gb1988 AF replace \uFFFD -1 {} {} + gb1988 AF strict {} 0 {} {} + gb1988 B0 tcl8 \U000000B0 -1 {} {} + gb1988 B0 replace \uFFFD -1 {} {} + gb1988 B0 strict {} 0 {} {} + gb1988 B1 tcl8 \U000000B1 -1 {} {} + gb1988 B1 replace \uFFFD -1 {} {} + gb1988 B1 strict {} 0 {} {} + gb1988 B2 tcl8 \U000000B2 -1 {} {} + gb1988 B2 replace \uFFFD -1 {} {} + gb1988 B2 strict {} 0 {} {} + gb1988 B3 tcl8 \U000000B3 -1 {} {} + gb1988 B3 replace \uFFFD -1 {} {} + gb1988 B3 strict {} 0 {} {} + gb1988 B4 tcl8 \U000000B4 -1 {} {} + gb1988 B4 replace \uFFFD -1 {} {} + gb1988 B4 strict {} 0 {} {} + gb1988 B5 tcl8 \U000000B5 -1 {} {} + gb1988 B5 replace \uFFFD -1 {} {} + gb1988 B5 strict {} 0 {} {} + gb1988 B6 tcl8 \U000000B6 -1 
{} {} + gb1988 B6 replace \uFFFD -1 {} {} + gb1988 B6 strict {} 0 {} {} + gb1988 B7 tcl8 \U000000B7 -1 {} {} + gb1988 B7 replace \uFFFD -1 {} {} + gb1988 B7 strict {} 0 {} {} + gb1988 B8 tcl8 \U000000B8 -1 {} {} + gb1988 B8 replace \uFFFD -1 {} {} + gb1988 B8 strict {} 0 {} {} + gb1988 B9 tcl8 \U000000B9 -1 {} {} + gb1988 B9 replace \uFFFD -1 {} {} + gb1988 B9 strict {} 0 {} {} + gb1988 BA tcl8 \U000000BA -1 {} {} + gb1988 BA replace \uFFFD -1 {} {} + gb1988 BA strict {} 0 {} {} + gb1988 BB tcl8 \U000000BB -1 {} {} + gb1988 BB replace \uFFFD -1 {} {} + gb1988 BB strict {} 0 {} {} + gb1988 BC tcl8 \U000000BC -1 {} {} + gb1988 BC replace \uFFFD -1 {} {} + gb1988 BC strict {} 0 {} {} + gb1988 BD tcl8 \U000000BD -1 {} {} + gb1988 BD replace \uFFFD -1 {} {} + gb1988 BD strict {} 0 {} {} + gb1988 BE tcl8 \U000000BE -1 {} {} + gb1988 BE replace \uFFFD -1 {} {} + gb1988 BE strict {} 0 {} {} + gb1988 BF tcl8 \U000000BF -1 {} {} + gb1988 BF replace \uFFFD -1 {} {} + gb1988 BF strict {} 0 {} {} + gb1988 C0 tcl8 \U000000C0 -1 {} {} + gb1988 C0 replace \uFFFD -1 {} {} + gb1988 C0 strict {} 0 {} {} + gb1988 C1 tcl8 \U000000C1 -1 {} {} + gb1988 C1 replace \uFFFD -1 {} {} + gb1988 C1 strict {} 0 {} {} + gb1988 C2 tcl8 \U000000C2 -1 {} {} + gb1988 C2 replace \uFFFD -1 {} {} + gb1988 C2 strict {} 0 {} {} + gb1988 C3 tcl8 \U000000C3 -1 {} {} + gb1988 C3 replace \uFFFD -1 {} {} + gb1988 C3 strict {} 0 {} {} + gb1988 C4 tcl8 \U000000C4 -1 {} {} + gb1988 C4 replace \uFFFD -1 {} {} + gb1988 C4 strict {} 0 {} {} + gb1988 C5 tcl8 \U000000C5 -1 {} {} + gb1988 C5 replace \uFFFD -1 {} {} + gb1988 C5 strict {} 0 {} {} + gb1988 C6 tcl8 \U000000C6 -1 {} {} + gb1988 C6 replace \uFFFD -1 {} {} + gb1988 C6 strict {} 0 {} {} + gb1988 C7 tcl8 \U000000C7 -1 {} {} + gb1988 C7 replace \uFFFD -1 {} {} + gb1988 C7 strict {} 0 {} {} + gb1988 C8 tcl8 \U000000C8 -1 {} {} + gb1988 C8 replace \uFFFD -1 {} {} + gb1988 C8 strict {} 0 {} {} + gb1988 C9 tcl8 \U000000C9 -1 {} {} + gb1988 C9 replace \uFFFD -1 {} {} 
+ gb1988 C9 strict {} 0 {} {} + gb1988 CA tcl8 \U000000CA -1 {} {} + gb1988 CA replace \uFFFD -1 {} {} + gb1988 CA strict {} 0 {} {} + gb1988 CB tcl8 \U000000CB -1 {} {} + gb1988 CB replace \uFFFD -1 {} {} + gb1988 CB strict {} 0 {} {} + gb1988 CC tcl8 \U000000CC -1 {} {} + gb1988 CC replace \uFFFD -1 {} {} + gb1988 CC strict {} 0 {} {} + gb1988 CD tcl8 \U000000CD -1 {} {} + gb1988 CD replace \uFFFD -1 {} {} + gb1988 CD strict {} 0 {} {} + gb1988 CE tcl8 \U000000CE -1 {} {} + gb1988 CE replace \uFFFD -1 {} {} + gb1988 CE strict {} 0 {} {} + gb1988 CF tcl8 \U000000CF -1 {} {} + gb1988 CF replace \uFFFD -1 {} {} + gb1988 CF strict {} 0 {} {} + gb1988 D0 tcl8 \U000000D0 -1 {} {} + gb1988 D0 replace \uFFFD -1 {} {} + gb1988 D0 strict {} 0 {} {} + gb1988 D1 tcl8 \U000000D1 -1 {} {} + gb1988 D1 replace \uFFFD -1 {} {} + gb1988 D1 strict {} 0 {} {} + gb1988 D2 tcl8 \U000000D2 -1 {} {} + gb1988 D2 replace \uFFFD -1 {} {} + gb1988 D2 strict {} 0 {} {} + gb1988 D3 tcl8 \U000000D3 -1 {} {} + gb1988 D3 replace \uFFFD -1 {} {} + gb1988 D3 strict {} 0 {} {} + gb1988 D4 tcl8 \U000000D4 -1 {} {} + gb1988 D4 replace \uFFFD -1 {} {} + gb1988 D4 strict {} 0 {} {} + gb1988 D5 tcl8 \U000000D5 -1 {} {} + gb1988 D5 replace \uFFFD -1 {} {} + gb1988 D5 strict {} 0 {} {} + gb1988 D6 tcl8 \U000000D6 -1 {} {} + gb1988 D6 replace \uFFFD -1 {} {} + gb1988 D6 strict {} 0 {} {} + gb1988 D7 tcl8 \U000000D7 -1 {} {} + gb1988 D7 replace \uFFFD -1 {} {} + gb1988 D7 strict {} 0 {} {} + gb1988 D8 tcl8 \U000000D8 -1 {} {} + gb1988 D8 replace \uFFFD -1 {} {} + gb1988 D8 strict {} 0 {} {} + gb1988 D9 tcl8 \U000000D9 -1 {} {} + gb1988 D9 replace \uFFFD -1 {} {} + gb1988 D9 strict {} 0 {} {} + gb1988 DA tcl8 \U000000DA -1 {} {} + gb1988 DA replace \uFFFD -1 {} {} + gb1988 DA strict {} 0 {} {} + gb1988 DB tcl8 \U000000DB -1 {} {} + gb1988 DB replace \uFFFD -1 {} {} + gb1988 DB strict {} 0 {} {} + gb1988 DC tcl8 \U000000DC -1 {} {} + gb1988 DC replace \uFFFD -1 {} {} + gb1988 DC strict {} 0 {} {} + gb1988 DD 
tcl8 \U000000DD -1 {} {} + gb1988 DD replace \uFFFD -1 {} {} + gb1988 DD strict {} 0 {} {} + gb1988 DE tcl8 \U000000DE -1 {} {} + gb1988 DE replace \uFFFD -1 {} {} + gb1988 DE strict {} 0 {} {} + gb1988 DF tcl8 \U000000DF -1 {} {} + gb1988 DF replace \uFFFD -1 {} {} + gb1988 DF strict {} 0 {} {} + gb1988 E0 tcl8 \U000000E0 -1 {} {} + gb1988 E0 replace \uFFFD -1 {} {} + gb1988 E0 strict {} 0 {} {} + gb1988 E1 tcl8 \U000000E1 -1 {} {} + gb1988 E1 replace \uFFFD -1 {} {} + gb1988 E1 strict {} 0 {} {} + gb1988 E2 tcl8 \U000000E2 -1 {} {} + gb1988 E2 replace \uFFFD -1 {} {} + gb1988 E2 strict {} 0 {} {} + gb1988 E3 tcl8 \U000000E3 -1 {} {} + gb1988 E3 replace \uFFFD -1 {} {} + gb1988 E3 strict {} 0 {} {} + gb1988 E4 tcl8 \U000000E4 -1 {} {} + gb1988 E4 replace \uFFFD -1 {} {} + gb1988 E4 strict {} 0 {} {} + gb1988 E5 tcl8 \U000000E5 -1 {} {} + gb1988 E5 replace \uFFFD -1 {} {} + gb1988 E5 strict {} 0 {} {} + gb1988 E6 tcl8 \U000000E6 -1 {} {} + gb1988 E6 replace \uFFFD -1 {} {} + gb1988 E6 strict {} 0 {} {} + gb1988 E7 tcl8 \U000000E7 -1 {} {} + gb1988 E7 replace \uFFFD -1 {} {} + gb1988 E7 strict {} 0 {} {} + gb1988 E8 tcl8 \U000000E8 -1 {} {} + gb1988 E8 replace \uFFFD -1 {} {} + gb1988 E8 strict {} 0 {} {} + gb1988 E9 tcl8 \U000000E9 -1 {} {} + gb1988 E9 replace \uFFFD -1 {} {} + gb1988 E9 strict {} 0 {} {} + gb1988 EA tcl8 \U000000EA -1 {} {} + gb1988 EA replace \uFFFD -1 {} {} + gb1988 EA strict {} 0 {} {} + gb1988 EB tcl8 \U000000EB -1 {} {} + gb1988 EB replace \uFFFD -1 {} {} + gb1988 EB strict {} 0 {} {} + gb1988 EC tcl8 \U000000EC -1 {} {} + gb1988 EC replace \uFFFD -1 {} {} + gb1988 EC strict {} 0 {} {} + gb1988 ED tcl8 \U000000ED -1 {} {} + gb1988 ED replace \uFFFD -1 {} {} + gb1988 ED strict {} 0 {} {} + gb1988 EE tcl8 \U000000EE -1 {} {} + gb1988 EE replace \uFFFD -1 {} {} + gb1988 EE strict {} 0 {} {} + gb1988 EF tcl8 \U000000EF -1 {} {} + gb1988 EF replace \uFFFD -1 {} {} + gb1988 EF strict {} 0 {} {} + gb1988 F0 tcl8 \U000000F0 -1 {} {} + gb1988 F0 
replace \uFFFD -1 {} {} + gb1988 F0 strict {} 0 {} {} + gb1988 F1 tcl8 \U000000F1 -1 {} {} + gb1988 F1 replace \uFFFD -1 {} {} + gb1988 F1 strict {} 0 {} {} + gb1988 F2 tcl8 \U000000F2 -1 {} {} + gb1988 F2 replace \uFFFD -1 {} {} + gb1988 F2 strict {} 0 {} {} + gb1988 F3 tcl8 \U000000F3 -1 {} {} + gb1988 F3 replace \uFFFD -1 {} {} + gb1988 F3 strict {} 0 {} {} + gb1988 F4 tcl8 \U000000F4 -1 {} {} + gb1988 F4 replace \uFFFD -1 {} {} + gb1988 F4 strict {} 0 {} {} + gb1988 F5 tcl8 \U000000F5 -1 {} {} + gb1988 F5 replace \uFFFD -1 {} {} + gb1988 F5 strict {} 0 {} {} + gb1988 F6 tcl8 \U000000F6 -1 {} {} + gb1988 F6 replace \uFFFD -1 {} {} + gb1988 F6 strict {} 0 {} {} + gb1988 F7 tcl8 \U000000F7 -1 {} {} + gb1988 F7 replace \uFFFD -1 {} {} + gb1988 F7 strict {} 0 {} {} + gb1988 F8 tcl8 \U000000F8 -1 {} {} + gb1988 F8 replace \uFFFD -1 {} {} + gb1988 F8 strict {} 0 {} {} + gb1988 F9 tcl8 \U000000F9 -1 {} {} + gb1988 F9 replace \uFFFD -1 {} {} + gb1988 F9 strict {} 0 {} {} + gb1988 FA tcl8 \U000000FA -1 {} {} + gb1988 FA replace \uFFFD -1 {} {} + gb1988 FA strict {} 0 {} {} + gb1988 FB tcl8 \U000000FB -1 {} {} + gb1988 FB replace \uFFFD -1 {} {} + gb1988 FB strict {} 0 {} {} + gb1988 FC tcl8 \U000000FC -1 {} {} + gb1988 FC replace \uFFFD -1 {} {} + gb1988 FC strict {} 0 {} {} + gb1988 FD tcl8 \U000000FD -1 {} {} + gb1988 FD replace \uFFFD -1 {} {} + gb1988 FD strict {} 0 {} {} + gb1988 FE tcl8 \U000000FE -1 {} {} + gb1988 FE replace \uFFFD -1 {} {} + gb1988 FE strict {} 0 {} {} + gb1988 FF tcl8 \U000000FF -1 {} {} + gb1988 FF replace \uFFFD -1 {} {} + gb1988 FF strict {} 0 {} {} +}; # gb1988 + +# gb1988 - invalid byte sequences +lappend encUnencodableStrings {*}{ + gb1988 \U00000024 tcl8 1A -1 {} {} + gb1988 \U00000024 replace 1A -1 {} {} + gb1988 \U00000024 strict {} 0 {} {} + gb1988 \U00000400 tcl8 1A -1 {} {} + gb1988 \U00000400 replace 1A -1 {} {} + gb1988 \U00000400 strict {} 0 {} {} + gb1988 \U0000D800 tcl8 1A -1 {} {} + gb1988 \U0000D800 replace 1A -1 {} {} + 
gb1988 \U0000D800 strict {} 0 {} {} + gb1988 \U0000DC00 tcl8 1A -1 {} {} + gb1988 \U0000DC00 replace 1A -1 {} {} + gb1988 \U0000DC00 strict {} 0 {} {} + gb1988 \U00010000 tcl8 1A -1 {} {} + gb1988 \U00010000 replace 1A -1 {} {} + gb1988 \U00010000 strict {} 0 {} {} + gb1988 \U0010FFFF tcl8 1A -1 {} {} + gb1988 \U0010FFFF replace 1A -1 {} {} + gb1988 \U0010FFFF strict {} 0 {} {} +}; # gb1988 + +# +# iso8859-1 (generated from glibc-ISO_8859_1-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-1 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-1 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 
00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D0 D0 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DD DD 00DE DE 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F0 F0 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FD FD 00FE FE 00FF FF} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-1 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-1 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 
009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D0 D0 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DD DD 00DE DE 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F0 F0 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FD FD 00FE FE 00FF FF} +} -result {} + +# iso8859-1 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-1 + +# iso8859-1 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-1 \U00000400 tcl8 1A -1 {} {} + iso8859-1 \U00000400 replace 1A -1 {} {} + iso8859-1 \U00000400 strict {} 0 {} {} + iso8859-1 \U0000D800 tcl8 1A -1 {} {} + iso8859-1 \U0000D800 replace 1A -1 {} {} + iso8859-1 \U0000D800 strict {} 0 {} {} + iso8859-1 \U0000DC00 tcl8 1A -1 {} {} + iso8859-1 \U0000DC00 replace 1A -1 {} {} + iso8859-1 \U0000DC00 strict {} 0 {} {} + iso8859-1 \U00010000 tcl8 1A -1 {} {} + iso8859-1 \U00010000 replace 1A -1 {} {} + iso8859-1 \U00010000 strict {} 0 {} {} + iso8859-1 \U0010FFFF tcl8 1A -1 {} {} + iso8859-1 \U0010FFFF replace 1A -1 {} {} + iso8859-1 \U0010FFFF strict {} 0 {} {} +}; # iso8859-1 + +# +# iso8859-2 (generated from glibc-ISO_8859_2-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-2 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-2 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 
18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A4 A4 00A7 A7 00A8 A8 00AD AD 00B0 B0 00B4 B4 00B8 B8 00C1 C1 00C2 C2 00C4 C4 00C7 C7 00C9 C9 00CB CB 00CD CD 00CE CE 00D3 D3 00D4 D4 00D6 D6 00D7 D7 00DA DA 00DC DC 00DD DD 00DF DF 00E1 E1 00E2 E2 00E4 E4 00E7 E7 00E9 E9 00EB EB 00ED ED 00EE EE 00F3 F3 00F4 F4 00F6 F6 00F7 F7 00FA FA 00FC FC 00FD FD 0102 C3 0103 E3 0104 A1 0105 B1 0106 C6 0107 E6 010C C8 010D E8 010E CF 010F EF 0110 D0 0111 F0 0118 CA 0119 EA 011A CC 011B EC 0139 C5 013A E5 013D A5 013E B5 0141 A3 0142 B3 0143 D1 0144 F1 0147 D2 0148 F2 0150 D5 0151 F5 0154 C0 0155 E0 0158 D8 0159 F8 015A A6 015B B6 015E AA 015F BA 0160 A9 0161 B9 0162 DE 0163 FE 0164 AB 0165 BB 016E D9 016F F9 0170 DB 0171 FB 0179 AC 017A BC 017B AF 017C BF 017D AE 017E BE 02C7 B7 02D8 A2 02D9 FF 02DB B2 02DD BD} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-2 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-2 {0000 00 0001 01 
0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A4 A4 00A7 A7 00A8 A8 00AD AD 00B0 B0 00B4 B4 00B8 B8 00C1 C1 00C2 C2 00C4 C4 00C7 C7 00C9 C9 00CB CB 00CD CD 00CE CE 00D3 D3 00D4 D4 00D6 D6 00D7 D7 00DA DA 00DC DC 00DD DD 00DF DF 00E1 E1 00E2 E2 00E4 E4 00E7 E7 00E9 E9 00EB EB 00ED ED 00EE EE 00F3 F3 00F4 F4 00F6 F6 00F7 F7 00FA FA 00FC FC 00FD FD 0102 C3 0103 E3 0104 A1 0105 B1 0106 C6 0107 E6 010C C8 010D E8 010E CF 010F EF 0110 D0 0111 F0 0118 CA 0119 EA 011A CC 011B EC 0139 C5 013A E5 013D A5 013E B5 0141 A3 0142 B3 0143 D1 0144 F1 0147 D2 0148 F2 0150 D5 0151 F5 0154 C0 0155 E0 0158 D8 0159 F8 015A A6 015B B6 015E AA 015F BA 0160 A9 0161 B9 0162 DE 0163 FE 0164 AB 0165 BB 016E D9 016F F9 0170 DB 0171 FB 0179 AC 017A BC 017B AF 017C BF 017D AE 017E BE 02C7 B7 
02D8 A2 02D9 FF 02DB B2 02DD BD} +} -result {} + +# iso8859-2 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-2 + +# iso8859-2 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-2 \U000000A1 tcl8 1A -1 {} {} + iso8859-2 \U000000A1 replace 1A -1 {} {} + iso8859-2 \U000000A1 strict {} 0 {} {} + iso8859-2 \U00000400 tcl8 1A -1 {} {} + iso8859-2 \U00000400 replace 1A -1 {} {} + iso8859-2 \U00000400 strict {} 0 {} {} + iso8859-2 \U0000D800 tcl8 1A -1 {} {} + iso8859-2 \U0000D800 replace 1A -1 {} {} + iso8859-2 \U0000D800 strict {} 0 {} {} + iso8859-2 \U0000DC00 tcl8 1A -1 {} {} + iso8859-2 \U0000DC00 replace 1A -1 {} {} + iso8859-2 \U0000DC00 strict {} 0 {} {} + iso8859-2 \U00010000 tcl8 1A -1 {} {} + iso8859-2 \U00010000 replace 1A -1 {} {} + iso8859-2 \U00010000 strict {} 0 {} {} + iso8859-2 \U0010FFFF tcl8 1A -1 {} {} + iso8859-2 \U0010FFFF replace 1A -1 {} {} + iso8859-2 \U0010FFFF strict {} 0 {} {} +}; # iso8859-2 + +# +# iso8859-3 (generated from glibc-ISO_8859_3-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-3 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-3 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 
006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A3 A3 00A4 A4 00A7 A7 00A8 A8 00AD AD 00B0 B0 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B7 B7 00B8 B8 00BD BD 00C0 C0 00C1 C1 00C2 C2 00C4 C4 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D6 D6 00D7 D7 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E4 E4 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F6 F6 00F7 F7 00F9 F9 00FA FA 00FB FB 00FC FC 0108 C6 0109 E6 010A C5 010B E5 011C D8 011D F8 011E AB 011F BB 0120 D5 0121 F5 0124 A6 0125 B6 0126 A1 0127 B1 0130 A9 0131 B9 0134 AC 0135 BC 015C DE 015D FE 015E AA 015F BA 016C DD 016D FD 017B AF 017C BF 02D8 A2 02D9 FF} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-3 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-3 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 
005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A3 A3 00A4 A4 00A7 A7 00A8 A8 00AD AD 00B0 B0 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B7 B7 00B8 B8 00BD BD 00C0 C0 00C1 C1 00C2 C2 00C4 C4 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D6 D6 00D7 D7 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E4 E4 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F6 F6 00F7 F7 00F9 F9 00FA FA 00FB FB 00FC FC 0108 C6 0109 E6 010A C5 010B E5 011C D8 011D F8 011E AB 011F BB 0120 D5 0121 F5 0124 A6 0125 B6 0126 A1 0127 B1 0130 A9 0131 B9 0134 AC 0135 BC 015C DE 015D FE 015E AA 015F BA 016C DD 016D FD 017B AF 017C BF 02D8 A2 02D9 FF} +} -result {} + +# iso8859-3 - invalid byte sequences +lappend encInvalidBytes {*}{ + iso8859-3 A5 tcl8 \U000000A5 -1 {} {} + iso8859-3 A5 replace \uFFFD -1 {} {} + iso8859-3 A5 strict {} 0 {} {} + iso8859-3 AE tcl8 \U000000AE -1 {} {} + iso8859-3 AE replace \uFFFD -1 {} {} + iso8859-3 AE strict {} 0 {} {} + iso8859-3 BE tcl8 \U000000BE -1 {} {} + iso8859-3 BE replace \uFFFD -1 {} {} + iso8859-3 BE strict {} 0 {} {} + iso8859-3 C3 tcl8 \U000000C3 -1 {} {} + iso8859-3 C3 replace \uFFFD -1 {} {} + iso8859-3 C3 strict {} 0 {} {} + iso8859-3 D0 tcl8 \U000000D0 -1 {} {} + iso8859-3 D0 replace \uFFFD -1 {} {} + iso8859-3 D0 strict {} 0 {} {} + iso8859-3 E3 tcl8 \U000000E3 -1 {} {} + iso8859-3 E3 replace \uFFFD -1 {} {} + 
iso8859-3 E3 strict {} 0 {} {} + iso8859-3 F0 tcl8 \U000000F0 -1 {} {} + iso8859-3 F0 replace \uFFFD -1 {} {} + iso8859-3 F0 strict {} 0 {} {} +}; # iso8859-3 + +# iso8859-3 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-3 \U000000A1 tcl8 1A -1 {} {} + iso8859-3 \U000000A1 replace 1A -1 {} {} + iso8859-3 \U000000A1 strict {} 0 {} {} + iso8859-3 \U00000400 tcl8 1A -1 {} {} + iso8859-3 \U00000400 replace 1A -1 {} {} + iso8859-3 \U00000400 strict {} 0 {} {} + iso8859-3 \U0000D800 tcl8 1A -1 {} {} + iso8859-3 \U0000D800 replace 1A -1 {} {} + iso8859-3 \U0000D800 strict {} 0 {} {} + iso8859-3 \U0000DC00 tcl8 1A -1 {} {} + iso8859-3 \U0000DC00 replace 1A -1 {} {} + iso8859-3 \U0000DC00 strict {} 0 {} {} + iso8859-3 \U00010000 tcl8 1A -1 {} {} + iso8859-3 \U00010000 replace 1A -1 {} {} + iso8859-3 \U00010000 strict {} 0 {} {} + iso8859-3 \U0010FFFF tcl8 1A -1 {} {} + iso8859-3 \U0010FFFF replace 1A -1 {} {} + iso8859-3 \U0010FFFF strict {} 0 {} {} +}; # iso8859-3 + +# +# iso8859-4 (generated from glibc-ISO_8859_4-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-4 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-4 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 
0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A4 A4 00A7 A7 00A8 A8 00AD AD 00AF AF 00B0 B0 00B4 B4 00B8 B8 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C9 C9 00CB CB 00CD CD 00CE CE 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00DA DA 00DB DB 00DC DC 00DF DF 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E9 E9 00EB EB 00ED ED 00EE EE 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00FA FA 00FB FB 00FC FC 0100 C0 0101 E0 0104 A1 0105 B1 010C C8 010D E8 0110 D0 0111 F0 0112 AA 0113 BA 0116 CC 0117 EC 0118 CA 0119 EA 0122 AB 0123 BB 0128 A5 0129 B5 012A CF 012B EF 012E C7 012F E7 0136 D3 0137 F3 0138 A2 013B A6 013C B6 0145 D1 0146 F1 014A BD 014B BF 014C D2 014D F2 0156 A3 0157 B3 0160 A9 0161 B9 0166 AC 0167 BC 0168 DD 0169 FD 016A DE 016B FE 0172 D9 0173 F9 017D AE 017E BE 02C7 B7 02D9 FF 02DB B2} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-4 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-4 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 
0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A4 A4 00A7 A7 00A8 A8 00AD AD 00AF AF 00B0 B0 00B4 B4 00B8 B8 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C9 C9 00CB CB 00CD CD 00CE CE 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00DA DA 00DB DB 00DC DC 00DF DF 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E9 E9 00EB EB 00ED ED 00EE EE 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00FA FA 00FB FB 00FC FC 0100 C0 0101 E0 0104 A1 0105 B1 010C C8 010D E8 0110 D0 0111 F0 0112 AA 0113 BA 0116 CC 0117 EC 0118 CA 0119 EA 0122 AB 0123 BB 0128 A5 0129 B5 012A CF 012B EF 012E C7 012F E7 0136 D3 0137 F3 0138 A2 013B A6 013C B6 0145 D1 0146 F1 014A BD 014B BF 014C D2 014D F2 0156 A3 0157 B3 0160 A9 0161 B9 0166 AC 0167 BC 0168 DD 0169 FD 016A DE 016B FE 0172 D9 0173 F9 017D AE 017E BE 02C7 B7 02D9 FF 02DB B2} +} -result {} + +# iso8859-4 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-4 + +# iso8859-4 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-4 \U000000A1 tcl8 1A -1 {} {} + iso8859-4 \U000000A1 replace 1A -1 {} {} + iso8859-4 \U000000A1 strict {} 0 {} {} + iso8859-4 \U00000400 tcl8 1A -1 {} {} + iso8859-4 \U00000400 replace 1A -1 {} {} + iso8859-4 \U00000400 strict {} 0 {} {} + iso8859-4 \U0000D800 tcl8 1A -1 {} {} + iso8859-4 \U0000D800 replace 1A -1 {} {} + iso8859-4 \U0000D800 strict {} 0 {} {} + iso8859-4 \U0000DC00 tcl8 1A -1 {} {} + 
iso8859-4 \U0000DC00 replace 1A -1 {} {} + iso8859-4 \U0000DC00 strict {} 0 {} {} + iso8859-4 \U00010000 tcl8 1A -1 {} {} + iso8859-4 \U00010000 replace 1A -1 {} {} + iso8859-4 \U00010000 strict {} 0 {} {} + iso8859-4 \U0010FFFF tcl8 1A -1 {} {} + iso8859-4 \U0010FFFF replace 1A -1 {} {} + iso8859-4 \U0010FFFF strict {} 0 {} {} +}; # iso8859-4 + +# +# iso8859-5 (generated from glibc-ISO_8859_5-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-5 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-5 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A7 FD 00AD AD 0401 A1 0402 A2 0403 A3 0404 A4 0405 A5 0406 A6 0407 A7 0408 A8 0409 A9 040A AA 040B AB 040C AC 040E AE 040F AF 0410 B0 0411 B1 0412 B2 0413 B3 0414 B4 0415 B5 0416 B6 
0417 B7 0418 B8 0419 B9 041A BA 041B BB 041C BC 041D BD 041E BE 041F BF 0420 C0 0421 C1 0422 C2 0423 C3 0424 C4 0425 C5 0426 C6 0427 C7 0428 C8 0429 C9 042A CA 042B CB 042C CC 042D CD 042E CE 042F CF 0430 D0 0431 D1 0432 D2 0433 D3 0434 D4 0435 D5 0436 D6 0437 D7 0438 D8 0439 D9 043A DA 043B DB 043C DC 043D DD 043E DE 043F DF 0440 E0 0441 E1 0442 E2 0443 E3 0444 E4 0445 E5 0446 E6 0447 E7 0448 E8 0449 E9 044A EA 044B EB 044C EC 044D ED 044E EE 044F EF 0451 F1 0452 F2 0453 F3 0454 F4 0455 F5 0456 F6 0457 F7 0458 F8 0459 F9 045A FA 045B FB 045C FC 045E FE 045F FF 2116 F0} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-5 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-5 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 
00A7 FD 00AD AD 0401 A1 0402 A2 0403 A3 0404 A4 0405 A5 0406 A6 0407 A7 0408 A8 0409 A9 040A AA 040B AB 040C AC 040E AE 040F AF 0410 B0 0411 B1 0412 B2 0413 B3 0414 B4 0415 B5 0416 B6 0417 B7 0418 B8 0419 B9 041A BA 041B BB 041C BC 041D BD 041E BE 041F BF 0420 C0 0421 C1 0422 C2 0423 C3 0424 C4 0425 C5 0426 C6 0427 C7 0428 C8 0429 C9 042A CA 042B CB 042C CC 042D CD 042E CE 042F CF 0430 D0 0431 D1 0432 D2 0433 D3 0434 D4 0435 D5 0436 D6 0437 D7 0438 D8 0439 D9 043A DA 043B DB 043C DC 043D DD 043E DE 043F DF 0440 E0 0441 E1 0442 E2 0443 E3 0444 E4 0445 E5 0446 E6 0447 E7 0448 E8 0449 E9 044A EA 044B EB 044C EC 044D ED 044E EE 044F EF 0451 F1 0452 F2 0453 F3 0454 F4 0455 F5 0456 F6 0457 F7 0458 F8 0459 F9 045A FA 045B FB 045C FC 045E FE 045F FF 2116 F0} +} -result {} + +# iso8859-5 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-5 + +# iso8859-5 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-5 \U000000A1 tcl8 1A -1 {} {} + iso8859-5 \U000000A1 replace 1A -1 {} {} + iso8859-5 \U000000A1 strict {} 0 {} {} + iso8859-5 \U00000400 tcl8 1A -1 {} {} + iso8859-5 \U00000400 replace 1A -1 {} {} + iso8859-5 \U00000400 strict {} 0 {} {} + iso8859-5 \U0000D800 tcl8 1A -1 {} {} + iso8859-5 \U0000D800 replace 1A -1 {} {} + iso8859-5 \U0000D800 strict {} 0 {} {} + iso8859-5 \U0000DC00 tcl8 1A -1 {} {} + iso8859-5 \U0000DC00 replace 1A -1 {} {} + iso8859-5 \U0000DC00 strict {} 0 {} {} + iso8859-5 \U00010000 tcl8 1A -1 {} {} + iso8859-5 \U00010000 replace 1A -1 {} {} + iso8859-5 \U00010000 strict {} 0 {} {} + iso8859-5 \U0010FFFF tcl8 1A -1 {} {} + iso8859-5 \U0010FFFF replace 1A -1 {} {} + iso8859-5 \U0010FFFF strict {} 0 {} {} +}; # iso8859-5 + +# +# iso8859-6 (generated from glibc-ISO_8859_6-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-6 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-6 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 
000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A4 A4 00AD AD 060C AC 061B BB 061F BF 0621 C1 0622 C2 0623 C3 0624 C4 0625 C5 0626 C6 0627 C7 0628 C8 0629 C9 062A CA 062B CB 062C CC 062D CD 062E CE 062F CF 0630 D0 0631 D1 0632 D2 0633 D3 0634 D4 0635 D5 0636 D6 0637 D7 0638 D8 0639 D9 063A DA 0640 E0 0641 E1 0642 E2 0643 E3 0644 E4 0645 E5 0646 E6 0647 E7 0648 E8 0649 E9 064A EA 064B EB 064C EC 064D ED 064E EE 064F EF 0650 F0 0651 F1 0652 F2} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-6 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-6 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 
0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A4 A4 00AD AD 060C AC 061B BB 061F BF 0621 C1 0622 C2 0623 C3 0624 C4 0625 C5 0626 C6 0627 C7 0628 C8 0629 C9 062A CA 062B CB 062C CC 062D CD 062E CE 062F CF 0630 D0 0631 D1 0632 D2 0633 D3 0634 D4 0635 D5 0636 D6 0637 D7 0638 D8 0639 D9 063A DA 0640 E0 0641 E1 0642 E2 0643 E3 0644 E4 0645 E5 0646 E6 0647 E7 0648 E8 0649 E9 064A EA 064B EB 064C EC 064D ED 064E EE 064F EF 0650 F0 0651 F1 0652 F2} +} -result {} + +# iso8859-6 - invalid byte sequences +lappend encInvalidBytes {*}{ + iso8859-6 A1 tcl8 \U000000A1 -1 {} {} + iso8859-6 A1 replace \uFFFD -1 {} {} + iso8859-6 A1 strict {} 0 {} {} + iso8859-6 A2 tcl8 \U000000A2 -1 {} {} + iso8859-6 A2 replace \uFFFD -1 {} {} + iso8859-6 A2 strict {} 0 {} {} + iso8859-6 A3 tcl8 \U000000A3 -1 {} {} + iso8859-6 A3 replace \uFFFD -1 {} {} + iso8859-6 A3 strict {} 0 {} {} + iso8859-6 A5 tcl8 \U000000A5 -1 {} {} + iso8859-6 A5 replace \uFFFD -1 {} {} + iso8859-6 A5 strict {} 0 {} {} + iso8859-6 A6 tcl8 \U000000A6 -1 {} {} + iso8859-6 A6 replace \uFFFD -1 
{} {} + iso8859-6 A6 strict {} 0 {} {} + iso8859-6 A7 tcl8 \U000000A7 -1 {} {} + iso8859-6 A7 replace \uFFFD -1 {} {} + iso8859-6 A7 strict {} 0 {} {} + iso8859-6 A8 tcl8 \U000000A8 -1 {} {} + iso8859-6 A8 replace \uFFFD -1 {} {} + iso8859-6 A8 strict {} 0 {} {} + iso8859-6 A9 tcl8 \U000000A9 -1 {} {} + iso8859-6 A9 replace \uFFFD -1 {} {} + iso8859-6 A9 strict {} 0 {} {} + iso8859-6 AA tcl8 \U000000AA -1 {} {} + iso8859-6 AA replace \uFFFD -1 {} {} + iso8859-6 AA strict {} 0 {} {} + iso8859-6 AB tcl8 \U000000AB -1 {} {} + iso8859-6 AB replace \uFFFD -1 {} {} + iso8859-6 AB strict {} 0 {} {} + iso8859-6 AE tcl8 \U000000AE -1 {} {} + iso8859-6 AE replace \uFFFD -1 {} {} + iso8859-6 AE strict {} 0 {} {} + iso8859-6 AF tcl8 \U000000AF -1 {} {} + iso8859-6 AF replace \uFFFD -1 {} {} + iso8859-6 AF strict {} 0 {} {} + iso8859-6 B0 tcl8 \U000000B0 -1 {} {} + iso8859-6 B0 replace \uFFFD -1 {} {} + iso8859-6 B0 strict {} 0 {} {} + iso8859-6 B1 tcl8 \U000000B1 -1 {} {} + iso8859-6 B1 replace \uFFFD -1 {} {} + iso8859-6 B1 strict {} 0 {} {} + iso8859-6 B2 tcl8 \U000000B2 -1 {} {} + iso8859-6 B2 replace \uFFFD -1 {} {} + iso8859-6 B2 strict {} 0 {} {} + iso8859-6 B3 tcl8 \U000000B3 -1 {} {} + iso8859-6 B3 replace \uFFFD -1 {} {} + iso8859-6 B3 strict {} 0 {} {} + iso8859-6 B4 tcl8 \U000000B4 -1 {} {} + iso8859-6 B4 replace \uFFFD -1 {} {} + iso8859-6 B4 strict {} 0 {} {} + iso8859-6 B5 tcl8 \U000000B5 -1 {} {} + iso8859-6 B5 replace \uFFFD -1 {} {} + iso8859-6 B5 strict {} 0 {} {} + iso8859-6 B6 tcl8 \U000000B6 -1 {} {} + iso8859-6 B6 replace \uFFFD -1 {} {} + iso8859-6 B6 strict {} 0 {} {} + iso8859-6 B7 tcl8 \U000000B7 -1 {} {} + iso8859-6 B7 replace \uFFFD -1 {} {} + iso8859-6 B7 strict {} 0 {} {} + iso8859-6 B8 tcl8 \U000000B8 -1 {} {} + iso8859-6 B8 replace \uFFFD -1 {} {} + iso8859-6 B8 strict {} 0 {} {} + iso8859-6 B9 tcl8 \U000000B9 -1 {} {} + iso8859-6 B9 replace \uFFFD -1 {} {} + iso8859-6 B9 strict {} 0 {} {} + iso8859-6 BA tcl8 \U000000BA -1 {} {} + iso8859-6 BA 
replace \uFFFD -1 {} {} + iso8859-6 BA strict {} 0 {} {} + iso8859-6 BC tcl8 \U000000BC -1 {} {} + iso8859-6 BC replace \uFFFD -1 {} {} + iso8859-6 BC strict {} 0 {} {} + iso8859-6 BD tcl8 \U000000BD -1 {} {} + iso8859-6 BD replace \uFFFD -1 {} {} + iso8859-6 BD strict {} 0 {} {} + iso8859-6 BE tcl8 \U000000BE -1 {} {} + iso8859-6 BE replace \uFFFD -1 {} {} + iso8859-6 BE strict {} 0 {} {} + iso8859-6 C0 tcl8 \U000000C0 -1 {} {} + iso8859-6 C0 replace \uFFFD -1 {} {} + iso8859-6 C0 strict {} 0 {} {} + iso8859-6 DB tcl8 \U000000DB -1 {} {} + iso8859-6 DB replace \uFFFD -1 {} {} + iso8859-6 DB strict {} 0 {} {} + iso8859-6 DC tcl8 \U000000DC -1 {} {} + iso8859-6 DC replace \uFFFD -1 {} {} + iso8859-6 DC strict {} 0 {} {} + iso8859-6 DD tcl8 \U000000DD -1 {} {} + iso8859-6 DD replace \uFFFD -1 {} {} + iso8859-6 DD strict {} 0 {} {} + iso8859-6 DE tcl8 \U000000DE -1 {} {} + iso8859-6 DE replace \uFFFD -1 {} {} + iso8859-6 DE strict {} 0 {} {} + iso8859-6 DF tcl8 \U000000DF -1 {} {} + iso8859-6 DF replace \uFFFD -1 {} {} + iso8859-6 DF strict {} 0 {} {} + iso8859-6 F3 tcl8 \U000000F3 -1 {} {} + iso8859-6 F3 replace \uFFFD -1 {} {} + iso8859-6 F3 strict {} 0 {} {} + iso8859-6 F4 tcl8 \U000000F4 -1 {} {} + iso8859-6 F4 replace \uFFFD -1 {} {} + iso8859-6 F4 strict {} 0 {} {} + iso8859-6 F5 tcl8 \U000000F5 -1 {} {} + iso8859-6 F5 replace \uFFFD -1 {} {} + iso8859-6 F5 strict {} 0 {} {} + iso8859-6 F6 tcl8 \U000000F6 -1 {} {} + iso8859-6 F6 replace \uFFFD -1 {} {} + iso8859-6 F6 strict {} 0 {} {} + iso8859-6 F7 tcl8 \U000000F7 -1 {} {} + iso8859-6 F7 replace \uFFFD -1 {} {} + iso8859-6 F7 strict {} 0 {} {} + iso8859-6 F8 tcl8 \U000000F8 -1 {} {} + iso8859-6 F8 replace \uFFFD -1 {} {} + iso8859-6 F8 strict {} 0 {} {} + iso8859-6 F9 tcl8 \U000000F9 -1 {} {} + iso8859-6 F9 replace \uFFFD -1 {} {} + iso8859-6 F9 strict {} 0 {} {} + iso8859-6 FA tcl8 \U000000FA -1 {} {} + iso8859-6 FA replace \uFFFD -1 {} {} + iso8859-6 FA strict {} 0 {} {} + iso8859-6 FB tcl8 \U000000FB -1 {} 
{} + iso8859-6 FB replace \uFFFD -1 {} {} + iso8859-6 FB strict {} 0 {} {} + iso8859-6 FC tcl8 \U000000FC -1 {} {} + iso8859-6 FC replace \uFFFD -1 {} {} + iso8859-6 FC strict {} 0 {} {} + iso8859-6 FD tcl8 \U000000FD -1 {} {} + iso8859-6 FD replace \uFFFD -1 {} {} + iso8859-6 FD strict {} 0 {} {} + iso8859-6 FE tcl8 \U000000FE -1 {} {} + iso8859-6 FE replace \uFFFD -1 {} {} + iso8859-6 FE strict {} 0 {} {} + iso8859-6 FF tcl8 \U000000FF -1 {} {} + iso8859-6 FF replace \uFFFD -1 {} {} + iso8859-6 FF strict {} 0 {} {} +}; # iso8859-6 + +# iso8859-6 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-6 \U000000A1 tcl8 1A -1 {} {} + iso8859-6 \U000000A1 replace 1A -1 {} {} + iso8859-6 \U000000A1 strict {} 0 {} {} + iso8859-6 \U00000400 tcl8 1A -1 {} {} + iso8859-6 \U00000400 replace 1A -1 {} {} + iso8859-6 \U00000400 strict {} 0 {} {} + iso8859-6 \U0000D800 tcl8 1A -1 {} {} + iso8859-6 \U0000D800 replace 1A -1 {} {} + iso8859-6 \U0000D800 strict {} 0 {} {} + iso8859-6 \U0000DC00 tcl8 1A -1 {} {} + iso8859-6 \U0000DC00 replace 1A -1 {} {} + iso8859-6 \U0000DC00 strict {} 0 {} {} + iso8859-6 \U00010000 tcl8 1A -1 {} {} + iso8859-6 \U00010000 replace 1A -1 {} {} + iso8859-6 \U00010000 strict {} 0 {} {} + iso8859-6 \U0010FFFF tcl8 1A -1 {} {} + iso8859-6 \U0010FFFF replace 1A -1 {} {} + iso8859-6 \U0010FFFF strict {} 0 {} {} +}; # iso8859-6 + +# +# iso8859-7 (generated from glibc-ISO_8859_7-2.3.3) + +test encoding-convertfrom-ucmCompare-iso8859-7 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-7 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 
37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A3 A3 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B7 B7 00BB BB 00BD BD 037A AA 0384 B4 0385 B5 0386 B6 0388 B8 0389 B9 038A BA 038C BC 038E BE 038F BF 0390 C0 0391 C1 0392 C2 0393 C3 0394 C4 0395 C5 0396 C6 0397 C7 0398 C8 0399 C9 039A CA 039B CB 039C CC 039D CD 039E CE 039F CF 03A0 D0 03A1 D1 03A3 D3 03A4 D4 03A5 D5 03A6 D6 03A7 D7 03A8 D8 03A9 D9 03AA DA 03AB DB 03AC DC 03AD DD 03AE DE 03AF DF 03B0 E0 03B1 E1 03B2 E2 03B3 E3 03B4 E4 03B5 E5 03B6 E6 03B7 E7 03B8 E8 03B9 E9 03BA EA 03BB EB 03BC EC 03BD ED 03BE EE 03BF EF 03C0 F0 03C1 F1 03C2 F2 03C3 F3 03C4 F4 03C5 F5 03C6 F6 03C7 F7 03C8 F8 03C9 F9 03CA FA 03CB FB 03CC FC 03CD FD 03CE FE 2015 AF 2018 A1 2019 A2 20AC A4 20AF A5} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-7 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-7 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 
0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A3 A3 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B7 B7 00BB BB 00BD BD 037A AA 0384 B4 0385 B5 0386 B6 0388 B8 0389 B9 038A BA 038C BC 038E BE 038F BF 0390 C0 0391 C1 0392 C2 0393 C3 0394 C4 0395 C5 0396 C6 0397 C7 0398 C8 0399 C9 039A CA 039B CB 039C CC 039D CD 039E CE 039F CF 03A0 D0 03A1 D1 03A3 D3 03A4 D4 03A5 D5 03A6 D6 03A7 D7 03A8 D8 03A9 D9 03AA DA 03AB DB 03AC DC 03AD DD 03AE DE 03AF DF 03B0 E0 03B1 E1 03B2 E2 03B3 E3 03B4 E4 03B5 E5 03B6 E6 03B7 E7 03B8 E8 03B9 E9 03BA EA 03BB EB 03BC EC 03BD ED 03BE EE 03BF EF 03C0 F0 03C1 F1 03C2 F2 03C3 F3 03C4 F4 03C5 F5 03C6 F6 03C7 F7 03C8 F8 03C9 F9 03CA FA 03CB FB 03CC FC 03CD FD 03CE FE 2015 AF 2018 A1 2019 A2 20AC A4 20AF A5} +} -result {} + +# iso8859-7 - invalid byte sequences +lappend encInvalidBytes {*}{ + iso8859-7 AE tcl8 \U000000AE -1 {} {} + iso8859-7 AE replace \uFFFD -1 {} {} + iso8859-7 AE strict {} 0 {} {} + iso8859-7 D2 tcl8 \U000000D2 -1 {} {} + iso8859-7 D2 replace 
\uFFFD -1 {} {} + iso8859-7 D2 strict {} 0 {} {} + iso8859-7 FF tcl8 \U000000FF -1 {} {} + iso8859-7 FF replace \uFFFD -1 {} {} + iso8859-7 FF strict {} 0 {} {} +}; # iso8859-7 + +# iso8859-7 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-7 \U000000A1 tcl8 1A -1 {} {} + iso8859-7 \U000000A1 replace 1A -1 {} {} + iso8859-7 \U000000A1 strict {} 0 {} {} + iso8859-7 \U00000400 tcl8 1A -1 {} {} + iso8859-7 \U00000400 replace 1A -1 {} {} + iso8859-7 \U00000400 strict {} 0 {} {} + iso8859-7 \U0000D800 tcl8 1A -1 {} {} + iso8859-7 \U0000D800 replace 1A -1 {} {} + iso8859-7 \U0000D800 strict {} 0 {} {} + iso8859-7 \U0000DC00 tcl8 1A -1 {} {} + iso8859-7 \U0000DC00 replace 1A -1 {} {} + iso8859-7 \U0000DC00 strict {} 0 {} {} + iso8859-7 \U00010000 tcl8 1A -1 {} {} + iso8859-7 \U00010000 replace 1A -1 {} {} + iso8859-7 \U00010000 strict {} 0 {} {} + iso8859-7 \U0010FFFF tcl8 1A -1 {} {} + iso8859-7 \U0010FFFF replace 1A -1 {} {} + iso8859-7 \U0010FFFF strict {} 0 {} {} +}; # iso8859-7 + +# +# iso8859-8 (generated from glibc-ISO_8859_8-2.3.3) + +test encoding-convertfrom-ucmCompare-iso8859-8 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-8 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 
64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00D7 AA 00F7 BA 05D0 E0 05D1 E1 05D2 E2 05D3 E3 05D4 E4 05D5 E5 05D6 E6 05D7 E7 05D8 E8 05D9 E9 05DA EA 05DB EB 05DC EC 05DD ED 05DE EE 05DF EF 05E0 F0 05E1 F1 05E2 F2 05E3 F3 05E4 F4 05E5 F5 05E6 F6 05E7 F7 05E8 F8 05E9 F9 05EA FA 200E FD 200F FE 2017 DF} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-8 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-8 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 
0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00D7 AA 00F7 BA 05D0 E0 05D1 E1 05D2 E2 05D3 E3 05D4 E4 05D5 E5 05D6 E6 05D7 E7 05D8 E8 05D9 E9 05DA EA 05DB EB 05DC EC 05DD ED 05DE EE 05DF EF 05E0 F0 05E1 F1 05E2 F2 05E3 F3 05E4 F4 05E5 F5 05E6 F6 05E7 F7 05E8 F8 05E9 F9 05EA FA 200E FD 200F FE 2017 DF} +} -result {} + +# iso8859-8 - invalid byte sequences +lappend encInvalidBytes {*}{ + iso8859-8 A1 tcl8 \U000000A1 -1 {} {} + iso8859-8 A1 replace \uFFFD -1 {} {} + iso8859-8 A1 strict {} 0 {} {} + iso8859-8 BF tcl8 \U000000BF -1 {} {} + iso8859-8 BF replace \uFFFD -1 {} {} + iso8859-8 BF strict {} 0 {} {} + iso8859-8 C0 tcl8 \U000000C0 -1 {} {} + iso8859-8 C0 replace \uFFFD -1 {} {} + iso8859-8 C0 strict {} 0 {} {} + iso8859-8 C1 tcl8 \U000000C1 -1 {} {} + iso8859-8 C1 replace \uFFFD -1 {} {} + iso8859-8 C1 strict {} 0 {} {} + iso8859-8 C2 tcl8 \U000000C2 -1 {} {} + iso8859-8 C2 replace \uFFFD -1 {} {} + iso8859-8 C2 strict {} 0 {} {} + iso8859-8 C3 tcl8 \U000000C3 -1 {} {} + iso8859-8 C3 replace \uFFFD -1 {} {} + iso8859-8 C3 strict {} 0 {} {} + iso8859-8 C4 tcl8 \U000000C4 -1 {} {} + iso8859-8 C4 replace \uFFFD -1 {} {} + iso8859-8 C4 strict {} 0 {} {} + iso8859-8 C5 tcl8 \U000000C5 -1 {} {} + iso8859-8 C5 replace \uFFFD -1 {} {} + iso8859-8 C5 strict {} 0 {} {} + iso8859-8 C6 tcl8 \U000000C6 -1 {} {} + iso8859-8 C6 replace \uFFFD -1 {} {} + iso8859-8 C6 strict {} 0 {} {} + iso8859-8 C7 tcl8 \U000000C7 -1 {} {} + iso8859-8 C7 
replace \uFFFD -1 {} {} + iso8859-8 C7 strict {} 0 {} {} + iso8859-8 C8 tcl8 \U000000C8 -1 {} {} + iso8859-8 C8 replace \uFFFD -1 {} {} + iso8859-8 C8 strict {} 0 {} {} + iso8859-8 C9 tcl8 \U000000C9 -1 {} {} + iso8859-8 C9 replace \uFFFD -1 {} {} + iso8859-8 C9 strict {} 0 {} {} + iso8859-8 CA tcl8 \U000000CA -1 {} {} + iso8859-8 CA replace \uFFFD -1 {} {} + iso8859-8 CA strict {} 0 {} {} + iso8859-8 CB tcl8 \U000000CB -1 {} {} + iso8859-8 CB replace \uFFFD -1 {} {} + iso8859-8 CB strict {} 0 {} {} + iso8859-8 CC tcl8 \U000000CC -1 {} {} + iso8859-8 CC replace \uFFFD -1 {} {} + iso8859-8 CC strict {} 0 {} {} + iso8859-8 CD tcl8 \U000000CD -1 {} {} + iso8859-8 CD replace \uFFFD -1 {} {} + iso8859-8 CD strict {} 0 {} {} + iso8859-8 CE tcl8 \U000000CE -1 {} {} + iso8859-8 CE replace \uFFFD -1 {} {} + iso8859-8 CE strict {} 0 {} {} + iso8859-8 CF tcl8 \U000000CF -1 {} {} + iso8859-8 CF replace \uFFFD -1 {} {} + iso8859-8 CF strict {} 0 {} {} + iso8859-8 D0 tcl8 \U000000D0 -1 {} {} + iso8859-8 D0 replace \uFFFD -1 {} {} + iso8859-8 D0 strict {} 0 {} {} + iso8859-8 D1 tcl8 \U000000D1 -1 {} {} + iso8859-8 D1 replace \uFFFD -1 {} {} + iso8859-8 D1 strict {} 0 {} {} + iso8859-8 D2 tcl8 \U000000D2 -1 {} {} + iso8859-8 D2 replace \uFFFD -1 {} {} + iso8859-8 D2 strict {} 0 {} {} + iso8859-8 D3 tcl8 \U000000D3 -1 {} {} + iso8859-8 D3 replace \uFFFD -1 {} {} + iso8859-8 D3 strict {} 0 {} {} + iso8859-8 D4 tcl8 \U000000D4 -1 {} {} + iso8859-8 D4 replace \uFFFD -1 {} {} + iso8859-8 D4 strict {} 0 {} {} + iso8859-8 D5 tcl8 \U000000D5 -1 {} {} + iso8859-8 D5 replace \uFFFD -1 {} {} + iso8859-8 D5 strict {} 0 {} {} + iso8859-8 D6 tcl8 \U000000D6 -1 {} {} + iso8859-8 D6 replace \uFFFD -1 {} {} + iso8859-8 D6 strict {} 0 {} {} + iso8859-8 D7 tcl8 \U000000D7 -1 {} {} + iso8859-8 D7 replace \uFFFD -1 {} {} + iso8859-8 D7 strict {} 0 {} {} + iso8859-8 D8 tcl8 \U000000D8 -1 {} {} + iso8859-8 D8 replace \uFFFD -1 {} {} + iso8859-8 D8 strict {} 0 {} {} + iso8859-8 D9 tcl8 \U000000D9 -1 {} 
{} + iso8859-8 D9 replace \uFFFD -1 {} {} + iso8859-8 D9 strict {} 0 {} {} + iso8859-8 DA tcl8 \U000000DA -1 {} {} + iso8859-8 DA replace \uFFFD -1 {} {} + iso8859-8 DA strict {} 0 {} {} + iso8859-8 DB tcl8 \U000000DB -1 {} {} + iso8859-8 DB replace \uFFFD -1 {} {} + iso8859-8 DB strict {} 0 {} {} + iso8859-8 DC tcl8 \U000000DC -1 {} {} + iso8859-8 DC replace \uFFFD -1 {} {} + iso8859-8 DC strict {} 0 {} {} + iso8859-8 DD tcl8 \U000000DD -1 {} {} + iso8859-8 DD replace \uFFFD -1 {} {} + iso8859-8 DD strict {} 0 {} {} + iso8859-8 DE tcl8 \U000000DE -1 {} {} + iso8859-8 DE replace \uFFFD -1 {} {} + iso8859-8 DE strict {} 0 {} {} + iso8859-8 FB tcl8 \U000000FB -1 {} {} + iso8859-8 FB replace \uFFFD -1 {} {} + iso8859-8 FB strict {} 0 {} {} + iso8859-8 FC tcl8 \U000000FC -1 {} {} + iso8859-8 FC replace \uFFFD -1 {} {} + iso8859-8 FC strict {} 0 {} {} + iso8859-8 FF tcl8 \U000000FF -1 {} {} + iso8859-8 FF replace \uFFFD -1 {} {} + iso8859-8 FF strict {} 0 {} {} +}; # iso8859-8 + +# iso8859-8 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-8 \U000000A1 tcl8 1A -1 {} {} + iso8859-8 \U000000A1 replace 1A -1 {} {} + iso8859-8 \U000000A1 strict {} 0 {} {} + iso8859-8 \U00000400 tcl8 1A -1 {} {} + iso8859-8 \U00000400 replace 1A -1 {} {} + iso8859-8 \U00000400 strict {} 0 {} {} + iso8859-8 \U0000D800 tcl8 1A -1 {} {} + iso8859-8 \U0000D800 replace 1A -1 {} {} + iso8859-8 \U0000D800 strict {} 0 {} {} + iso8859-8 \U0000DC00 tcl8 1A -1 {} {} + iso8859-8 \U0000DC00 replace 1A -1 {} {} + iso8859-8 \U0000DC00 strict {} 0 {} {} + iso8859-8 \U00010000 tcl8 1A -1 {} {} + iso8859-8 \U00010000 replace 1A -1 {} {} + iso8859-8 \U00010000 strict {} 0 {} {} + iso8859-8 \U0010FFFF tcl8 1A -1 {} {} + iso8859-8 \U0010FFFF replace 1A -1 {} {} + iso8859-8 \U0010FFFF strict {} 0 {} {} +}; # iso8859-8 + +# +# iso8859-9 (generated from glibc-ISO_8859_9-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-9 {Compare against ICU UCM} -body { + ucmConvertfromMismatches 
iso8859-9 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC 
FC 00FF FF 011E D0 011F F0 0130 DD 0131 FD 015E DE 015F FE} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-9 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-9 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A4 A4 00A5 A5 00A6 A6 00A7 A7 00A8 A8 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B4 B4 00B5 B5 00B6 B6 00B7 B7 00B8 B8 00B9 B9 00BA BA 00BB BB 00BC BC 00BD BD 00BE BE 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 
00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FF FF 011E D0 011F F0 0130 DD 0131 FD 015E DE 015F FE} +} -result {} + +# iso8859-9 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-9 + +# iso8859-9 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-9 \U000000D0 tcl8 1A -1 {} {} + iso8859-9 \U000000D0 replace 1A -1 {} {} + iso8859-9 \U000000D0 strict {} 0 {} {} + iso8859-9 \U00000400 tcl8 1A -1 {} {} + iso8859-9 \U00000400 replace 1A -1 {} {} + iso8859-9 \U00000400 strict {} 0 {} {} + iso8859-9 \U0000D800 tcl8 1A -1 {} {} + iso8859-9 \U0000D800 replace 1A -1 {} {} + iso8859-9 \U0000D800 strict {} 0 {} {} + iso8859-9 \U0000DC00 tcl8 1A -1 {} {} + iso8859-9 \U0000DC00 replace 1A -1 {} {} + iso8859-9 \U0000DC00 strict {} 0 {} {} + iso8859-9 \U00010000 tcl8 1A -1 {} {} + iso8859-9 \U00010000 replace 1A -1 {} {} + iso8859-9 \U00010000 strict {} 0 {} {} + iso8859-9 \U0010FFFF tcl8 1A -1 {} {} + iso8859-9 \U0010FFFF replace 1A -1 {} {} + iso8859-9 \U0010FFFF strict {} 0 {} {} +}; # iso8859-9 + +# +# iso8859-10 (generated from glibc-ISO_8859_10-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-10 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-10 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 
4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A7 A7 00AD AD 00B0 B0 00B7 B7 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C9 C9 00CB CB 00CD CD 00CE CE 00CF CF 00D0 D0 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D8 D8 00DA DA 00DB DB 00DC DC 00DD DD 00DE DE 00DF DF 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E9 E9 00EB EB 00ED ED 00EE EE 00EF EF 00F0 F0 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F8 F8 00FA FA 00FB FB 00FC FC 00FD FD 00FE FE 0100 C0 0101 E0 0104 A1 0105 B1 010C C8 010D E8 0110 A9 0111 B9 0112 A2 0113 B2 0116 CC 0117 EC 0118 CA 0119 EA 0122 A3 0123 B3 0128 A5 0129 B5 012A A4 012B B4 012E C7 012F E7 0136 A6 0137 B6 0138 FF 013B A8 013C B8 0145 D1 0146 F1 014A AF 014B BF 014C D2 014D F2 0160 AA 0161 BA 0166 AB 0167 BB 0168 D7 0169 F7 016A AE 016B BE 0172 D9 0173 F9 017D AC 017E BC 2015 BD} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-10 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-10 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 
38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A7 A7 00AD AD 00B0 B0 00B7 B7 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C9 C9 00CB CB 00CD CD 00CE CE 00CF CF 00D0 D0 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D8 D8 00DA DA 00DB DB 00DC DC 00DD DD 00DE DE 00DF DF 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E9 E9 00EB EB 00ED ED 00EE EE 00EF EF 00F0 F0 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F8 F8 00FA FA 00FB FB 00FC FC 00FD FD 00FE FE 0100 C0 0101 E0 0104 A1 0105 B1 010C C8 010D E8 0110 A9 0111 B9 0112 A2 0113 B2 0116 CC 0117 EC 0118 CA 0119 EA 0122 A3 0123 B3 0128 A5 0129 B5 012A A4 012B B4 012E C7 012F E7 0136 A6 0137 B6 0138 FF 013B A8 013C B8 0145 D1 0146 F1 014A AF 014B BF 014C D2 014D F2 0160 AA 0161 BA 0166 AB 0167 BB 0168 D7 0169 F7 016A AE 016B BE 0172 D9 0173 F9 017D AC 017E BC 2015 BD} +} -result {} + +# iso8859-10 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-10 + +# iso8859-10 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-10 \U000000A1 tcl8 1A -1 {} {} + iso8859-10 \U000000A1 replace 1A -1 {} {} + iso8859-10 \U000000A1 strict {} 0 {} {} + iso8859-10 \U00000400 tcl8 1A -1 {} {} + iso8859-10 \U00000400 replace 1A -1 {} {} + iso8859-10 
\U00000400 strict {} 0 {} {} + iso8859-10 \U0000D800 tcl8 1A -1 {} {} + iso8859-10 \U0000D800 replace 1A -1 {} {} + iso8859-10 \U0000D800 strict {} 0 {} {} + iso8859-10 \U0000DC00 tcl8 1A -1 {} {} + iso8859-10 \U0000DC00 replace 1A -1 {} {} + iso8859-10 \U0000DC00 strict {} 0 {} {} + iso8859-10 \U00010000 tcl8 1A -1 {} {} + iso8859-10 \U00010000 replace 1A -1 {} {} + iso8859-10 \U00010000 strict {} 0 {} {} + iso8859-10 \U0010FFFF tcl8 1A -1 {} {} + iso8859-10 \U0010FFFF replace 1A -1 {} {} + iso8859-10 \U0010FFFF strict {} 0 {} {} +}; # iso8859-10 + +# +# iso8859-11 (generated from glibc-ISO_8859_11-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-11 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-11 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 
9D 009E 9E 009F 9F 00A0 A0 0E01 A1 0E02 A2 0E03 A3 0E04 A4 0E05 A5 0E06 A6 0E07 A7 0E08 A8 0E09 A9 0E0A AA 0E0B AB 0E0C AC 0E0D AD 0E0E AE 0E0F AF 0E10 B0 0E11 B1 0E12 B2 0E13 B3 0E14 B4 0E15 B5 0E16 B6 0E17 B7 0E18 B8 0E19 B9 0E1A BA 0E1B BB 0E1C BC 0E1D BD 0E1E BE 0E1F BF 0E20 C0 0E21 C1 0E22 C2 0E23 C3 0E24 C4 0E25 C5 0E26 C6 0E27 C7 0E28 C8 0E29 C9 0E2A CA 0E2B CB 0E2C CC 0E2D CD 0E2E CE 0E2F CF 0E30 D0 0E31 D1 0E32 D2 0E33 D3 0E34 D4 0E35 D5 0E36 D6 0E37 D7 0E38 D8 0E39 D9 0E3A DA 0E3F DF 0E40 E0 0E41 E1 0E42 E2 0E43 E3 0E44 E4 0E45 E5 0E46 E6 0E47 E7 0E48 E8 0E49 E9 0E4A EA 0E4B EB 0E4C EC 0E4D ED 0E4E EE 0E4F EF 0E50 F0 0E51 F1 0E52 F2 0E53 F3 0E54 F4 0E55 F5 0E56 F6 0E57 F7 0E58 F8 0E59 F9 0E5A FA 0E5B FB} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-11 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-11 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 
8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 0E01 A1 0E02 A2 0E03 A3 0E04 A4 0E05 A5 0E06 A6 0E07 A7 0E08 A8 0E09 A9 0E0A AA 0E0B AB 0E0C AC 0E0D AD 0E0E AE 0E0F AF 0E10 B0 0E11 B1 0E12 B2 0E13 B3 0E14 B4 0E15 B5 0E16 B6 0E17 B7 0E18 B8 0E19 B9 0E1A BA 0E1B BB 0E1C BC 0E1D BD 0E1E BE 0E1F BF 0E20 C0 0E21 C1 0E22 C2 0E23 C3 0E24 C4 0E25 C5 0E26 C6 0E27 C7 0E28 C8 0E29 C9 0E2A CA 0E2B CB 0E2C CC 0E2D CD 0E2E CE 0E2F CF 0E30 D0 0E31 D1 0E32 D2 0E33 D3 0E34 D4 0E35 D5 0E36 D6 0E37 D7 0E38 D8 0E39 D9 0E3A DA 0E3F DF 0E40 E0 0E41 E1 0E42 E2 0E43 E3 0E44 E4 0E45 E5 0E46 E6 0E47 E7 0E48 E8 0E49 E9 0E4A EA 0E4B EB 0E4C EC 0E4D ED 0E4E EE 0E4F EF 0E50 F0 0E51 F1 0E52 F2 0E53 F3 0E54 F4 0E55 F5 0E56 F6 0E57 F7 0E58 F8 0E59 F9 0E5A FA 0E5B FB} +} -result {} + +# iso8859-11 - invalid byte sequences +lappend encInvalidBytes {*}{ + iso8859-11 DB tcl8 \U000000DB -1 {} {} + iso8859-11 DB replace \uFFFD -1 {} {} + iso8859-11 DB strict {} 0 {} {} + iso8859-11 DC tcl8 \U000000DC -1 {} {} + iso8859-11 DC replace \uFFFD -1 {} {} + iso8859-11 DC strict {} 0 {} {} + iso8859-11 DD tcl8 \U000000DD -1 {} {} + iso8859-11 DD replace \uFFFD -1 {} {} + iso8859-11 DD strict {} 0 {} {} + iso8859-11 DE tcl8 \U000000DE -1 {} {} + iso8859-11 DE replace \uFFFD -1 {} {} + iso8859-11 DE strict {} 0 {} {} + iso8859-11 FC tcl8 \U000000FC -1 {} {} + iso8859-11 FC replace \uFFFD -1 {} {} + iso8859-11 FC strict {} 0 {} {} + iso8859-11 FD tcl8 \U000000FD -1 {} {} + iso8859-11 FD replace \uFFFD -1 {} {} + iso8859-11 FD strict {} 0 {} {} + iso8859-11 FE tcl8 \U000000FE -1 {} {} + iso8859-11 FE replace \uFFFD -1 {} {} + iso8859-11 FE strict {} 0 {} {} + iso8859-11 FF tcl8 \U000000FF -1 {} {} + iso8859-11 FF replace \uFFFD -1 {} {} + iso8859-11 FF strict {} 0 {} {} +}; # iso8859-11 + +# iso8859-11 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-11 \U000000A1 tcl8 1A -1 {} {} + iso8859-11 
\U000000A1 replace 1A -1 {} {} + iso8859-11 \U000000A1 strict {} 0 {} {} + iso8859-11 \U00000400 tcl8 1A -1 {} {} + iso8859-11 \U00000400 replace 1A -1 {} {} + iso8859-11 \U00000400 strict {} 0 {} {} + iso8859-11 \U0000D800 tcl8 1A -1 {} {} + iso8859-11 \U0000D800 replace 1A -1 {} {} + iso8859-11 \U0000D800 strict {} 0 {} {} + iso8859-11 \U0000DC00 tcl8 1A -1 {} {} + iso8859-11 \U0000DC00 replace 1A -1 {} {} + iso8859-11 \U0000DC00 strict {} 0 {} {} + iso8859-11 \U00010000 tcl8 1A -1 {} {} + iso8859-11 \U00010000 replace 1A -1 {} {} + iso8859-11 \U00010000 strict {} 0 {} {} + iso8859-11 \U0010FFFF tcl8 1A -1 {} {} + iso8859-11 \U0010FFFF replace 1A -1 {} {} + iso8859-11 \U0010FFFF strict {} 0 {} {} +}; # iso8859-11 + +# +# iso8859-13 (generated from glibc-ISO_8859_13-2.3.3) + +test encoding-convertfrom-ucmCompare-iso8859-13 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-13 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 
0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A2 A2 00A3 A3 00A4 A4 00A6 A6 00A7 A7 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B5 B5 00B6 B6 00B7 B7 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00C4 C4 00C5 C5 00C6 AF 00C9 C9 00D3 D3 00D5 D5 00D6 D6 00D7 D7 00D8 A8 00DC DC 00DF DF 00E4 E4 00E5 E5 00E6 BF 00E9 E9 00F3 F3 00F5 F5 00F6 F6 00F7 F7 00F8 B8 00FC FC 0100 C2 0101 E2 0104 C0 0105 E0 0106 C3 0107 E3 010C C8 010D E8 0112 C7 0113 E7 0116 CB 0117 EB 0118 C6 0119 E6 0122 CC 0123 EC 012A CE 012B EE 012E C1 012F E1 0136 CD 0137 ED 013B CF 013C EF 0141 D9 0142 F9 0143 D1 0144 F1 0145 D2 0146 F2 014C D4 014D F4 0156 AA 0157 BA 015A DA 015B FA 0160 D0 0161 F0 016A DB 016B FB 0172 D8 0173 F8 0179 CA 017A EA 017B DD 017C FD 017D DE 017E FE 2019 FF 201C B4 201D A1 201E A5} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-13 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-13 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 
0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A2 A2 00A3 A3 00A4 A4 00A6 A6 00A7 A7 00A9 A9 00AB AB 00AC AC 00AD AD 00AE AE 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B5 B5 00B6 B6 00B7 B7 00B9 B9 00BB BB 00BC BC 00BD BD 00BE BE 00C4 C4 00C5 C5 00C6 AF 00C9 C9 00D3 D3 00D5 D5 00D6 D6 00D7 D7 00D8 A8 00DC DC 00DF DF 00E4 E4 00E5 E5 00E6 BF 00E9 E9 00F3 F3 00F5 F5 00F6 F6 00F7 F7 00F8 B8 00FC FC 0100 C2 0101 E2 0104 C0 0105 E0 0106 C3 0107 E3 010C C8 010D E8 0112 C7 0113 E7 0116 CB 0117 EB 0118 C6 0119 E6 0122 CC 0123 EC 012A CE 012B EE 012E C1 012F E1 0136 CD 0137 ED 013B CF 013C EF 0141 D9 0142 F9 0143 D1 0144 F1 0145 D2 0146 F2 014C D4 014D F4 0156 AA 0157 BA 015A DA 015B FA 0160 D0 0161 F0 016A DB 016B FB 0172 D8 0173 F8 0179 CA 017A EA 017B DD 017C FD 017D DE 017E FE 2019 FF 201C B4 201D A1 201E A5} +} -result {} + +# iso8859-13 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-13 + +# iso8859-13 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-13 \U000000A1 tcl8 1A -1 {} {} + iso8859-13 \U000000A1 replace 1A -1 {} {} + iso8859-13 \U000000A1 strict {} 0 {} {} + iso8859-13 \U00000400 tcl8 1A -1 {} {} + iso8859-13 \U00000400 replace 1A -1 {} {} + iso8859-13 \U00000400 strict {} 0 {} {} + iso8859-13 \U0000D800 tcl8 1A -1 {} {} + iso8859-13 \U0000D800 replace 1A -1 {} {} + iso8859-13 \U0000D800 strict {} 0 {} {} + iso8859-13 \U0000DC00 tcl8 1A -1 {} {} + iso8859-13 \U0000DC00 replace 1A -1 {} {} + iso8859-13 \U0000DC00 strict {} 0 {} {} + iso8859-13 \U00010000 tcl8 1A -1 {} {} + iso8859-13 \U00010000 replace 1A -1 {} {} + iso8859-13 \U00010000 strict {} 0 {} {} + iso8859-13 \U0010FFFF tcl8 1A -1 {} {} 
+ iso8859-13 \U0010FFFF replace 1A -1 {} {} + iso8859-13 \U0010FFFF strict {} 0 {} {} +}; # iso8859-13 + +# +# iso8859-14 (generated from glibc-ISO_8859_14-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-14 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-14 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A3 A3 00A7 A7 00A9 A9 00AD AD 00AE AE 00B6 B6 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DD DD 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F1 F1 00F2 F2 
00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FD FD 00FF FF 010A A4 010B A5 0120 B2 0121 B3 0174 D0 0175 F0 0176 DE 0177 FE 0178 AF 1E02 A1 1E03 A2 1E0A A6 1E0B AB 1E1E B0 1E1F B1 1E40 B4 1E41 B5 1E56 B7 1E57 B9 1E60 BB 1E61 BF 1E6A D7 1E6B F7 1E80 A8 1E81 B8 1E82 AA 1E83 BA 1E84 BD 1E85 BE 1EF2 AC 1EF3 BC} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-14 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-14 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A3 A3 00A7 A7 00A9 A9 00AD AD 00AE AE 00B6 B6 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D8 D8 00D9 D9 
00DA DA 00DB DB 00DC DC 00DD DD 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FD FD 00FF FF 010A A4 010B A5 0120 B2 0121 B3 0174 D0 0175 F0 0176 DE 0177 FE 0178 AF 1E02 A1 1E03 A2 1E0A A6 1E0B AB 1E1E B0 1E1F B1 1E40 B4 1E41 B5 1E56 B7 1E57 B9 1E60 BB 1E61 BF 1E6A D7 1E6B F7 1E80 A8 1E81 B8 1E82 AA 1E83 BA 1E84 BD 1E85 BE 1EF2 AC 1EF3 BC} +} -result {} + +# iso8859-14 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-14 + +# iso8859-14 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-14 \U000000A1 tcl8 1A -1 {} {} + iso8859-14 \U000000A1 replace 1A -1 {} {} + iso8859-14 \U000000A1 strict {} 0 {} {} + iso8859-14 \U00000400 tcl8 1A -1 {} {} + iso8859-14 \U00000400 replace 1A -1 {} {} + iso8859-14 \U00000400 strict {} 0 {} {} + iso8859-14 \U0000D800 tcl8 1A -1 {} {} + iso8859-14 \U0000D800 replace 1A -1 {} {} + iso8859-14 \U0000D800 strict {} 0 {} {} + iso8859-14 \U0000DC00 tcl8 1A -1 {} {} + iso8859-14 \U0000DC00 replace 1A -1 {} {} + iso8859-14 \U0000DC00 strict {} 0 {} {} + iso8859-14 \U00010000 tcl8 1A -1 {} {} + iso8859-14 \U00010000 replace 1A -1 {} {} + iso8859-14 \U00010000 strict {} 0 {} {} + iso8859-14 \U0010FFFF tcl8 1A -1 {} {} + iso8859-14 \U0010FFFF replace 1A -1 {} {} + iso8859-14 \U0010FFFF strict {} 0 {} {} +}; # iso8859-14 + +# +# iso8859-15 (generated from glibc-ISO_8859_15-2.1.2) + +test encoding-convertfrom-ucmCompare-iso8859-15 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-15 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 
002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A5 A5 00A7 A7 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B5 B5 00B6 B6 00B7 B7 00B9 B9 00BA BA 00BB BB 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D0 D0 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DD DD 00DE DE 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F0 F0 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FD FD 00FE FE 00FF FF 0152 BC 0153 BD 0160 A6 0161 A8 0178 BE 017D B4 017E B8 20AC A4} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-15 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-15 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 
0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A1 A1 00A2 A2 00A3 A3 00A5 A5 00A7 A7 00A9 A9 00AA AA 00AB AB 00AC AC 00AD AD 00AE AE 00AF AF 00B0 B0 00B1 B1 00B2 B2 00B3 B3 00B5 B5 00B6 B6 00B7 B7 00B9 B9 00BA BA 00BB BB 00BF BF 00C0 C0 00C1 C1 00C2 C2 00C3 C3 00C4 C4 00C5 C5 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D0 D0 00D1 D1 00D2 D2 00D3 D3 00D4 D4 00D5 D5 00D6 D6 00D7 D7 00D8 D8 00D9 D9 00DA DA 00DB DB 00DC DC 00DD DD 00DE DE 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E3 E3 00E4 E4 00E5 E5 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F0 F0 00F1 F1 00F2 F2 00F3 F3 00F4 F4 00F5 F5 00F6 F6 00F7 F7 00F8 F8 00F9 F9 00FA FA 00FB FB 00FC FC 00FD FD 00FE FE 00FF FF 0152 BC 0153 BD 0160 A6 0161 A8 0178 BE 017D B4 017E B8 20AC A4} +} -result {} + +# iso8859-15 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-15 
+ +# iso8859-15 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-15 \U000000A4 tcl8 1A -1 {} {} + iso8859-15 \U000000A4 replace 1A -1 {} {} + iso8859-15 \U000000A4 strict {} 0 {} {} + iso8859-15 \U00000400 tcl8 1A -1 {} {} + iso8859-15 \U00000400 replace 1A -1 {} {} + iso8859-15 \U00000400 strict {} 0 {} {} + iso8859-15 \U0000D800 tcl8 1A -1 {} {} + iso8859-15 \U0000D800 replace 1A -1 {} {} + iso8859-15 \U0000D800 strict {} 0 {} {} + iso8859-15 \U0000DC00 tcl8 1A -1 {} {} + iso8859-15 \U0000DC00 replace 1A -1 {} {} + iso8859-15 \U0000DC00 strict {} 0 {} {} + iso8859-15 \U00010000 tcl8 1A -1 {} {} + iso8859-15 \U00010000 replace 1A -1 {} {} + iso8859-15 \U00010000 strict {} 0 {} {} + iso8859-15 \U0010FFFF tcl8 1A -1 {} {} + iso8859-15 \U0010FFFF replace 1A -1 {} {} + iso8859-15 \U0010FFFF strict {} 0 {} {} +}; # iso8859-15 + +# +# iso8859-16 (generated from glibc-ISO_8859_16-2.3.3) + +test encoding-convertfrom-ucmCompare-iso8859-16 {Compare against ICU UCM} -body { + ucmConvertfromMismatches iso8859-16 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 
0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A7 A7 00A9 A9 00AB AB 00AD AD 00B0 B0 00B1 B1 00B6 B6 00B7 B7 00BB BB 00C0 C0 00C1 C1 00C2 C2 00C4 C4 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D2 D2 00D3 D3 00D4 D4 00D6 D6 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E4 E4 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F2 F2 00F3 F3 00F4 F4 00F6 F6 00F9 F9 00FA FA 00FB FB 00FC FC 00FF FF 0102 C3 0103 E3 0104 A1 0105 A2 0106 C5 0107 E5 010C B2 010D B9 0110 D0 0111 F0 0118 DD 0119 FD 0141 A3 0142 B3 0143 D1 0144 F1 0150 D5 0151 F5 0152 BC 0153 BD 015A D7 015B F7 0160 A6 0161 A8 0170 D8 0171 F8 0178 BE 0179 AC 017A AE 017B AF 017C BF 017D B4 017E B8 0218 AA 0219 BA 021A DE 021B FE 201D B5 201E A5 20AC A4} +} -result {} + +test encoding-convertto-ucmCompare-iso8859-16 {Compare against ICU UCM} -body { + ucmConverttoMismatches iso8859-16 {0000 00 0001 01 0002 02 0003 03 0004 04 0005 05 0006 06 0007 07 0008 08 0009 09 000A 0A 000B 0B 000C 0C 000D 0D 000E 0E 000F 0F 0010 10 0011 11 0012 12 0013 13 0014 14 0015 15 0016 16 0017 17 0018 18 0019 19 001A 1A 001B 1B 001C 1C 001D 1D 001E 1E 001F 1F 0020 20 0021 21 0022 22 0023 23 0024 24 0025 25 0026 26 0027 27 0028 28 0029 29 002A 2A 002B 2B 002C 2C 002D 2D 002E 2E 002F 2F 0030 30 0031 31 0032 32 0033 33 0034 34 0035 35 0036 36 0037 37 0038 38 0039 39 003A 3A 003B 3B 003C 3C 003D 3D 003E 3E 003F 3F 0040 40 0041 41 0042 42 0043 43 0044 44 0045 45 0046 46 0047 47 0048 48 0049 49 004A 4A 004B 4B 004C 4C 004D 4D 004E 4E 004F 4F 0050 50 0051 51 0052 52 0053 53 0054 54 0055 55 0056 56 0057 57 0058 58 0059 59 005A 5A 005B 5B 005C 5C 005D 5D 005E 5E 005F 5F 0060 60 
0061 61 0062 62 0063 63 0064 64 0065 65 0066 66 0067 67 0068 68 0069 69 006A 6A 006B 6B 006C 6C 006D 6D 006E 6E 006F 6F 0070 70 0071 71 0072 72 0073 73 0074 74 0075 75 0076 76 0077 77 0078 78 0079 79 007A 7A 007B 7B 007C 7C 007D 7D 007E 7E 007F 7F 0080 80 0081 81 0082 82 0083 83 0084 84 0085 85 0086 86 0087 87 0088 88 0089 89 008A 8A 008B 8B 008C 8C 008D 8D 008E 8E 008F 8F 0090 90 0091 91 0092 92 0093 93 0094 94 0095 95 0096 96 0097 97 0098 98 0099 99 009A 9A 009B 9B 009C 9C 009D 9D 009E 9E 009F 9F 00A0 A0 00A7 A7 00A9 A9 00AB AB 00AD AD 00B0 B0 00B1 B1 00B6 B6 00B7 B7 00BB BB 00C0 C0 00C1 C1 00C2 C2 00C4 C4 00C6 C6 00C7 C7 00C8 C8 00C9 C9 00CA CA 00CB CB 00CC CC 00CD CD 00CE CE 00CF CF 00D2 D2 00D3 D3 00D4 D4 00D6 D6 00D9 D9 00DA DA 00DB DB 00DC DC 00DF DF 00E0 E0 00E1 E1 00E2 E2 00E4 E4 00E6 E6 00E7 E7 00E8 E8 00E9 E9 00EA EA 00EB EB 00EC EC 00ED ED 00EE EE 00EF EF 00F2 F2 00F3 F3 00F4 F4 00F6 F6 00F9 F9 00FA FA 00FB FB 00FC FC 00FF FF 0102 C3 0103 E3 0104 A1 0105 A2 0106 C5 0107 E5 010C B2 010D B9 0110 D0 0111 F0 0118 DD 0119 FD 0141 A3 0142 B3 0143 D1 0144 F1 0150 D5 0151 F5 0152 BC 0153 BD 015A D7 015B F7 0160 A6 0161 A8 0170 D8 0171 F8 0178 BE 0179 AC 017A AE 017B AF 017C BF 017D B4 017E B8 0218 AA 0219 BA 021A DE 021B FE 201D B5 201E A5 20AC A4} +} -result {} + +# iso8859-16 - invalid byte sequences +lappend encInvalidBytes {*}{ +}; # iso8859-16 + +# iso8859-16 - invalid byte sequences +lappend encUnencodableStrings {*}{ + iso8859-16 \U000000A1 tcl8 1A -1 {} {} + iso8859-16 \U000000A1 replace 1A -1 {} {} + iso8859-16 \U000000A1 strict {} 0 {} {} + iso8859-16 \U00000400 tcl8 1A -1 {} {} + iso8859-16 \U00000400 replace 1A -1 {} {} + iso8859-16 \U00000400 strict {} 0 {} {} + iso8859-16 \U0000D800 tcl8 1A -1 {} {} + iso8859-16 \U0000D800 replace 1A -1 {} {} + iso8859-16 \U0000D800 strict {} 0 {} {} + iso8859-16 \U0000DC00 tcl8 1A -1 {} {} + iso8859-16 \U0000DC00 replace 1A -1 {} {} + iso8859-16 \U0000DC00 strict {} 0 {} {} + iso8859-16 \U00010000 tcl8 1A -1 {} 
{} + iso8859-16 \U00010000 replace 1A -1 {} {} + iso8859-16 \U00010000 strict {} 0 {} {} + iso8859-16 \U0010FFFF tcl8 1A -1 {} {} + iso8859-16 \U0010FFFF replace 1A -1 {} {} + iso8859-16 \U0010FFFF strict {} 0 {} {} +}; # iso8859-16 diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl index e971631..dc878ef 100644 --- a/tools/ucm2tests.tcl +++ b/tools/ucm2tests.tcl @@ -37,14 +37,27 @@ namespace eval ucm { iso8859-9 glibc-ISO_8859_9-2.1.2 iso8859-10 glibc-ISO_8859_10-2.1.2 iso8859-11 glibc-ISO_8859_11-2.1.2 - iso8859-13 glibc-ISO_8859_13-2.1.2 + iso8859-13 glibc-ISO_8859_13-2.3.3 iso8859-14 glibc-ISO_8859_14-2.1.2 iso8859-15 glibc-ISO_8859_15-2.1.2 iso8859-16 glibc-ISO_8859_16-2.3.3 } - # Dictionary Character map for Tcl encoding + # Array keyed by Tcl encoding name. Each element contains mapping of + # Unicode code point -> byte sequence for that encoding as a flat list + # (or dictionary). Both are stored as hex strings variable charMap + + # Array keyed by Tcl encoding name. List of invalid code sequences + # each being a hex string. + variable invalidCodeSequences + + # Array keyed by Tcl encoding name. List of unicode code points that are + # not mapped, each being a hex string. 
+ variable unmappedCodePoints + + # The fallback character per encoding + variable encSubchar } proc ucm::abort {msg} { @@ -68,7 +81,11 @@ proc ucm::print {s} { puts $outputChan $s } -proc ucm::parse_SBCS {fd} { +proc ucm::parse_SBCS {encName fd} { + variable charMap + variable invalidCodeSequences + variable unmappedCodePoints + set result {} while {[gets $fd line] >= 0} { if {[string match #* $line]} { @@ -87,26 +104,44 @@ proc ucm::parse_SBCS {fd} { # It is a fallback mapping - ignore } } - return $result -} + set charMap($encName) $result -proc ucm::generate_tests {} { - variable encNameMap - variable charMap - variable outputPath - variable outputChan - - if {[info exists outputPath]} { - set outputChan [open $outputPath w] - } else { - set outputChan stdout + # Find out invalid code sequences and unicode code points that are not mapped + set valid {} + set mapped {} + foreach {unich bytes} $result { + lappend mapped $unich + lappend valid $bytes + } + set invalidCodeSequences($encName) {} + for {set i 0} {$i <= 255} {incr i} { + set hex [format %.2X $i] + if {[lsearch -exact $valid $hex] < 0} { + lappend invalidCodeSequences($encName) $hex + } } - array set tclNames {} - foreach encName [encoding names] { - set tclNames($encName) "" + set unmappedCodePoints($encName) {} + for {set i 0} {$i <= 65535} {incr i} { + set hex [format %.4X $i] + if {[lsearch -exact $mapped $hex] < 0} { + lappend unmappedCodePoints($encName) $hex + # Only look for (at most) one below 256 and one above 1024 + if {$i < 255} { + # Found one so jump past 8 bits + set i 255 + } else { + break + } + } + if {$i == 255} { + set i 1023 + } } + lappend unmappedCodePoints($encName) D800 DC00 10000 10FFFF +} +proc ucm::generate_boilerplate {} { # Common procedures print { # This file is automatically generated by ucm2tests.tcl. 
@@ -118,6 +153,7 @@ proc ucm::generate_tests {} { proc ucmConvertfromMismatches {enc map} { set mismatches {} foreach {unihex hex} $map { + set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits set unich [subst "\\U$unihex"] if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} { lappend mismatches "<[printable $unich],$hex>" @@ -128,6 +164,7 @@ proc ucmConvertfromMismatches {enc map} { proc ucmConverttoMismatches {enc map} { set mismatches {} foreach {unihex hex} $map { + set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits set unich [subst "\\U$unihex"] if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} { lappend mismatches "<[printable $unich],$hex>" @@ -154,6 +191,30 @@ if {[info commands printable] eq ""} { } } } +} ; # generate_boilerplate + +proc ucm::generate_tests {} { + variable encNameMap + variable charMap + variable invalidCodeSequences + variable unmappedCodePoints + variable outputPath + variable outputChan + variable encSubchar + + if {[info exists outputPath]} { + set outputChan [open $outputPath w] + fconfigure $outputChan -translation lf + } else { + set outputChan stdout + } + + array set tclNames {} + foreach encName [encoding names] { + set tclNames($encName) "" + } + + generate_boilerplate foreach encName [lsort -dictionary [array names encNameMap]] { if {![info exists charMap($encName)]} { warn "No character map read for $encName" @@ -161,6 +222,7 @@ if {[info commands printable] eq ""} { } unset tclNames($encName) + # Print the valid tests print "\n#\n# $encName (generated from $encNameMap($encName))" print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{" print " ucmConvertfromMismatches $encName {$charMap($encName)}" @@ -172,13 +234,42 @@ if {[info commands printable] eq ""} { # This will generate individual tests for every char # and test in lead, tail, middle, solo configurations # but takes considerable time - 
print "lappend encValidStrings {*}{" + print "lappend encValidStrings \{*\}\{" foreach {unich hex} $charMap($encName) { print " $encName \\u$unich $hex {} {}" } - print "}; # $encName" + print "\}; # $encName" + } + + # Generate the invalidity checks + print "\n# $encName - invalid byte sequences" + print "lappend encInvalidBytes \{*\}\{" + foreach hex $invalidCodeSequences($encName) { + # Map XXXX... to \xXX\xXX... + set uhex [regsub -all .. $hex {\\x\0}] + set uhex \\U[string range 00000000$hex end-7 end] + print " $encName $hex tcl8 $uhex -1 {} {}" + print " $encName $hex replace \\uFFFD -1 {} {}" + print " $encName $hex strict {} 0 {} {}" + } + print "\}; # $encName" + + print "\n# $encName - invalid byte sequences" + print "lappend encUnencodableStrings \{*\}\{" + if {[info exists encSubchar($encName)]} { + set subchar $encSubchar($encName) + } else { + set subchar "3F"; # Tcl uses ? by default } + foreach hex $unmappedCodePoints($encName) { + set uhex \\U[string range 00000000$hex end-7 end] + print " $encName $uhex tcl8 $subchar -1 {} {}" + print " $encName $uhex replace $subchar -1 {} {}" + print " $encName $uhex strict {} 0 {} {}" + } + print "\}; # $encName" } + if {[array size tclNames]} { warn "Missing encoding: [lsort [array names tclNames]]" } @@ -190,6 +281,8 @@ if {[info commands printable] eq ""} { proc ucm::parse_file {encName ucmPath} { variable charMap + variable encSubchar + set fd [open $ucmPath] try { # Parse the metadata @@ -205,7 +298,7 @@ proc ucm::parse_file {encName ucmPath} { } } if {![info exists state(charmap)]} { - abort "Error: $path has No CHARMAP line." + abort "Error: $ucmPath has No CHARMAP line." } foreach key {code_set_name uconv_class} { if {[info exists state($key)]} { @@ -216,18 +309,22 @@ proc ucm::parse_file {encName ucmPath} { abort "Duplicate file for $encName ($path)" } if {![info exists state(uconv_class)]} { - abort "Error: $path has no uconv_class definition." + abort "Error: $ucmPath has no uconv_class definition." 
+ } + if {[info exists state(subchar)]} { + # \xNN\xNN.. -> NNNN.. + set encSubchar($encName) [string map {\\x {}} $state(subchar)] } switch -exact -- $state(uconv_class) { SBCS { if {[catch { - set charMap($encName) [parse_SBCS $fd] + parse_SBCS $encName $fd } result]} { - abort "Could not process $path. $result" + abort "Could not process $ucmPath. $result" } } default { - log "Skipping $path -- not SBCS encoding." + log "Skipping $ucmPath -- not SBCS encoding." return } } @@ -236,15 +333,6 @@ proc ucm::parse_file {encName ucmPath} { } } -proc ucm::expand_paths {patterns} { - set expanded {} - foreach pat $patterns { - # The file join is for \ -> / - lappend expanded {*}[glob -nocomplain [file join $pat]] - } - return $expanded -} - proc ucm::run {} { variable encNameMap variable outputPath -- cgit v0.12 From d1920b380d4a987240715b3ce72f7d68dfca2b09 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 23 Feb 2023 10:22:58 +0000 Subject: Fix gcc warnings and encoding error message (bug [40c61a5d10]) --- generic/tclCmdAH.c | 2 +- generic/tclEncoding.c | 4 ++-- tests/cmdAH.test | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 4dfb541..1b74064 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -589,7 +589,7 @@ numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ interp, 1, objv, - "??-profile profile? ?-failindex var? ?encoding?? data"); + "? ?-profile profile? ?-failindex var? encoding ? 
data"); return TCL_ERROR; } diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index bc830b4..a877468 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -4265,7 +4265,7 @@ TclEncodingProfileNameToId( const char *profileName, /* Name of profile */ int *profilePtr) /* Output */ { - int i; + size_t i; for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { if (!strcmp(profileName, encodingProfiles[i].name)) { @@ -4305,7 +4305,7 @@ TclEncodingProfileIdToName( Tcl_Interp *interp, /* For error messages. May be NULL */ int profileValue) /* Profile #define value */ { - int i; + size_t i; for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { if (profileValue == encodingProfiles[i].value) { diff --git a/tests/cmdAH.test b/tests/cmdAH.test index cfde678..d76607c 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -175,8 +175,8 @@ test cmdAH-3.2 {Tcl_ContinueObjCmd, success} { # encoding command set "numargErrors(encoding system)" {^wrong # args: should be "(encoding |::tcl::encoding::)system \?encoding\?"$} -set "numargErrors(encoding convertfrom)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertfrom \?\?-profile profile\? \?-failindex var\? \?encoding\?\? data"$} -set "numargErrors(encoding convertto)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertto \?\?-profile profile\? \?-failindex var\? \?encoding\?\? data"$} +set "numargErrors(encoding convertfrom)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertfrom \? \?-profile profile\? \?-failindex var\? encoding \? data"$} +set "numargErrors(encoding convertto)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertto \? \?-profile profile\? \?-failindex var\? encoding \? 
data"$} set "numargErrors(encoding names)" {wrong # args: should be "encoding names"} set "numargErrors(encoding profiles)" {wrong # args: should be "encoding profiles"} -- cgit v0.12 From da915fdadfa41477f967f92d37c63e278621acd7 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 23 Feb 2023 13:19:45 +0000 Subject: New signature for Tcl_ExternalToUtfDStringEx and Tcl_UtfToExternalDStringEx as per TIP 656 --- generic/tcl.decls | 14 ++-- generic/tclCmdAH.c | 99 +++++++++++++++++++++++++--- generic/tclDecls.h | 18 +++--- generic/tclEncoding.c | 174 +++++++++++++++++++++++++++++++++++++++----------- tests/cmdAH.test | 24 +++++-- 5 files changed, 264 insertions(+), 65 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index a48ab02..a789ef6 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2441,13 +2441,17 @@ declare 656 { declare 657 { int Tcl_UniCharIsUnicode(int ch) } + +# TIP 656 declare 658 { - Tcl_Size Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding, - const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr) -} + int Tcl_ExternalToUtfDStringEx(Tcl_Interp *interp, Tcl_Encoding encoding, + const char *src, int srcLen, int flags, Tcl_DString *dsPtr, + Tcl_Size *errorLocationPtr) +} declare 659 { - Tcl_Size Tcl_UtfToExternalDStringEx(Tcl_Encoding encoding, - const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr) + int Tcl_UtfToExternalDStringEx(Tcl_Interp *interp, Tcl_Encoding encoding, + const char *src, int srcLen, int flags, Tcl_DString *dsPtr, + Tcl_Size *errorLocationPtr) } # TIP #511 diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 1b74064..24b2038 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -671,6 +671,7 @@ EncodingConvertfromObjCmd( int flags; int result; Tcl_Obj *failVarObj; + Tcl_Size errorLocation; if (EncodingConvertParseOptions( interp, objc, objv, &encoding, &data, &flags, &failVarObj) @@ -693,8 +694,47 @@ EncodingConvertfromObjCmd( if (bytesPtr == NULL) { return TCL_ERROR; } - result = 
Tcl_ExternalToUtfDStringEx(encoding, bytesPtr, length, - flags, &ds); + result = Tcl_ExternalToUtfDStringEx(interp, encoding, bytesPtr, length, flags, + &ds, failVarObj ? &errorLocation : NULL); + /* NOTE: ds must be freed beyond this point even on error */ + switch (result) { + case TCL_OK: + errorLocation = TCL_INDEX_NONE; + break; + case TCL_ERROR: + /* Error in parameters. Should not happen. interp will have error */ + Tcl_DStringFree(&ds); + return TCL_ERROR; + default: + /* + * One of the TCL_CONVERT_* errors. If we were not interested in the + * error location, interp result would already have been filled in + * and we can just return the error. Otherwise, we have to return + * what could be decoded and the returned error location. + */ + if (failVarObj == NULL) { + Tcl_DStringFree(&ds); + return TCL_ERROR; + } + break; + } + + /* + * TCL_OK or a TCL_CONVERT_* error where the caller wants back as much + * data as was converted. + */ + if (failVarObj) { + /* I hope, wide int will cover Tcl_Size data type */ + if (Tcl_ObjSetVar2(interp, + failVarObj, + NULL, + Tcl_NewWideIntObj(errorLocation), + TCL_LEAVE_ERR_MSG) == NULL) { + Tcl_DStringFree(&ds); + return TCL_ERROR; + } + } +#ifdef OBSOLETE if (result != TCL_INDEX_NONE && TCL_ENCODING_PROFILE_GET(flags) != TCL_ENCODING_PROFILE_TCL8) { if (failVarObj != NULL) { @@ -717,6 +757,7 @@ EncodingConvertfromObjCmd( return TCL_ERROR; } } +#endif /* * Note that we cannot use Tcl_DStringResult here because it will @@ -725,9 +766,7 @@ EncodingConvertfromObjCmd( Tcl_SetObjResult(interp, Tcl_DStringToObj(&ds)); - /* - * We're done with the encoding - */ + /* We're done with the encoding */ Tcl_FreeEncoding(encoding); return TCL_OK; @@ -763,6 +802,7 @@ EncodingConverttoObjCmd( int result; int flags; Tcl_Obj *failVarObj; + Tcl_Size errorLocation; if (EncodingConvertParseOptions( interp, objc, objv, &encoding, &data, &flags, &failVarObj) @@ -775,8 +815,47 @@ EncodingConverttoObjCmd( */ stringPtr = TclGetStringFromObj(data, 
&length); - result = Tcl_UtfToExternalDStringEx(encoding, stringPtr, length, - flags, &ds); + result = Tcl_UtfToExternalDStringEx(interp, encoding, stringPtr, length, flags, + &ds, failVarObj ? &errorLocation : NULL); + /* NOTE: ds must be freed beyond this point even on error */ + + switch (result) { + case TCL_OK: + errorLocation = TCL_INDEX_NONE; + break; + case TCL_ERROR: + /* Error in parameters. Should not happen. interp will have error */ + Tcl_DStringFree(&ds); + return TCL_ERROR; + default: + /* + * One of the TCL_CONVERT_* errors. If we were not interested in the + * error location, interp result would already have been filled in + * and we can just return the error. Otherwise, we have to return + * what could be decoded and the returned error location. + */ + if (failVarObj == NULL) { + Tcl_DStringFree(&ds); + return TCL_ERROR; + } + break; + } + /* + * TCL_OK or a TCL_CONVERT_* error where the caller wants back as much + * data as was converted. + */ + if (failVarObj) { + /* I hope, wide int will cover Tcl_Size data type */ + if (Tcl_ObjSetVar2(interp, + failVarObj, + NULL, + Tcl_NewWideIntObj(errorLocation), + TCL_LEAVE_ERR_MSG) == NULL) { + Tcl_DStringFree(&ds); + return TCL_ERROR; + } + } +#ifdef OBSOLETE if (result != TCL_INDEX_NONE && TCL_ENCODING_PROFILE_GET(flags) != TCL_ENCODING_PROFILE_TCL8) { if (failVarObj != NULL) { @@ -802,14 +881,14 @@ EncodingConverttoObjCmd( return TCL_ERROR; } } +#endif + Tcl_SetObjResult(interp, Tcl_NewByteArrayObj((unsigned char*) Tcl_DStringValue(&ds), Tcl_DStringLength(&ds))); Tcl_DStringFree(&ds); - /* - * We're done with the encoding - */ + /* We're done with the encoding */ Tcl_FreeEncoding(encoding); return TCL_OK; diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 77517e8..fbfa8a1 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1955,13 +1955,15 @@ EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); /* 657 */ EXTERN int Tcl_UniCharIsUnicode(int ch); /* 658 */ -EXTERN 
Tcl_Size Tcl_ExternalToUtfDStringEx(Tcl_Encoding encoding, - const char *src, Tcl_Size srcLen, int flags, - Tcl_DString *dsPtr); +EXTERN int Tcl_ExternalToUtfDStringEx(Tcl_Interp *interp, + Tcl_Encoding encoding, const char *src, + int srcLen, int flags, Tcl_DString *dsPtr, + Tcl_Size *errorLocationPtr); /* 659 */ -EXTERN Tcl_Size Tcl_UtfToExternalDStringEx(Tcl_Encoding encoding, - const char *src, Tcl_Size srcLen, int flags, - Tcl_DString *dsPtr); +EXTERN int Tcl_UtfToExternalDStringEx(Tcl_Interp *interp, + Tcl_Encoding encoding, const char *src, + int srcLen, int flags, Tcl_DString *dsPtr, + Tcl_Size *errorLocationPtr); /* 660 */ EXTERN int Tcl_AsyncMarkFromSignal(Tcl_AsyncHandler async, int sigNumber); @@ -2741,8 +2743,8 @@ typedef struct TclStubs { const char * (*tcl_UtfNext) (const char *src); /* 655 */ const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 656 */ int (*tcl_UniCharIsUnicode) (int ch); /* 657 */ - Tcl_Size (*tcl_ExternalToUtfDStringEx) (Tcl_Encoding encoding, const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr); /* 658 */ - Tcl_Size (*tcl_UtfToExternalDStringEx) (Tcl_Encoding encoding, const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr); /* 659 */ + int (*tcl_ExternalToUtfDStringEx) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 658 */ + int (*tcl_UtfToExternalDStringEx) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 659 */ int (*tcl_AsyncMarkFromSignal) (Tcl_AsyncHandler async, int sigNumber); /* 660 */ int (*tclListObjGetElements) (Tcl_Interp *interp, Tcl_Obj *listPtr, size_t *objcPtr, Tcl_Obj ***objvPtr); /* 661 */ int (*tclListObjLength) (Tcl_Interp *interp, Tcl_Obj *listPtr, size_t *lengthPtr); /* 662 */ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index daab3a9..365aa90 100644 --- a/generic/tclEncoding.c +++ 
b/generic/tclEncoding.c @@ -1203,7 +1203,8 @@ Tcl_ExternalToUtfDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { - Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr); + Tcl_ExternalToUtfDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_TCL8, dstPtr, NULL); return Tcl_DStringValue(dstPtr); } @@ -1223,29 +1224,49 @@ Tcl_ExternalToUtfDString( * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags * - TCL_ENCODING_MODIFIED: enable Tcl internal conversion mapping \xC0\x80 * to 0x00. Only valid for "utf-8" and "cesu-8". + * Any other flag bits will cause an error to be returned (for future + * compatibility) * * Results: - * The converted bytes are stored in the DString, which is then NULL - * terminated in an encoding-specific manner. The return value is - * the error position in the source string or -1 if no conversion error - * is reported. - * + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. + * * Side effects: - * None. + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner. + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. 
+ * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. * *------------------------------------------------------------------------- */ int Tcl_ExternalToUtfDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ Tcl_Encoding encoding, /* The encoding for the source string, or NULL * for the default system encoding. */ const char *src, /* Source string in specified encoding. */ int srcLen, /* Source string length in bytes, or < 0 for * encoding-specific string length. */ int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ { char *dst; Tcl_EncodingState state; @@ -1253,14 +1274,14 @@ Tcl_ExternalToUtfDStringEx( int dstLen, result, soFar, srcRead, dstWrote, dstChars; const char *srcStart = src; - Tcl_DStringInit(dstPtr); + Tcl_DStringInit(dstPtr); /* Must always be initialized before returning */ dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; if (encoding == NULL) { - encoding = systemEncoding; + encoding = systemEncoding; } - encodingPtr = (Encoding *) encoding; + encodingPtr = (Encoding *)encoding; if (src == NULL) { srcLen = 0; @@ -1275,26 +1296,53 @@ Tcl_ExternalToUtfDStringEx( } while (1) { - result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, - flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars); - soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + result = encodingPtr->toUtfProc(encodingPtr->clientData, src, + srcLen, flags, &state, dst, dstLen, + &srcRead, &dstWrote, &dstChars); + soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + + src += srcRead; + if (result != TCL_CONVERT_NOSPACE) { + Tcl_Size nBytesProcessed = (Tcl_Size)(src - srcStart); + + 
Tcl_DStringSetLength(dstPtr, soFar); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? TCL_INDEX_NONE : nBytesProcessed; + } + else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + char buf[TCL_INTEGER_SPACE]; + sprintf(buf, "%u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf("unexpected byte sequence starting at index %" + "u: '\\x%X'", + nBytesProcessed, + UCHAR(srcStart[nBytesProcessed]))); + Tcl_SetErrorCode( + interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL); + } + } + return result; + } - src += srcRead; - if (result != TCL_CONVERT_NOSPACE) { - Tcl_DStringSetLength(dstPtr, soFar); - return (result == TCL_OK) ? TCL_INDEX_NONE : (int)(src - srcStart); - } - flags &= ~TCL_ENCODING_START; - srcLen -= srcRead; - if (Tcl_DStringLength(dstPtr) == 0) { - Tcl_DStringSetLength(dstPtr, dstLen); - } - Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1); - dst = Tcl_DStringValue(dstPtr) + soFar; - dstLen = Tcl_DStringLength(dstPtr) - soFar - 1; + /* Expand space and continue */ + flags &= ~TCL_ENCODING_START; + srcLen -= srcRead; + if (Tcl_DStringLength(dstPtr) == 0) { + Tcl_DStringSetLength(dstPtr, dstLen); + } + Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1); + dst = Tcl_DStringValue(dstPtr) + soFar; + dstLen = Tcl_DStringLength(dstPtr) - soFar - 1; } } - + /* *------------------------------------------------------------------------- * @@ -1441,7 +1489,8 @@ Tcl_UtfToExternalDString( Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. 
*/ { - Tcl_UtfToExternalDStringEx(encoding, src, srcLen, TCL_ENCODING_PROFILE_DEFAULT, dstPtr); + Tcl_UtfToExternalDStringEx( + NULL, encoding, src, srcLen, TCL_ENCODING_PROFILE_DEFAULT, dstPtr, NULL); return Tcl_DStringValue(dstPtr); } @@ -1462,27 +1511,45 @@ Tcl_UtfToExternalDString( * of 0x00. Only valid for "utf-8" and "cesu-8". * * Results: - * The converted bytes are stored in the DString, which is then NULL - * terminated in an encoding-specific manner. The return value is - * the error position in the source string or -1 if no conversion error - * is reported. + * The return value is one of + * TCL_OK: success. Converted string in *dstPtr + * TCL_ERROR: error in passed parameters. Error message in interp + * TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence + * TCL_CONVERT_SYNTAX: source is not conformant to encoding definition + * TCL_CONVERT_UNKNOWN: source contained a character that could not + * be represented in target encoding. * * Side effects: - * None. + * + * TCL_OK: The converted bytes are stored in the DString and NUL + * terminated in an encoding-specific manner + * TCL_ERROR: an error, message is stored in the interp if not NULL. + * TCL_CONVERT_*: if errorLocPtr is NULL, an error message is stored + * in the interpreter (if not NULL). If errorLocPtr is not NULL, + * no error message is stored as it is expected the caller is + * interested in whatever is decoded so far and not treating this + * as an error condition. + * + * In addition, *dstPtr is always initialized and must be cleared + * by the caller irrespective of the return code. * *------------------------------------------------------------------------- */ int Tcl_UtfToExternalDStringEx( + Tcl_Interp *interp, /* For error messages. May be NULL. */ Tcl_Encoding encoding, /* The encoding for the converted string, or * NULL for the default system encoding. */ const char *src, /* Source string in UTF-8. 
*/ int srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ int flags, /* Conversion control flags. */ - Tcl_DString *dstPtr) /* Uninitialized or free DString in which the + Tcl_DString *dstPtr, /* Uninitialized or free DString in which the * converted string is stored. */ + Tcl_Size *errorLocPtr) /* Where to store the error location + (or TCL_INDEX_NONE if no error). May + be NULL. */ { char *dst; Tcl_EncodingState state; @@ -1505,21 +1572,49 @@ Tcl_UtfToExternalDStringEx( } else if (srcLen < 0) { srcLen = strlen(src); } + flags = TclEncodingExternalFlagsToInternal(flags); flags |= TCL_ENCODING_START | TCL_ENCODING_END; while (1) { result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, - srcLen, flags, &state, dst, dstLen, - &srcRead, &dstWrote, &dstChars); + srcLen, flags, &state, dst, dstLen, + &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); src += srcRead; if (result != TCL_CONVERT_NOSPACE) { + Tcl_Size nBytesProcessed = (Tcl_Size)(src - srcStart); int i = soFar + encodingPtr->nullSize - 1; while (i >= soFar) { Tcl_DStringSetLength(dstPtr, i--); } - return (result == TCL_OK) ? TCL_INDEX_NONE : (int)(src - srcStart); + if (errorLocPtr) { + /* + * Do not write error message into interpreter if caller + * wants to know error location. + */ + *errorLocPtr = result == TCL_OK ? 
TCL_INDEX_NONE : nBytesProcessed; + } + else { + /* Caller wants error message on failure */ + if (result != TCL_OK && interp != NULL) { + Tcl_Size pos = Tcl_NumUtfChars(srcStart, nBytesProcessed); + int ucs4; + char buf[TCL_INTEGER_SPACE]; + TclUtfToUCS4(&srcStart[nBytesProcessed], &ucs4); + sprintf(buf, "%u", nBytesProcessed); + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf( + "unexpected character at index %" TCL_Z_MODIFIER + "u: 'U+%06X'", + pos, + ucs4)); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", + buf, NULL); + } + } + return result; } flags &= ~TCL_ENCODING_START; @@ -2682,6 +2777,8 @@ Utf32ToUtfProc( /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } + + /* * If we had a truncated code unit at the end AND this is the last * fragment AND profile is not "strict", stick FFFD in its place. @@ -2917,6 +3014,7 @@ Utf16ToUtfProc( /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } + /* * If we had a truncated code unit at the end AND this is the last * fragment AND profile is not "strict", stick FFFD in its place. diff --git a/tests/cmdAH.test b/tests/cmdAH.test index f8eba4e..471d46a 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -703,15 +703,25 @@ lappend encInvalidBytes {*}{ # happen when the sequence is at the end (including by itself) Thus {solo tail} # in some cases. 
lappend encInvalidBytes {*}{ - utf-16le 41 tcl8 \uFFFD -1 {solo tail} {Truncated} - utf-16le 41 replace \uFFFD -1 {solo tail} {Truncated} - utf-16le 41 strict {} 0 {solo tail} {Truncated} + utf-16le 41 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-16le 41 replace \uFFFD -1 {solo tail} {Truncated} + utf-16le 41 strict {} 0 {solo tail} {Truncated} utf-16le 00D8 tcl8 \uD800 -1 {} {Missing low surrogate} utf-16le 00D8 replace \uFFFD -1 {knownBug} {Missing low surrogate} utf-16le 00D8 strict {} 0 {knownBug} {Missing low surrogate} utf-16le 00DC tcl8 \uDC00 -1 {} {Missing high surrogate} utf-16le 00DC replace \uFFFD -1 {knownBug} {Missing high surrogate} utf-16le 00DC strict {} 0 {knownBug} {Missing high surrogate} + + utf-16be 41 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-16be 41 replace \uFFFD -1 {solo tail} {Truncated} + utf-16be 41 strict {} 0 {solo tail} {Truncated} + utf-16be D800 tcl8 \uD800 -1 {} {Missing low surrogate} + utf-16be D800 replace \uFFFD -1 {knownBug} {Missing low surrogate} + utf-16be D800 strict {} 0 {knownBug} {Missing low surrogate} + utf-16be DC00 tcl8 \uDC00 -1 {} {Missing high surrogate} + utf-16be DC00 replace \uFFFD -1 {knownBug} {Missing high surrogate} + utf-16be DC00 strict {} 0 {knownBug} {Missing high surrogate} } # utf32-le and utf32-be test cases. 
Note utf32 cases are automatically generated @@ -727,7 +737,7 @@ lappend encInvalidBytes {*}{ utf-32le 4100 strict {} 0 {solo tail} {Truncated} utf-32le 410000 tcl8 \uFFFD -1 {solo tail} {Truncated} utf-32le 410000 replace \uFFFD -1 {solo} {Truncated} - utf-32le 410000 strict {} 0 {solo tail} {Truncated} + utf-32le 410000 strict {} 0 {solo tail} {Truncated} utf-32le 00D80000 tcl8 \uD800 -1 {} {High-surrogate} utf-32le 00D80000 replace \uFFFD -1 {} {High-surrogate} utf-32le 00D80000 strict {} 0 {} {High-surrogate} @@ -745,8 +755,14 @@ lappend encInvalidBytes {*}{ utf-32le FFFFFFFF strict {} 0 {} {Out of range} utf-32be 41 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32be 41 replace \uFFFD -1 {solo tail} {Truncated} + utf-32be 41 strict {} 0 {solo tail} {Truncated} utf-32be 0041 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32be 0041 replace \uFFFD -1 {solo} {Truncated} + utf-32be 0041 strict {} 0 {solo tail} {Truncated} utf-32be 000041 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32be 000041 replace \uFFFD -1 {solo} {Truncated} + utf-32be 000041 strict {} 0 {solo tail} {Truncated} utf-32be 0000D800 tcl8 \uD800 -1 {} {High-surrogate} utf-32be 0000D800 replace \uFFFD -1 {} {High-surrogate} utf-32be 0000D800 strict {} 0 {} {High-surrogate} -- cgit v0.12 From 186cc71273a606360094ccb275bc239c6c17235a Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 23 Feb 2023 13:24:58 +0000 Subject: Had forgotten to remove disabled code --- generic/tclCmdAH.c | 52 ---------------------------------------------------- 1 file changed, 52 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 24b2038..93c3416 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -734,31 +734,6 @@ EncodingConvertfromObjCmd( return TCL_ERROR; } } -#ifdef OBSOLETE - if (result != TCL_INDEX_NONE && - TCL_ENCODING_PROFILE_GET(flags) != TCL_ENCODING_PROFILE_TCL8) { - if (failVarObj != NULL) { - if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == 
NULL) { - return TCL_ERROR; - } - } else { - char buf[TCL_INTEGER_SPACE]; - sprintf(buf, "%u", result); - Tcl_SetObjResult(interp, Tcl_ObjPrintf("unexpected byte sequence starting at index %" - "u: '\\x%X'", result, UCHAR(bytesPtr[result]))); - Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", - buf, NULL); - Tcl_DStringFree(&ds); - return TCL_ERROR; - } - } - else if (failVarObj != NULL) { - if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewIntObj(-1), TCL_LEAVE_ERR_MSG) == NULL) { - return TCL_ERROR; - } - } -#endif - /* * Note that we cannot use Tcl_DStringResult here because it will * truncate the string at the first null byte. @@ -855,33 +830,6 @@ EncodingConverttoObjCmd( return TCL_ERROR; } } -#ifdef OBSOLETE - if (result != TCL_INDEX_NONE && - TCL_ENCODING_PROFILE_GET(flags) != TCL_ENCODING_PROFILE_TCL8) { - if (failVarObj != NULL) { - /* I hope, wide int will cover size_t data type */ - if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == NULL) { - return TCL_ERROR; - } - } else { - size_t pos = Tcl_NumUtfChars(stringPtr, result); - int ucs4; - char buf[TCL_INTEGER_SPACE]; - TclUtfToUCS4(&stringPtr[result], &ucs4); - sprintf(buf, "%u", result); - Tcl_SetObjResult(interp, Tcl_ObjPrintf("unexpected character at index %" - TCL_Z_MODIFIER "u: 'U+%06X'", pos, ucs4)); - Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", - buf, NULL); - Tcl_DStringFree(&ds); - return TCL_ERROR; - } - } else if (failVarObj != NULL) { - if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewIntObj(-1), TCL_LEAVE_ERR_MSG) == NULL) { - return TCL_ERROR; - } - } -#endif Tcl_SetObjResult(interp, Tcl_NewByteArrayObj((unsigned char*) Tcl_DStringValue(&ds), -- cgit v0.12 From 10c559acbfbd8c8848e7f8fb9166e00e2aec2dc5 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 23 Feb 2023 21:20:21 +0000 Subject: Remove left-over traces of [0a74820b6d], which was merged into the apn-encoding-profile and landed into tip-656. 
This commit was merged premature into core-8-branch, leaving a [dab7fd5973|memory leak] --- generic/tclIO.c | 59 +------ generic/tclIOCmd.c | 25 +-- tests/io.test | 474 +++++++++++------------------------------------------ 3 files changed, 99 insertions(+), 459 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index 880b669..b12adf6 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -4645,7 +4645,6 @@ Tcl_GetsObj( /* State info for channel */ ChannelBuffer *bufPtr; int inEofChar, skip, copiedTotal, oldFlags, oldRemoved; - int reportError = 0; int oldLength; Tcl_Encoding encoding; char *dst, *dstEnd, *eol, *eof; @@ -4654,7 +4653,6 @@ Tcl_GetsObj( if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR)) { UpdateInterest(chanPtr); Tcl_SetErrno(EILSEQ); - ResetFlag(statePtr, CHANNEL_ENCODING_ERROR); return TCL_INDEX_NONE; } @@ -4914,19 +4912,6 @@ Tcl_GetsObj( goto done; } goto gotEOL; - } else if (gs.bytesWrote == 0 - && GotFlag(statePtr, CHANNEL_ENCODING_ERROR)) { - /* Set eol to the position that caused the encoding error, and then - * coninue to gotEOL, which stores the data that was decoded - * without error to objPtr. This allows the caller to do something - * useful with the data decoded so far, and also results in the - * position of the file being the first byte that was not - * succesfully decoded, allowing further processing at exactly that - * point, if desired. 
- */ - eol = dstEnd; - reportError = 1; - goto gotEOL; } dst = dstEnd; } @@ -4970,16 +4955,7 @@ Tcl_GetsObj( Tcl_SetObjLength(objPtr, eol - objPtr->bytes); CommonGetsCleanup(chanPtr); ResetFlag(statePtr, CHANNEL_BLOCKED); - if (reportError) { - ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR|CHANNEL_ENCODING_ERROR); - /* reset CHANNEL_ENCODING_ERROR to afford a chance to reconfigure - * the channel and try again - */ - Tcl_SetErrno(EILSEQ); - copiedTotal = -1; - } else { - copiedTotal = gs.totalChars + gs.charsWrote - skip; - } + copiedTotal = gs.totalChars + gs.charsWrote - skip; goto done; /* @@ -6007,9 +5983,8 @@ DoReadChars( } if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR)) { - /* TODO: UpdateInterest not needed here? */ + /* TODO: We don't need this call? */ UpdateInterest(chanPtr); - Tcl_SetErrno(EILSEQ); return -1; } @@ -6025,7 +6000,7 @@ DoReadChars( assert(statePtr->inputEncodingFlags & TCL_ENCODING_END); assert(!GotFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR)); - /* TODO: UpdateInterest not needed here? */ + /* TODO: We don't need this call? */ UpdateInterest(chanPtr); return 0; } @@ -6039,7 +6014,7 @@ DoReadChars( } ResetFlag(statePtr, CHANNEL_BLOCKED|CHANNEL_EOF); statePtr->inputEncodingFlags &= ~TCL_ENCODING_END; - /* TODO: UpdateInterest not needed here? */ + /* TODO: We don't need this call? */ UpdateInterest(chanPtr); return 0; } @@ -6070,7 +6045,7 @@ DoReadChars( } /* - * Recycle current buffer if empty. + * If the current buffer is empty recycle it. */ bufPtr = statePtr->inQueueHead; @@ -6083,24 +6058,6 @@ DoReadChars( statePtr->inQueueTail = NULL; } } - - /* - * If CHANNEL_ENCODING_ERROR and CHANNEL_STICKY_EOF are both set, - * then CHANNEL_ENCODING_ERROR was caused by data that occurred - * after the EOF character was encountered, so it doesn't count as - * a real error. - */ - - if (GotFlag(statePtr, CHANNEL_ENCODING_ERROR) - && !GotFlag(statePtr, CHANNEL_STICKY_EOF) - && !GotFlag(statePtr, CHANNEL_NONBLOCKING)) { - /* Channel is synchronous. 
Return an error so that callers - * like [read] can return an error. - */ - Tcl_SetErrno(EILSEQ); - copied = -1; - goto finish; - } } if (copiedNow < 0) { @@ -6129,7 +6086,6 @@ DoReadChars( } } -finish: /* * Failure to fill a channel buffer may have left channel reporting a * "blocked" state, but so long as we fulfilled the request here, the @@ -6793,14 +6749,11 @@ TranslateInputEOL( * EOF character was seen in EOL translated range. Leave current file * position pointing at the EOF character, but don't store the EOF * character in the output string. - * - * If CHANNEL_ENCODING_ERROR is set, it can only be because of data - * encountered after the EOF character, so it is nonsense. Unset it. */ SetFlag(statePtr, CHANNEL_EOF | CHANNEL_STICKY_EOF); statePtr->inputEncodingFlags |= TCL_ENCODING_END; - ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR|CHANNEL_ENCODING_ERROR); + ResetFlag(statePtr, CHANNEL_BLOCKED|INPUT_SAW_CR); } } diff --git a/generic/tclIOCmd.c b/generic/tclIOCmd.c index 507e06c..e8a534f 100644 --- a/generic/tclIOCmd.c +++ b/generic/tclIOCmd.c @@ -296,9 +296,6 @@ Tcl_GetsObjCmd( int lineLen; /* Length of line just read. */ int mode; /* Mode in which channel is opened. */ Tcl_Obj *linePtr, *chanObjPtr; - /* - Tcl_Obj *resultDictPtr, *returnOptsPtr; - */ int code = TCL_OK; if ((objc != 2) && (objc != 3)) { @@ -321,6 +318,7 @@ Tcl_GetsObjCmd( lineLen = Tcl_GetsObj(chan, linePtr); if (lineLen < 0) { if (!Tcl_Eof(chan) && !Tcl_InputBlocked(chan)) { + Tcl_DecrRefCount(linePtr); /* * TIP #219. 
@@ -334,15 +332,6 @@ Tcl_GetsObjCmd( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } - /* - resultDictPtr = Tcl_NewDictObj(); - Tcl_DictObjPut(NULL, resultDictPtr, Tcl_NewStringObj("read", -1) - , linePtr); - returnOptsPtr = Tcl_NewDictObj(); - Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) - , resultDictPtr); - Tcl_SetReturnOptions(interp, returnOptsPtr); - */ code = TCL_ERROR; goto done; } @@ -393,9 +382,6 @@ Tcl_ReadObjCmd( int charactersRead; /* How many characters were read? */ int mode; /* Mode in which channel is opened. */ Tcl_Obj *resultPtr, *chanObjPtr; - /* - Tcl_Obj *resultDictPtr, *returnOptsPtr; - */ if ((objc != 2) && (objc != 3)) { Interp *iPtr; @@ -484,17 +470,8 @@ Tcl_ReadObjCmd( "error reading \"%s\": %s", TclGetString(chanObjPtr), Tcl_PosixError(interp))); } - /* - resultDictPtr = Tcl_NewDictObj(); - Tcl_DictObjPut(NULL, resultDictPtr, Tcl_NewStringObj("read", -1) - , resultPtr); - returnOptsPtr = Tcl_NewDictObj(); - Tcl_DictObjPut(NULL, returnOptsPtr, Tcl_NewStringObj("-result", -1) - , resultDictPtr); TclChannelRelease(chan); Tcl_DecrRefCount(resultPtr); - Tcl_SetReturnOptions(interp, returnOptsPtr); - */ return TCL_ERROR; } diff --git a/tests/io.test b/tests/io.test index 0f47a8e..4578a93 100644 --- a/tests/io.test +++ b/tests/io.test @@ -1547,53 +1547,19 @@ test io-12.8 {ReadChars: multibyte chars split} { close $f scan [string index $in end] %c } 160 - - -apply [list {} { - set template { - test io-12.9.@variant@ {ReadChars: multibyte chars split, default (strict)} -body { - set res {} - set f [open $path(test1) w] - fconfigure $f -translation binary - puts -nonewline $f [string repeat a 9]\xC2 - close $f - set f [open $path(test1)] - fconfigure $f -encoding utf-8 @strict@ -buffersize 10 - set status [catch {read $f} cres copts] - #set in [dict get $copts -result] - #lappend res $in - lappend res $status $cres - set status [catch {read $f} cres copts] - #set in [dict get $copts -result] 
- #lappend res $in - lappend res $status $cres - set res - } -cleanup { - catch {close $f} - } -match glob\ - } - - #append template {\ - # -result {{read aaaaaaaaa} 1\ - # {error reading "*": illegal byte sequence}\ - # {read {}} 1 {error reading "*": illegal byte sequence}} - #} - - append template {\ - -result {1\ - {error reading "*": illegal byte sequence}\ - 1 {error reading "*": illegal byte sequence}} - } - - # strict encoding may be the default in Tcl 9, but in 8 it is not - foreach variant {encodingstrict} strict {{-encodingprofile strict}} { - set script [string map [ - list @variant@ $variant @strict@ $strict] $template] - uplevel 1 $script - } -} [namespace current]] - - +test io-12.9 {ReadChars: multibyte chars split} -body { + set f [open $path(test1) w] + fconfigure $f -translation binary + puts -nonewline $f [string repeat a 9]\xC2 + close $f + set f [open $path(test1)] + fconfigure $f -encoding utf-8 -buffersize 10 + set in [read $f] + close $f + scan [string index $in end] %c +} -cleanup { + catch {close $f} +} -result 194 test io-12.10 {ReadChars: multibyte chars split} -body { set f [open $path(test1) w] fconfigure $f -translation binary @@ -9177,136 +9143,68 @@ test io-75.5 {invalid utf-8 encoding read is ignored (-encodingprofile tcl8)} -s removeFile io-75.5 } -result 4181 +test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { + set fn [makeFile {} io-75.6] + set f [open $fn w+] + fconfigure $f -encoding binary + # \x81 is invalid in utf-8 + puts -nonewline $f A\x81 + flush $f + seek $f 0 + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 +} -body { + set d [read $f] + binary scan $d H* hd + lappend hd [catch {read $f} msg] + close $f + lappend hd $msg +} -cleanup { + removeFile io-75.6 +} -match glob -result {41 1 {error reading "*": illegal byte sequence}} -apply [list {} { - - - set test { - test io-75.6 {invalid utf-8 encoding read is not ignored (-encodingprofile 
strict)} -setup { - set hd {} - set fn [makeFile {} io-75.6] - set f [open $fn w+] - fconfigure $f -encoding binary - # \x81 is invalid in utf-8 - puts -nonewline $f A\x81 - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -encodingprofile strict - } -body { - set status [catch {read $f} cres copts] - #set d [dict get $copts -result read] - #binary scan $d H* hd - lappend hd $status $cres - } -cleanup { - close $f - removeFile io-75.6 - } -match glob\ - } - - #append test {\ - # -result {41 1 {error reading "*": illegal byte sequence}} - #} - - append test {\ - -result {1 {error reading "*": illegal byte sequence}} - } - - uplevel 1 $test - - set test { - test io-75.7 {invalid utf-8 encoding eof handling (-encodingprofile strict)} -setup { - set hd {} - set fn [makeFile {} io-75.7] - set f [open $fn w+] - fconfigure $f -encoding binary - # \xA1 is invalid in utf-8. -eofchar is not detected, because it comes later. - puts -nonewline $f A\xA1\x1A - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -encodingprofile strict - } -body { - set status [catch {read $f} cres copts] - #set d [dict get $copts -result read] - #binary scan $d H* hd - lappend hd [eof $f] - lappend hd $status - lappend hd $cres - fconfigure $f -encoding iso8859-1 - lappend hd [read $f];# We changed encoding, so now we can read the \xA1 - close $f - set hd - } -cleanup { - removeFile io-75.7 - } -match glob\ - } - - #append test {\ - # -result {41 0 1 {error reading "*": illegal byte sequence} ¡} - #} - - append test {\ - -result {0 1 {error reading "*": illegal byte sequence} ¡} - } - - uplevel 1 $test - - -} [namespace current]] - - - -test io-75.8.incomplete { - incomplete uft-8 char after eof char is not an error (-encodingprofile strict) -} -setup { - set hd {} - set fn [makeFile {} io-75.8] +test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { + set fn [makeFile {} 
io-75.7] set f [open $fn w+] fconfigure $f -encoding binary - # \x81 is invalid and also incomplete utf-8 data, but because the eof - # character \x1A appears first, it's not an error. - puts -nonewline $f A\x1A\x81 + # \xA1 is invalid in utf-8. -eofchar is not detected, because it comes later. + puts -nonewline $f A\xA1\x1A flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -encodingprofile strict + fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 } -body { set d [read $f] binary scan $d H* hd lappend hd [eof $f] - # there should be no error on additional reads - lappend hd [read $f] + lappend hd [catch {read $f} msg] + lappend hd $msg + fconfigure $f -encoding iso8859-1 + lappend hd [read $f];# We changed encoding, so now we can read the \xA1 close $f set hd } -cleanup { - removeFile io-75.8 -} -result {41 1 {}} + removeFile io-75.7 +} -match glob -result {41 0 1 {error reading "*": illegal byte sequence} ¡} - -test io-75.8.invalid {invalid utf-8 after eof char is not an error (-encodingprofile strict)} -setup { - set res {} +test io-75.8 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { set fn [makeFile {} io-75.8] set f [open $fn w+] fconfigure $f -encoding binary - # \xc0\x80 is invalid utf-8 data, but because the eof character \x1A - # appears first, it's not an error. - puts -nonewline $f A\x1a\xc0\x80 + # \x81 is invalid in utf-8, but since \x1A comes first, -eofchar takes precedence. 
+ puts -nonewline $f A\x1A\x81 flush $f seek $f 0 fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -encodingprofile strict } -body { set d [read $f] - foreach char [split $d {}] { - lappend res [format %x [scan $char %c]] - } - lappend res [eof $f] - # there should be no error on additional reads - lappend res [read $f] + binary scan $d H* hd + lappend hd [eof $f] + lappend hd [read $f] close $f - set res + set hd } -cleanup { removeFile io-75.8 } -result {41 1 {}} - test io-75.9 {unrepresentable character write passes and is replaced by ?} -setup { set fn [makeFile {} io-75.9] set f [open $fn w+] @@ -9321,7 +9219,9 @@ test io-75.9 {unrepresentable character write passes and is replaced by ?} -setu removeFile io-75.9 } -match glob -result [list {A} {error writing "*": illegal byte sequence}] - +# Incomplete sequence test. +# This error may IMHO only be detected with the close. +# But the read already returns the incomplete sequence. test io-75.10 {incomplete multibyte encoding read is ignored} -setup { set fn [makeFile {} io-75.10] set f [open $fn w+] @@ -9329,7 +9229,7 @@ test io-75.10 {incomplete multibyte encoding read is ignored} -setup { puts -nonewline $f A\xC0 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -encodingprofile tcl8 -buffering none + fconfigure $f -encoding utf-8 -buffering none } -body { set d [read $f] close $f @@ -9338,135 +9238,39 @@ test io-75.10 {incomplete multibyte encoding read is ignored} -setup { } -cleanup { removeFile io-75.10 } -result 41c0 +# The current result returns the orphan byte as byte. +# This may be expected due to special utf-8 handling. +# As utf-8 has a special treatment in multi-byte decoding, also test another +# one. +test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { + set fn [makeFile {} io-75.11] + set f [open $fn w+] + fconfigure $f -encoding binary + # In shiftjis, \x81 starts a two-byte sequence. 
+ # But 2nd byte \xFF is not allowed + puts -nonewline $f A\x81\xFFA + flush $f + seek $f 0 + fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -strictencoding 1 +} -body { + set d [read $f] + binary scan $d H* hd + lappend hd [catch {set d [read $f]} msg] + lappend hd $msg +} -cleanup { + close $f + removeFile io-75.11 +} -match glob -result {41 1 {error reading "*": illegal byte sequence}} -apply [list {} { - - set test { - test io-75.10_strict {incomplete multibyte encoding read is an error} -setup { - set res {} - set fn [makeFile {} io-75.10] - set f [open $fn w+] - fconfigure $f -encoding binary - puts -nonewline $f A\xC0 - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -encodingprofile strict -buffering none - } -body { - set status [catch {read $f} cres copts] - - #set d [dict get $copts -result read] - #binary scan $d H* hd - #lappend res $hd $cres - lappend res $cres - - chan configure $f -encoding iso8859-1 - - set d [read $f] - binary scan $d H* hd - lappend res $hd - close $f - return $res - } -cleanup { - removeFile io-75.10 - } -match glob\ - } - - #append test {\ - # -result {41 {error reading "*": illegal byte sequence} c0} - #} - - append test {\ - -result {{error reading "*": illegal byte sequence} c0} - } - - uplevel 1 $test - - - - set test { - # As utf-8 has a special treatment in multi-byte decoding, also test another - # one. - test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { - set hd {} - set fn [makeFile {} io-75.11] - set f [open $fn w+] - fconfigure $f -encoding binary - # In shiftjis, \x81 starts a two-byte sequence. 
- # But 2nd byte \xFF is not allowed - puts -nonewline $f A\x81\xFFA - flush $f - seek $f 0 - fconfigure $f -encoding shiftjis -buffering none -eofchar "" \ - -translation lf -encodingprofile strict - } -body { - set status [catch {read $f} cres copts] - #set d [dict get $copts -result read] - #binary scan $d H* hd - lappend hd $status - lappend hd $cres - } -cleanup { - close $f - removeFile io-75.11 - } -match glob - } - - #append test {\ - # -result {41 1 {error reading "*": illegal byte sequence}} - #} - - append test {\ - -result {1 {error reading "*": illegal byte sequence}} - } - - - set test { - test io-75.12 {invalid utf-8 encoding read is an error} -setup { - set hd {} - set res {} - set fn [makeFile {} io-75.12] - set f [open $fn w+] - fconfigure $f -encoding binary - puts -nonewline $f A\x81 - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar {} -translation lf \ - -encodingprofile strict - } -body { - set status [catch {read $f} cres copts] - #set d [dict get $copts -result read] - #binary scan $d H* hd - #lappend res $hd - lappend res $status $cres - return $res - } -cleanup { - catch {close $f} - removeFile io-75.12 - } -match glob\ - } - - #append test {\ - # -result {41 1 {error reading "*": illegal byte sequence}} - #} - - - append test {\ - -result {1 {error reading "*": illegal byte sequence}} - } - - uplevel 1 $test -} [namespace current]] - - -test io-75.12_ignore {invalid utf-8 encoding read is ignored} -setup { +test io-75.12 {invalid utf-8 encoding read is ignored} -setup { set fn [makeFile {} io-75.12] set f [open $fn w+] fconfigure $f -encoding binary puts -nonewline $f A\x81 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar {} \ - -translation lf -encodingprofile tcl8 + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf } -body { set d [read $f] close $f @@ -9475,121 +9279,27 @@ test io-75.12_ignore {invalid utf-8 encoding read is ignored} -setup { } -cleanup { 
removeFile io-75.12 } -result 4181 - - -apply [list {} { - - set test { - test io-75.13 {invalid utf-8 encoding read is not ignored (-encodingprofile strict)} -setup { - set hd {} - set fn [makeFile {} io-75.13] - set f [open $fn w+] - fconfigure $f -encoding binary - # \x81 is invalid in utf-8 - puts -nonewline $f A\x81 - flush $f - seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" \ - -translation lf -encodingprofile strict - } -body { - set status [catch {read $f} cres copts] - #set d [dict get $copts -result read] - #binary scan $d H* hd - lappend hd $status - lappend hd $cres - } -cleanup { - catch {close $f} - removeFile io-75.13 - } -match glob\ - } - - #append test {\ - # -result {41 1 {error reading "*": illegal byte sequence}} - #} - - append test {\ - -result {1 {error reading "*": illegal byte sequence}} - } - - uplevel 1 $test - - set test { - } - -} [namespace current]] - - -test io-75.14 { - invalid utf-8 encoding [gets] continues in non-strict mode after error -} -setup { - set res {} - set fn [makeFile {} io-75.14] +test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { + set fn [makeFile {} io-75.13] set f [open $fn w+] - fconfigure $f -translation binary - # \xc0 is invalid in utf-8 - puts -nonewline $f a\nb\xc0\nc\n + fconfigure $f -encoding binary + # \x81 is invalid in utf-8 + puts -nonewline $f "A\x81" flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar {} -translation lf -encodingprofile strict + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 } -body { - lappend res [gets $f] - set status [catch {gets $f} cres copts] - lappend res $status $cres - chan configure $f -encodingprofile tcl8 - lappend res [gets $f] - lappend res [gets $f] - close $f - return $res + set d [read $f] + binary scan $d H* hd + lappend hd [catch {read $f} msg] + close $f + lappend hd $msg } -cleanup { - removeFile io-75.14 -} -match glob -result {a 1 
{error reading "*": illegal byte sequence} bÀ c} - - - -apply [list {} { - set test { - test io-75.15 {invalid utf-8 encoding strict gets should not hang} -setup { - set res {} - set fn [makeFile {} io-75.15] - set chan [open $fn w+] - fconfigure $chan -encoding binary - # This is not valid UTF-8 - puts $chan hello\nAB\xc0\x40CD\nEFG - close $chan - } -body { - #Now try to read it with [gets] - set chan [open $fn] - fconfigure $chan -encoding utf-8 -encodingprofile strict - lappend res [gets $chan] - set status [catch {gets $chan} cres copts] - lappend res $status $cres - set status [catch {gets $chan} cres copts] - lappend res $status $cres - #lappend res [dict get $copts -result] - chan configur $chan -encoding binary - foreach char [split [read $chan 2] {}] { - lappend res [format %x [scan $char %c]] - } - return $res - } -cleanup { - close $chan - removeFile io-75.15 - } -match glob\ - } + removeFile io-75.13 +} -match glob -result {41 1 {error reading "*": illegal byte sequence}} - #append test {\ - # -result {hello 1 {error reading "*": illegal byte sequence}\ - # 1 {error reading "*": illegal byte sequence} {read AB} c0 40} - #} - - append test {\ - -result {hello 1 {error reading "*": illegal byte sequence}\ - 1 {error reading "*": illegal byte sequence} c0 40} - } - - uplevel 1 $test +# ### ### ### ######### ######### ######### -} [namespace current]] test io-76.0 {channel modes} -setup { -- cgit v0.12 From 6caf48437905145c68bd35e5c12819a86540b235 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 23 Feb 2023 21:31:04 +0000 Subject: -strictencoding 1 -> -encodingprofile strict (since the testcases placed back in previous commit didn't have that yet) --- tests/io.test | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/io.test b/tests/io.test index 4578a93..a8f7bc7 100644 --- a/tests/io.test +++ b/tests/io.test @@ -9143,7 +9143,7 @@ test io-75.5 {invalid utf-8 encoding read is ignored (-encodingprofile tcl8)} -s 
removeFile io-75.5 } -result 4181 -test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { +test io-75.6 {invalid utf-8 encoding read is not ignored (-encodingprofile strict)} -setup { set fn [makeFile {} io-75.6] set f [open $fn w+] fconfigure $f -encoding binary @@ -9151,7 +9151,7 @@ test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -s puts -nonewline $f A\x81 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -encodingprofile strict } -body { set d [read $f] binary scan $d H* hd @@ -9162,7 +9162,7 @@ test io-75.6 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -s removeFile io-75.6 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} -test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { +test io-75.7 {invalid utf-8 encoding eof handling (-encodingprofile strict)} -setup { set fn [makeFile {} io-75.7] set f [open $fn w+] fconfigure $f -encoding binary @@ -9170,7 +9170,7 @@ test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { puts -nonewline $f A\xA1\x1A flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -strictencoding 1 + fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -encodingprofile strict } -body { set d [read $f] binary scan $d H* hd @@ -9185,7 +9185,7 @@ test io-75.7 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { removeFile io-75.7 } -match glob -result {41 0 1 {error reading "*": illegal byte sequence} ¡} -test io-75.8 {invalid utf-8 encoding eof handling (-strictencoding 1)} -setup { +test io-75.8 {invalid utf-8 encoding eof handling (-encodingprofile strict)} -setup { set fn [makeFile {} io-75.8] set f [open $fn w+] fconfigure $f -encoding binary @@ -9252,7 +9252,7 @@ test 
io-75.11 {shiftjis encoding error read results in raw bytes} -setup { puts -nonewline $f A\x81\xFFA flush $f seek $f 0 - fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -strictencoding 1 + fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -encodingprofile strict } -body { set d [read $f] binary scan $d H* hd @@ -9279,7 +9279,7 @@ test io-75.12 {invalid utf-8 encoding read is ignored} -setup { } -cleanup { removeFile io-75.12 } -result 4181 -test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -setup { +test io-75.13 {invalid utf-8 encoding read is not ignored (-encodingprofile strict)} -setup { set fn [makeFile {} io-75.13] set f [open $fn w+] fconfigure $f -encoding binary @@ -9287,7 +9287,7 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} - puts -nonewline $f "A\x81" flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -strictencoding 1 + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -encodingprofile strict } -body { set d [read $f] binary scan $d H* hd -- cgit v0.12 From 485bc2fd887abb2501321c670e66c849da1b026c Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Fri, 24 Feb 2023 03:35:31 +0000 Subject: Bug [40c61a5d10]. Fix syntax error message. --- generic/tclCmdAH.c | 11 ++++++----- tests/cmdAH.test | 4 ++-- tests/safe.test | 8 ++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 93c3416..19a5bc3 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -585,11 +585,12 @@ EncodingConvertParseOptions ( if (objc == 1) { numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ - Tcl_WrongNumArgs( - interp, - 1, - objv, - "? ?-profile profile? ?-failindex var? encoding ? data"); + Tcl_WrongNumArgs(interp, + 1, + objv, + "?-profile profile? ?-failindex var? 
encoding data"); + ((Interp *)interp)->flags |= INTERP_ALTERNATE_WRONG_ARGS; + Tcl_WrongNumArgs(interp, 1, objv, "data"); return TCL_ERROR; } diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 471d46a..ba78c23 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -175,8 +175,8 @@ test cmdAH-3.2 {Tcl_ContinueObjCmd, success} { # encoding command set "numargErrors(encoding system)" {^wrong # args: should be "(encoding |::tcl::encoding::)system \?encoding\?"$} -set "numargErrors(encoding convertfrom)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertfrom \? \?-profile profile\? \?-failindex var\? encoding \? data"$} -set "numargErrors(encoding convertto)" {^wrong # args: should be "(encoding |::tcl::encoding::)convertto \? \?-profile profile\? \?-failindex var\? encoding \? data"$} +set "numargErrors(encoding convertfrom)" {wrong # args: should be "(encoding |::tcl::encoding::)convertfrom \?-profile profile\? \?-failindex var\? encoding data" or "(encoding |::tcl::encoding::)convertfrom data"} +set "numargErrors(encoding convertto)" {wrong # args: should be "(encoding |::tcl::encoding::)convertto \?-profile profile\? \?-failindex var\? encoding data" or "(encoding |::tcl::encoding::)convertto data"} set "numargErrors(encoding names)" {wrong # args: should be "encoding names"} set "numargErrors(encoding profiles)" {wrong # args: should be "encoding profiles"} diff --git a/tests/safe.test b/tests/safe.test index 8c8382a..f3890b7 100644 --- a/tests/safe.test +++ b/tests/safe.test @@ -1473,7 +1473,7 @@ test safe-11.7 {testing safe encoding} -setup { interp eval $i encoding convertfrom } -returnCodes error -cleanup { safe::interpDelete $i -} -result {wrong # args: should be "encoding convertfrom ??-profile profile? ?-failindex var? ?encoding?? data"} +} -result {wrong # args: should be "encoding convertfrom ?-profile profile? ?-failindex var? 
encoding data" or "encoding convertfrom data"} test safe-11.7.1 {testing safe encoding} -setup { set i [safe::interpCreate] } -body { @@ -1482,7 +1482,7 @@ test safe-11.7.1 {testing safe encoding} -setup { } -match glob -cleanup { unset -nocomplain m o safe::interpDelete $i -} -result {wrong # args: should be "encoding convertfrom ??-profile profile? ?-failindex var? ?encoding?? data" +} -result {wrong # args: should be "encoding convertfrom ?-profile profile? ?-failindex var? encoding data" or "encoding convertfrom data" while executing "encoding convertfrom" invoked from within @@ -1495,7 +1495,7 @@ test safe-11.8 {testing safe encoding} -setup { interp eval $i encoding convertto } -returnCodes error -cleanup { safe::interpDelete $i -} -result {wrong # args: should be "encoding convertto ??-profile profile? ?-failindex var? ?encoding?? data"} +} -result {wrong # args: should be "encoding convertto ?-profile profile? ?-failindex var? encoding data" or "encoding convertto data"} test safe-11.8.1 {testing safe encoding} -setup { set i [safe::interpCreate] } -body { @@ -1504,7 +1504,7 @@ test safe-11.8.1 {testing safe encoding} -setup { } -match glob -cleanup { unset -nocomplain m o safe::interpDelete $i -} -result {wrong # args: should be "encoding convertto ??-profile profile? ?-failindex var? ?encoding?? data" +} -result {wrong # args: should be "encoding convertto ?-profile profile? ?-failindex var? 
encoding data" or "encoding convertto data" while executing "encoding convertto" invoked from within -- cgit v0.12 From 854369a67c1719356d036c3fe11e052a7fe62e80 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Fri, 24 Feb 2023 09:35:09 +0000 Subject: Factor out encoding test vectors into separate file so they can be used for file IO tests --- tests/cmdAH.test | 634 +------------------------------------------- tests/encodingVectors.tcl | 655 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 656 insertions(+), 633 deletions(-) create mode 100644 tests/encodingVectors.tcl diff --git a/tests/cmdAH.test b/tests/cmdAH.test index ba78c23..cec93d2 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -180,640 +180,8 @@ set "numargErrors(encoding convertto)" {wrong # args: should be "(encoding |::tc set "numargErrors(encoding names)" {wrong # args: should be "encoding names"} set "numargErrors(encoding profiles)" {wrong # args: should be "encoding profiles"} -set encProfiles {tcl8 strict replace} -set encDefaultProfile tcl8; # Should reflect the default from implementation - -# TODO - valid sequences for different encodings - shiftjis etc. -# Note utf-16, utf-32 missing because they are automatically -# generated based on le/be versions. 
-lappend encValidStrings {*}{ - ascii \u0000 00 {} {Lowest ASCII} - ascii \u007F 7F knownBug {Highest ASCII} - ascii \u007D 7D {} {Brace - just to verify test scripts are escaped correctly} - ascii \u007B 7B {} {Terminating brace - just to verify test scripts are escaped correctly} - - utf-8 \u0000 00 {} {Unicode Table 3.7 Row 1} - utf-8 \u007F 7F {} {Unicode Table 3.7 Row 1} - utf-8 \u0080 C280 {} {Unicode Table 3.7 Row 2} - utf-8 \u07FF DFBF {} {Unicode Table 3.7 Row 2} - utf-8 \u0800 E0A080 {} {Unicode Table 3.7 Row 3} - utf-8 \u0FFF E0BFBF {} {Unicode Table 3.7 Row 3} - utf-8 \u1000 E18080 {} {Unicode Table 3.7 Row 4} - utf-8 \uCFFF ECBFBF {} {Unicode Table 3.7 Row 4} - utf-8 \uD000 ED8080 {} {Unicode Table 3.7 Row 5} - utf-8 \uD7FF ED9FBF {} {Unicode Table 3.7 Row 5} - utf-8 \uE000 EE8080 {} {Unicode Table 3.7 Row 6} - utf-8 \uFFFF EFBFBF {} {Unicode Table 3.7 Row 6} - utf-8 \U10000 F0908080 {} {Unicode Table 3.7 Row 7} - utf-8 \U3FFFF F0BFBFBF {} {Unicode Table 3.7 Row 7} - utf-8 \U40000 F1808080 {} {Unicode Table 3.7 Row 8} - utf-8 \UFFFFF F3BFBFBF {} {Unicode Table 3.7 Row 8} - utf-8 \U100000 F4808080 {} {Unicode Table 3.7 Row 9} - utf-8 \U10FFFF F48FBFBF {} {Unicode Table 3.7 Row 9} - utf-8 A\u03A9\u8A9E\U00010384 41CEA9E8AA9EF0908E84 {} {Unicode 2.5} - - utf-16le \u0000 0000 {} {Lowest code unit} - utf-16le \uD7FF FFD7 {} {Below high surrogate range} - utf-16le \uE000 00E0 {} {Above low surrogate range} - utf-16le \uFFFF FFFF {} {Highest code unit} - utf-16le \U010000 00D800DC {} {First surrogate pair} - utf-16le \U10FFFF FFDBFFDF {} {First surrogate pair} - utf-16le A\u03A9\u8A9E\U00010384 4100A9039E8A00D884DF {} {Unicode 2.5} - - utf-16be \u0000 0000 {} {Lowest code unit} - utf-16be \uD7FF D7FF {} {Below high surrogate range} - utf-16be \uE000 E000 {} {Above low surrogate range} - utf-16be \uFFFF FFFF {} {Highest code unit} - utf-16be \U010000 D800DC00 {} {First surrogate pair} - utf-16be \U10FFFF DBFFDFFF {} {First surrogate pair} - utf-16be 
A\u03A9\u8A9E\U00010384 004103A98A9ED800DF84 {} {Unicode 2.5} - - utf-32le \u0000 00000000 {} {Lowest code unit} - utf-32le \uFFFF FFFF0000 {} {Highest BMP} - utf-32le \U010000 00000100 {} {First supplementary} - utf-32le \U10FFFF ffff1000 {} {Last supplementary} - utf-32le A\u03A9\u8A9E\U00010384 41000000A90300009E8A000084030100 {} {Unicode 2.5} - - utf-32be \u0000 00000000 {} {Lowest code unit} - utf-32be \uFFFF 0000FFFF {} {Highest BMP} - utf-32be \U010000 00010000 {} {First supplementary} - utf-32be \U10FFFF 0010FFFF {} {Last supplementary} - utf-32be A\u03A9\u8A9E\U00010384 00000041000003A900008A9E00010384 {} {Unicode 2.5} -} - -# Invalid byte sequences. These are driven from a table with format -# {encoding bytes profile expectedresult expectedfailindex ctrl comment} -# -# should be unique for test ids to be unique. Note utf-16, -# utf-32 missing because they are automatically generated based on le/be -# versions. Each entry potentially results in generation of multiple tests. -# This is controlled by the ctrl field. This should be a list of -# zero or more of the following: -# solo - the test data is the string itself -# lead - the test data is the string followed by a valid suffix -# tail - the test data is the string preceded by a prefix -# middle - the test data is the string wrapped by a prefix and suffix -# If the ctrl field is empty it is treated as all of the above -# Note if there is any other value by itself, it will cause the test to -# be skipped. This is intentional to skip known bugs. -# TODO - non-UTF encodings - -# ascii - Any byte above 127 is invalid and is mapped -# to the same numeric code point except for the range -# 80-9F which is treated as cp1252. -# This tests the TableToUtfProc code path. 
-lappend encInvalidBytes {*}{ - ascii 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} - ascii 80 replace \uFFFD -1 {} {Smallest invalid byte} - ascii 80 strict {} 0 {} {Smallest invalid byte} - ascii 81 tcl8 \u0081 -1 {knownBug} {map to cp1252} - ascii 82 tcl8 \u201A -1 {knownBug} {map to cp1252} - ascii 83 tcl8 \u0192 -1 {knownBug} {map to cp1252} - ascii 84 tcl8 \u201E -1 {knownBug} {map to cp1252} - ascii 85 tcl8 \u2026 -1 {knownBug} {map to cp1252} - ascii 86 tcl8 \u2020 -1 {knownBug} {map to cp1252} - ascii 87 tcl8 \u2021 -1 {knownBug} {map to cp1252} - ascii 88 tcl8 \u0276 -1 {knownBug} {map to cp1252} - ascii 89 tcl8 \u2030 -1 {knownBug} {map to cp1252} - ascii 8A tcl8 \u0160 -1 {knownBug} {map to cp1252} - ascii 8B tcl8 \u2039 -1 {knownBug} {map to cp1252} - ascii 8C tcl8 \u0152 -1 {knownBug} {map to cp1252} - ascii 8D tcl8 \u008D -1 {knownBug} {map to cp1252} - ascii 8E tcl8 \u017D -1 {knownBug} {map to cp1252} - ascii 8F tcl8 \u008F -1 {knownBug} {map to cp1252} - ascii 90 tcl8 \u0090 -1 {knownBug} {map to cp1252} - ascii 91 tcl8 \u2018 -1 {knownBug} {map to cp1252} - ascii 92 tcl8 \u2019 -1 {knownBug} {map to cp1252} - ascii 93 tcl8 \u201C -1 {knownBug} {map to cp1252} - ascii 94 tcl8 \u201D -1 {knownBug} {map to cp1252} - ascii 95 tcl8 \u2022 -1 {knownBug} {map to cp1252} - ascii 96 tcl8 \u2013 -1 {knownBug} {map to cp1252} - ascii 97 tcl8 \u2014 -1 {knownBug} {map to cp1252} - ascii 98 tcl8 \u02DC -1 {knownBug} {map to cp1252} - ascii 99 tcl8 \u2122 -1 {knownBug} {map to cp1252} - ascii 9A tcl8 \u0161 -1 {knownBug} {map to cp1252} - ascii 9B tcl8 \u203A -1 {knownBug} {map to cp1252} - ascii 9C tcl8 \u0153 -1 {knownBug} {map to cp1252} - ascii 9D tcl8 \u009D -1 {knownBug} {map to cp1252} - ascii 9E tcl8 \u017E -1 {knownBug} {map to cp1252} - ascii 9F tcl8 \u0178 -1 {knownBug} {map to cp1252} - - ascii FF tcl8 \u00FF -1 {} {Largest invalid byte} - ascii FF replace \uFFFD -1 {} {Largest invalid byte} - ascii FF strict {} 0 {} {Largest invalid byte} -} - -# 
utf-8 - valid sequences based on Table 3.7 in the Unicode -# standard. -# -# Code Points First Second Third Fourth Byte -# U+0000..U+007F 00..7F -# U+0080..U+07FF C2..DF 80..BF -# U+0800..U+0FFF E0 A0..BF 80..BF -# U+1000..U+CFFF E1..EC 80..BF 80..BF -# U+D000..U+D7FF ED 80..9F 80..BF -# U+E000..U+FFFF EE..EF 80..BF 80..BF -# U+10000..U+3FFFF F0 90..BF 80..BF 80..BF -# U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF -# U+100000..U+10FFFF F4 80..8F 80..BF 80..BF -# -# Tests below are based on the "gaps" in the above table. Note ascii test -# values are repeated because internally a different code path is used -# (UtfToUtfProc). -# Note C0, C1, F5:FF are invalid bytes ANYWHERE. Exception is C080 -lappend encInvalidBytes {*}{ - utf-8 80 tcl8 \u20AC -1 {} {map to cp1252} - utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte} - utf-8 80 strict {} 0 {} {Smallest invalid byte} - utf-8 81 tcl8 \u0081 -1 {} {map to cp1252} - utf-8 82 tcl8 \u201A -1 {} {map to cp1252} - utf-8 83 tcl8 \u0192 -1 {} {map to cp1252} - utf-8 84 tcl8 \u201E -1 {} {map to cp1252} - utf-8 85 tcl8 \u2026 -1 {} {map to cp1252} - utf-8 86 tcl8 \u2020 -1 {} {map to cp1252} - utf-8 87 tcl8 \u2021 -1 {} {map to cp1252} - utf-8 88 tcl8 \u02C6 -1 {} {map to cp1252} - utf-8 89 tcl8 \u2030 -1 {} {map to cp1252} - utf-8 8A tcl8 \u0160 -1 {} {map to cp1252} - utf-8 8B tcl8 \u2039 -1 {} {map to cp1252} - utf-8 8C tcl8 \u0152 -1 {} {map to cp1252} - utf-8 8D tcl8 \u008D -1 {} {map to cp1252} - utf-8 8E tcl8 \u017D -1 {} {map to cp1252} - utf-8 8F tcl8 \u008F -1 {} {map to cp1252} - utf-8 90 tcl8 \u0090 -1 {} {map to cp1252} - utf-8 91 tcl8 \u2018 -1 {} {map to cp1252} - utf-8 92 tcl8 \u2019 -1 {} {map to cp1252} - utf-8 93 tcl8 \u201C -1 {} {map to cp1252} - utf-8 94 tcl8 \u201D -1 {} {map to cp1252} - utf-8 95 tcl8 \u2022 -1 {} {map to cp1252} - utf-8 96 tcl8 \u2013 -1 {} {map to cp1252} - utf-8 97 tcl8 \u2014 -1 {} {map to cp1252} - utf-8 98 tcl8 \u02DC -1 {} {map to cp1252} - utf-8 99 tcl8 \u2122 -1 {} {map to 
cp1252} - utf-8 9A tcl8 \u0161 -1 {} {map to cp1252} - utf-8 9B tcl8 \u203A -1 {} {map to cp1252} - utf-8 9C tcl8 \u0153 -1 {} {map to cp1252} - utf-8 9D tcl8 \u009D -1 {} {map to cp1252} - utf-8 9E tcl8 \u017E -1 {} {map to cp1252} - utf-8 9F tcl8 \u0178 -1 {} {map to cp1252} - - utf-8 C0 tcl8 \u00C0 -1 {} {C0 is invalid anywhere} - utf-8 C0 strict {} 0 {} {C0 is invalid anywhere} - utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere} - utf-8 C080 tcl8 \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} - utf-8 C080 strict {} 0 {} {C080 -> invalid} - utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char} - utf-8 C0A2 tcl8 \u00C0\u00A2 -1 {} {websec.github.io - A} - utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A} - utf-8 C0A2 strict {} 0 {} {websec.github.io - A} - utf-8 C0A7 tcl8 \u00C0\u00A7 -1 {} {websec.github.io - double quote} - utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote} - utf-8 C0A7 strict {} 0 {} {websec.github.io - double quote} - utf-8 C0AE tcl8 \u00C0\u00AE -1 {} {websec.github.io - full stop} - utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop} - utf-8 C0AE strict {} 0 {} {websec.github.io - full stop} - utf-8 C0AF tcl8 \u00C0\u00AF -1 {} {websec.github.io - solidus} - utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus} - utf-8 C0AF strict {} 0 {} {websec.github.io - solidus} - - utf-8 C1 tcl8 \u00C1 -1 {} {C1 is invalid everywhere} - utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere} - utf-8 C1 strict {} 0 {} {C1 is invalid everywhere} - utf-8 C181 tcl8 \u00C1\u0081 -1 {} {websec.github.io - base test (A)} - utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)} - utf-8 C181 strict {} 0 {} {websec.github.io - base test (A)} - utf-8 C19C tcl8 \u00C1\u0153 -1 {} {websec.github.io - reverse solidus} - utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus} - utf-8 C19C strict {} 0 {} {websec.github.io - reverse solidus} 
- - utf-8 C2 tcl8 \u00C2 -1 {} {Missing trail byte} - utf-8 C2 replace \uFFFD -1 {} {Missing trail byte} - utf-8 C2 strict {} 0 {} {Missing trail byte} - utf-8 C27F tcl8 \u00C2\x7F -1 {} {Trail byte must be 80:BF} - utf-8 C27F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} - utf-8 C27F strict {} 0 {} {Trail byte must be 80:BF} - utf-8 DF tcl8 \u00DF -1 {} {Missing trail byte} - utf-8 DF replace \uFFFD -1 {} {Missing trail byte} - utf-8 DF strict {} 0 {} {Missing trail byte} - utf-8 DF7F tcl8 \u00DF\x7F -1 {} {Trail byte must be 80:BF} - utf-8 DF7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} - utf-8 DF7F strict {} 0 {} {Trail byte must be 80:BF} - utf-8 DFE0A080 tcl8 \u00DF\u0800 -1 {} {Invalid trail byte is start of valid sequence} - utf-8 DFE0A080 replace \uFFFD\u0800 -1 {} {Invalid trail byte is start of valid sequence} - utf-8 DFE0A080 strict {} 0 {} {Invalid trail byte is start of valid sequence} - - utf-8 E0 tcl8 \u00E0 -1 {} {Missing trail byte} - utf-8 E0 replace \uFFFD -1 {} {Missing trail byte} - utf-8 E0 strict {} 0 {} {Missing trail byte} - utf-8 E080 tcl8 \u00E0\u20AC -1 {} {First trail byte must be A0:BF} - utf-8 E080 replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} - utf-8 E080 strict {} 0 {} {First trail byte must be A0:BF} - utf-8 E0819C tcl8 \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus} - utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus} - utf-8 E0819C strict {} 0 {} {websec.github.io - reverse solidus} - utf-8 E09F tcl8 \u00E0\u0178 -1 {} {First trail byte must be A0:BF} - utf-8 E09F replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} - utf-8 E09F strict {} 0 {} {First trail byte must be A0:BF} - utf-8 E0A0 tcl8 \u00E0\u00A0 -1 {} {Missing second trail byte} - utf-8 E0A0 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 E0A0 strict {} 0 {} {Missing second trail byte} - utf-8 E0BF tcl8 \u00E0\u00BF -1 {} {Missing second trail byte} - utf-8 E0BF replace 
\uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 E0BF strict {} 0 {} {Missing second trail byte} - utf-8 E0A07F tcl8 \u00E0\u00A0\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 E0A07F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 E0A07F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 E0BF7F tcl8 \u00E0\u00BF\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 E0BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 E0BF7F strict {} 0 {} {Second trail byte must be 80:BF} - - utf-8 E1 tcl8 \u00E1 -1 {} {Missing trail byte} - utf-8 E1 replace \uFFFD -1 {} {Missing trail byte} - utf-8 E1 strict {} 0 {} {Missing trail byte} - utf-8 E17F tcl8 \u00E1\x7F -1 {} {Trail byte must be 80:BF} - utf-8 E17F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} - utf-8 E17F strict {} 0 {} {Trail byte must be 80:BF} - utf-8 E181 tcl8 \u00E1\u0081 -1 {} {Missing second trail byte} - utf-8 E181 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 E181 strict {} 0 {} {Missing second trail byte} - utf-8 E1BF tcl8 \u00E1\u00BF -1 {} {Missing second trail byte} - utf-8 E1BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 E1BF strict {} 0 {} {Missing second trail byte} - utf-8 E1807F tcl8 \u00E1\u20AC\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 E1807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 E1807F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 E1BF7F tcl8 \u00E1\u00BF\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 E1BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 E1BF7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 EC tcl8 \u00EC -1 {} {Missing trail byte} - utf-8 EC replace \uFFFD -1 {} {Missing trail byte} - utf-8 EC strict {} 0 {} {Missing trail byte} - utf-8 EC7F tcl8 \u00EC\x7F -1 {} {Trail byte must be 80:BF} - utf-8 EC7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} - utf-8 
EC7F strict {} 0 {} {Trail byte must be 80:BF} - utf-8 EC81 tcl8 \u00EC\u0081 -1 {} {Missing second trail byte} - utf-8 EC81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 EC81 strict {} 0 {} {Missing second trail byte} - utf-8 ECBF tcl8 \u00EC\u00BF -1 {} {Missing second trail byte} - utf-8 ECBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 ECBF strict {} 0 {} {Missing second trail byte} - utf-8 EC807F tcl8 \u00EC\u20AC\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 EC807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 EC807F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 ECBF7F tcl8 \u00EC\u00BF\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 ECBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 ECBF7F strict {} 0 {} {Second trail byte must be 80:BF} - - utf-8 ED tcl8 \u00ED -1 {} {Missing trail byte} - utf-8 ED replace \uFFFD -1 {} {Missing trail byte} - utf-8 ED strict {} 0 {} {Missing trail byte} - utf-8 ED7F tcl8 \u00ED\u7F -1 {} {First trail byte must be 80:9F} - utf-8 ED7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:9F} - utf-8 ED7F strict {} 0 {} {First trail byte must be 80:9F} - utf-8 EDA0 tcl8 \u00ED\u00A0 -1 {} {First trail byte must be 80:9F} - utf-8 EDA0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:9F} - utf-8 EDA0 strict {} 0 {} {First trail byte must be 80:9F} - utf-8 ED81 tcl8 \u00ED\u0081 -1 {} {Missing second trail byte} - utf-8 ED81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 ED81 strict {} 0 {} {Missing second trail byte} - utf-8 EDBF tcl8 \u00ED\u00BF -1 {} {Missing second trail byte} - utf-8 EDBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 EDBF strict {} 0 {} {Missing second trail byte} - utf-8 ED807F tcl8 \u00ED\u20AC\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 ED807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 ED807F strict {} 0 
{} {Second trail byte must be 80:BF} - utf-8 ED9F7F tcl8 \u00ED\u0178\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 ED9F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 ED9F7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 EDA080 tcl8 \uD800 -1 {} {High surrogate} - utf-8 EDA080 replace \uFFFD -1 {} {High surrogate} - utf-8 EDA080 strict {} 0 {} {High surrogate} - utf-8 EDAFBF tcl8 \uDBFF -1 {} {High surrogate} - utf-8 EDAFBF replace \uFFFD -1 {} {High surrogate} - utf-8 EDAFBF strict {} 0 {} {High surrogate} - utf-8 EDB080 tcl8 \uDC00 -1 {} {Low surrogate} - utf-8 EDB080 replace \uFFFD -1 {} {Low surrogate} - utf-8 EDB080 strict {} 0 {} {Low surrogate} - utf-8 EDBFBF tcl8 \uDFFF -1 {} {Low surrogate} - utf-8 EDBFBF replace \uFFFD -1 {} {Low surrogate} - utf-8 EDBFBF strict {} 0 {} {Low surrogate} - utf-8 EDA080EDB080 tcl8 \U00010000 -1 {} {High low surrogate pair} - utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {} {High low surrogate pair} - utf-8 EDA080EDB080 strict {} 0 {} {High low surrogate pair} - utf-8 EDAFBFEDBFBF tcl8 \U0010FFFF -1 {} {High low surrogate pair} - utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {} {High low surrogate pair} - utf-8 EDAFBFEDBFBF strict {} 0 {} {High low surrogate pair} - - utf-8 EE tcl8 \u00EE -1 {} {Missing trail byte} - utf-8 EE replace \uFFFD -1 {} {Missing trail byte} - utf-8 EE strict {} 0 {} {Missing trail byte} - utf-8 EE7F tcl8 \u00EE\u7F -1 {} {First trail byte must be 80:BF} - utf-8 EE7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:BF} - utf-8 EE7F strict {} 0 {} {First trail byte must be 80:BF} - utf-8 EED0 tcl8 \u00EE\u00D0 -1 {} {First trail byte must be 80:BF} - utf-8 EED0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} - utf-8 EED0 strict {} 0 {} {First trail byte must be 80:BF} - utf-8 EE81 tcl8 \u00EE\u0081 -1 {} {Missing second trail byte} - utf-8 EE81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 EE81 strict {} 0 {} {Missing second 
trail byte} - utf-8 EEBF tcl8 \u00EE\u00BF -1 {} {Missing second trail byte} - utf-8 EEBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 EEBF strict {} 0 {} {Missing second trail byte} - utf-8 EE807F tcl8 \u00EE\u20AC\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 EE807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 EE807F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 EEBF7F tcl8 \u00EE\u00BF\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 EEBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 EEBF7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 EF tcl8 \u00EF -1 {} {Missing trail byte} - utf-8 EF replace \uFFFD -1 {} {Missing trail byte} - utf-8 EF strict {} 0 {} {Missing trail byte} - utf-8 EF7F tcl8 \u00EF\u7F -1 {} {First trail byte must be 80:BF} - utf-8 EF7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:BF} - utf-8 EF7F strict {} 0 {} {First trail byte must be 80:BF} - utf-8 EFD0 tcl8 \u00EF\u00D0 -1 {} {First trail byte must be 80:BF} - utf-8 EFD0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} - utf-8 EFD0 strict {} 0 {} {First trail byte must be 80:BF} - utf-8 EF81 tcl8 \u00EF\u0081 -1 {} {Missing second trail byte} - utf-8 EF81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 EF81 strict {} 0 {} {Missing second trail byte} - utf-8 EFBF tcl8 \u00EF\u00BF -1 {} {Missing second trail byte} - utf-8 EFBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 EFBF strict {} 0 {} {Missing second trail byte} - utf-8 EF807F tcl8 \u00EF\u20AC\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 EF807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 EF807F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 EFBF7F tcl8 \u00EF\u00BF\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 EFBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 EFBF7F strict {} 0 {} 
{Second trail byte must be 80:BF} - - utf-8 F0 tcl8 \u00F0 -1 {} {Missing trail byte} - utf-8 F0 replace \uFFFD -1 {} {Missing trail byte} - utf-8 F0 strict {} 0 {} {Missing trail byte} - utf-8 F080 tcl8 \u00F0\u20AC -1 {} {First trail byte must be 90:BF} - utf-8 F080 replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} - utf-8 F080 strict {} 0 {} {First trail byte must be 90:BF} - utf-8 F08F tcl8 \u00F0\u8F -1 {} {First trail byte must be 90:BF} - utf-8 F08F replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} - utf-8 F08F strict {} 0 {} {First trail byte must be 90:BF} - utf-8 F0D0 tcl8 \u00F0\u00D0 -1 {} {First trail byte must be 90:BF} - utf-8 F0D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 90:BF} - utf-8 F0D0 strict {} 0 {} {First trail byte must be 90:BF} - utf-8 F090 tcl8 \u00F0\u0090 -1 {} {Missing second trail byte} - utf-8 F090 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 F090 strict {} 0 {} {Missing second trail byte} - utf-8 F0BF tcl8 \u00F0\u00BF -1 {} {Missing second trail byte} - utf-8 F0BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 F0BF strict {} 0 {} {Missing second trail byte} - utf-8 F0907F tcl8 \u00F0\u0090\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 F0907F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 F0907F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F0BF7F tcl8 \u00F0\u00BF\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 F0BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 F0BF7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F090BF tcl8 \u00F0\u0090\u00BF -1 {} {Missing third trail byte} - utf-8 F090BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} - utf-8 F090BF strict {} 0 {} {Missing third trail byte} - utf-8 F0BF81 tcl8 \u00F0\u00BF\u0081 -1 {} {Missing third trail byte} - utf-8 F0BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} - utf-8 F0BF81 strict {} 0 
{} {Missing third trail byte} - utf-8 F0BF807F tcl8 \u00F0\u00BF\u20AC\x7F -1 {} {Third trail byte must be 80:BF} - utf-8 F0BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} - utf-8 F0BF817F strict {} 0 {} {Third trail byte must be 80:BF} - utf-8 F090BFD0 tcl8 \u00F0\u0090\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} - utf-8 F090BFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} - utf-8 F090BFD0 strict {} 0 {} {Third trail byte must be 80:BF} - - utf-8 F1 tcl8 \u00F1 -1 {} {Missing trail byte} - utf-8 F1 replace \uFFFD -1 {} {Missing trail byte} - utf-8 F1 strict {} 0 {} {Missing trail byte} - utf-8 F17F tcl8 \u00F1\u7F -1 {} {First trail byte must be 80:BF} - utf-8 F17F replace \uFFFD -1 {knownW3C} {First trail byte must be 80:BF} - utf-8 F17F strict {} 0 {} {First trail byte must be 80:BF} - utf-8 F1D0 tcl8 \u00F1\u00D0 -1 {} {First trail byte must be 80:BF} - utf-8 F1D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} - utf-8 F1D0 strict {} 0 {} {First trail byte must be 80:BF} - utf-8 F180 tcl8 \u00F1\u20AC -1 {} {Missing second trail byte} - utf-8 F180 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 F180 strict {} 0 {} {Missing second trail byte} - utf-8 F1BF tcl8 \u00F1\u00BF -1 {} {Missing second trail byte} - utf-8 F1BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 F1BF strict {} 0 {} {Missing second trail byte} - utf-8 F1807F tcl8 \u00F1\u20AC\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 F1807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 F1807F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F1BF7F tcl8 \u00F1\u00BF\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 F1BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 F1BF7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F180BF tcl8 \u00F1\u20AC\u00BF -1 {} {Missing third trail byte} - utf-8 F180BF replace \uFFFD -1 {knownW3C} 
{Missing third trail byte} - utf-8 F180BF strict {} 0 {} {Missing third trail byte} - utf-8 F1BF81 tcl8 \u00F1\u00BF\u0081 -1 {} {Missing third trail byte} - utf-8 F1BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} - utf-8 F1BF81 strict {} 0 {} {Missing third trail byte} - utf-8 F1BF807F tcl8 \u00F1\u00BF\u20AC\x7F -1 {} {Third trail byte must be 80:BF} - utf-8 F1BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} - utf-8 F1BF817F strict {} 0 {} {Third trail byte must be 80:BF} - utf-8 F180BFD0 tcl8 \u00F1\u20AC\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} - utf-8 F180BFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} - utf-8 F180BFD0 strict {} 0 {} {Third trail byte must be 80:BF} - utf-8 F3 tcl8 \u00F3 -1 {} {Missing trail byte} - utf-8 F3 replace \uFFFD -1 {} {Missing trail byte} - utf-8 F3 strict {} 0 {} {Missing trail byte} - utf-8 F37F tcl8 \u00F3\x7F -1 {} {First trail byte must be 80:BF} - utf-8 F37F replace \uFFFD -1 {knownW3C} {First trail byte must be 80:BF} - utf-8 F37F strict {} 0 {} {First trail byte must be 80:BF} - utf-8 F3D0 tcl8 \u00F3\u00D0 -1 {} {First trail byte must be 80:BF} - utf-8 F3D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} - utf-8 F3D0 strict {} 0 {} {First trail byte must be 80:BF} - utf-8 F380 tcl8 \u00F3\u20AC -1 {} {Missing second trail byte} - utf-8 F380 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 F380 strict {} 0 {} {Missing second trail byte} - utf-8 F3BF tcl8 \u00F3\u00BF -1 {} {Missing second trail byte} - utf-8 F3BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 F3BF strict {} 0 {} {Missing second trail byte} - utf-8 F3807F tcl8 \u00F3\u20AC\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 F3807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 F3807F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F3BF7F tcl8 \u00F3\u00BF\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 
F3BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 F3BF7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F380BF tcl8 \u00F3\u20AC\u00BF -1 {} {Missing third trail byte} - utf-8 F380BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} - utf-8 F380BF strict {} 0 {} {Missing third trail byte} - utf-8 F3BF81 tcl8 \u00F3\u00BF\u0081 -1 {} {Missing third trail byte} - utf-8 F3BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} - utf-8 F3BF81 strict {} 0 {} {Missing third trail byte} - utf-8 F3BF807F tcl8 \u00F3\u00BF\u20AC\x7F -1 {} {Third trail byte must be 80:BF} - utf-8 F3BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} - utf-8 F3BF817F strict {} 0 {} {Third trail byte must be 80:BF} - utf-8 F380BFD0 tcl8 \u00F3\u20AC\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} - utf-8 F380BFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} - utf-8 F380BFD0 strict {} 0 {} {Third trail byte must be 80:BF} - - utf-8 F4 tcl8 \u00F4 -1 {} {Missing trail byte} - utf-8 F4 replace \uFFFD -1 {} {Missing trail byte} - utf-8 F4 strict {} 0 {} {Missing trail byte} - utf-8 F47F tcl8 \u00F4\u7F -1 {} {First trail byte must be 80:8F} - utf-8 F47F replace \uFFFD\u7F -1 {knownW3C} {First trail byte must be 80:8F} - utf-8 F47F strict {} 0 {} {First trail byte must be 80:8F} - utf-8 F490 tcl8 \u00F4\u0090 -1 {} {First trail byte must be 80:8F} - utf-8 F490 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:8F} - utf-8 F490 strict {} 0 {} {First trail byte must be 80:8F} - utf-8 F480 tcl8 \u00F4\u20AC -1 {} {Missing second trail byte} - utf-8 F480 replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 F480 strict {} 0 {} {Missing second trail byte} - utf-8 F48F tcl8 \u00F4\u008F -1 {} {Missing second trail byte} - utf-8 F48F replace \uFFFD -1 {knownW3C} {Missing second trail byte} - utf-8 F48F strict {} 0 {} {Missing second trail byte} - utf-8 F4807F tcl8 \u00F4\u20AC\x7F -1 {} {Second 
trail byte must be 80:BF} - utf-8 F4807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 F4807F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F48F7F tcl8 \u00F4\u008F\x7F -1 {} {Second trail byte must be 80:BF} - utf-8 F48F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} - utf-8 F48F7F strict {} 0 {} {Second trail byte must be 80:BF} - utf-8 F48081 tcl8 \u00F4\u20AC\u0081 -1 {} {Missing third trail byte} - utf-8 F48081 replace \uFFFD -1 {knownW3C} {Missing third trail byte} - utf-8 F48081 strict {} 0 {} {Missing third trail byte} - utf-8 F48F81 tcl8 \u00F4\u008F\u0081 -1 {} {Missing third trail byte} - utf-8 F48F81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} - utf-8 F48F81 strict {} 0 {} {Missing third trail byte} - utf-8 F481817F tcl8 \u00F4\u0081\u0081\x7F -1 {} {Third trail byte must be 80:BF} - utf-8 F480817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} - utf-8 F480817F strict {} 0 {} {Third trail byte must be 80:BF} - utf-8 F48FBFD0 tcl8 \u00F4\u008F\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} - utf-8 F48FBFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} - utf-8 F48FBFD0 strict {} 0 {} {Third trail byte must be 80:BF} - - utf-8 F5 tcl8 \u00F5 -1 {} {F5:FF are invalid everywhere} - utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere} - utf-8 F5 strict {} 0 {} {F5:FF are invalid everywhere} - utf-8 FF tcl8 \u00FF -1 {} {F5:FF are invalid everywhere} - utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere} - utf-8 FF strict {} 0 {} {F5:FF are invalid everywhere} - - utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8} - utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3-9} - utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10} - utf-8 E180E2F09192F1BF30 
replace \uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3.11} -} - -# utf16-le and utf16-be test cases. Note utf16 cases are automatically generated -# based on these depending on platform endianness. Note truncated tests can only -# happen when the sequence is at the end (including by itself) Thus {solo tail} -# in some cases. -lappend encInvalidBytes {*}{ - utf-16le 41 tcl8 \uFFFD -1 {solo tail} {Truncated} - utf-16le 41 replace \uFFFD -1 {solo tail} {Truncated} - utf-16le 41 strict {} 0 {solo tail} {Truncated} - utf-16le 00D8 tcl8 \uD800 -1 {} {Missing low surrogate} - utf-16le 00D8 replace \uFFFD -1 {knownBug} {Missing low surrogate} - utf-16le 00D8 strict {} 0 {knownBug} {Missing low surrogate} - utf-16le 00DC tcl8 \uDC00 -1 {} {Missing high surrogate} - utf-16le 00DC replace \uFFFD -1 {knownBug} {Missing high surrogate} - utf-16le 00DC strict {} 0 {knownBug} {Missing high surrogate} - - utf-16be 41 tcl8 \uFFFD -1 {solo tail} {Truncated} - utf-16be 41 replace \uFFFD -1 {solo tail} {Truncated} - utf-16be 41 strict {} 0 {solo tail} {Truncated} - utf-16be D800 tcl8 \uD800 -1 {} {Missing low surrogate} - utf-16be D800 replace \uFFFD -1 {knownBug} {Missing low surrogate} - utf-16be D800 strict {} 0 {knownBug} {Missing low surrogate} - utf-16be DC00 tcl8 \uDC00 -1 {} {Missing high surrogate} - utf-16be DC00 replace \uFFFD -1 {knownBug} {Missing high surrogate} - utf-16be DC00 strict {} 0 {knownBug} {Missing high surrogate} -} - -# utf32-le and utf32-be test cases. Note utf32 cases are automatically generated -# based on these depending on platform endianness. Note truncated tests can only -# happen when the sequence is at the end (including by itself) Thus {solo tail} -# in some cases. 
-lappend encInvalidBytes {*}{ - utf-32le 41 tcl8 \uFFFD -1 {solo tail} {Truncated} - utf-32le 41 replace \uFFFD -1 {solo} {Truncated} - utf-32le 41 strict {} 0 {solo tail} {Truncated} - utf-32le 4100 tcl8 \uFFFD -1 {solo tail} {Truncated} - utf-32le 4100 replace \uFFFD -1 {solo} {Truncated} - utf-32le 4100 strict {} 0 {solo tail} {Truncated} - utf-32le 410000 tcl8 \uFFFD -1 {solo tail} {Truncated} - utf-32le 410000 replace \uFFFD -1 {solo} {Truncated} - utf-32le 410000 strict {} 0 {solo tail} {Truncated} - utf-32le 00D80000 tcl8 \uD800 -1 {} {High-surrogate} - utf-32le 00D80000 replace \uFFFD -1 {} {High-surrogate} - utf-32le 00D80000 strict {} 0 {} {High-surrogate} - utf-32le 00DC0000 tcl8 \uDC00 -1 {} {Low-surrogate} - utf-32le 00DC0000 replace \uFFFD -1 {} {Low-surrogate} - utf-32le 00DC0000 strict {} 0 {} {Low-surrogate} - utf-32le 00D8000000DC0000 tcl8 \uD800\uDC00 -1 {} {High-low-surrogate-pair} - utf-32le 00D8000000DC0000 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair} - utf-32le 00D8000000DC0000 strict {} 0 {} {High-low-surrogate-pair} - utf-32le 00001100 tcl8 \UFFFD -1 {} {Out of range} - utf-32le 00001100 replace \UFFFD -1 {} {Out of range} - utf-32le 00001100 strict {} 0 {} {Out of range} - utf-32le FFFFFFFF tcl8 \UFFFD -1 {} {Out of range} - utf-32le FFFFFFFF replace \UFFFD -1 {} {Out of range} - utf-32le FFFFFFFF strict {} 0 {} {Out of range} - - utf-32be 41 tcl8 \uFFFD -1 {solo tail} {Truncated} - utf-32be 41 replace \uFFFD -1 {solo tail} {Truncated} - utf-32be 41 strict {} 0 {solo tail} {Truncated} - utf-32be 0041 tcl8 \uFFFD -1 {solo tail} {Truncated} - utf-32be 0041 replace \uFFFD -1 {solo} {Truncated} - utf-32be 0041 strict {} 0 {solo tail} {Truncated} - utf-32be 000041 tcl8 \uFFFD -1 {solo tail} {Truncated} - utf-32be 000041 replace \uFFFD -1 {solo} {Truncated} - utf-32be 000041 strict {} 0 {solo tail} {Truncated} - utf-32be 0000D800 tcl8 \uD800 -1 {} {High-surrogate} - utf-32be 0000D800 replace \uFFFD -1 {} {High-surrogate} - utf-32be 
0000D800 strict {} 0 {} {High-surrogate} - utf-32be 0000DC00 tcl8 \uDC00 -1 {} {Low-surrogate} - utf-32be 0000DC00 replace \uFFFD -1 {} {Low-surrogate} - utf-32be 0000DC00 strict {} 0 {} {Low-surrogate} - utf-32be 0000D8000000DC00 tcl8 \uD800\uDC00 -1 {} {High-low-surrogate-pair} - utf-32be 0000D8000000DC00 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair} - utf-32be 0000D8000000DC00 strict {} 0 {} {High-low-surrogate-pair} - utf-32be 00110000 tcl8 \UFFFD -1 {} {Out of range} - utf-32be 00110000 replace \UFFFD -1 {} {Out of range} - utf-32be 00110000 strict {} 0 {} {Out of range} - utf-32be FFFFFFFF tcl8 \UFFFD -1 {} {Out of range} - utf-32be FFFFFFFF replace \UFFFD -1 {} {Out of range} - utf-32be FFFFFFFF strict {} 0 {} {Out of range} -} - - -# Strings that cannot be encoded for specific encoding / profiles -# {encoding string profile exptedresult expectedfailindex ctrl comment} -# should be unique for test ids to be unique. -# Note utf-16, utf-32 missing because they are automatically -# generated based on le/be versions. -# Each entry potentially results in generation of multiple tests. -# This is controlled by the ctrl field. This should be a list of -# zero or more of the following: -# solo - the test data is the string itself -# lead - the test data is the string followed by a valid suffix -# tail - the test data is the string preceded by a prefix -# middle - the test data is the string wrapped by a prefix and suffix -# If the ctrl field is empty it is treated as all of the above -# Note if there is any other value by itself, it will cause the test to -# be skipped. This is intentional to skip known bugs. 
-# TODO - other encodings -# TODO - out of range code point (note cannot be generated by \U notation) -lappend encUnencodableStrings {*}{ - ascii \u00e0 tcl8 3f -1 {} {unencodable} - ascii \u00e0 strict {} 0 {} {unencodable} - - iso8859-1 \u0141 tcl8 3f -1 {} unencodable - iso8859-1 \u0141 strict {} 0 {} unencodable - - utf-8 \uD800 tcl8 eda080 -1 {} High-surrogate - utf-8 \uD800 strict {} 0 {} High-surrogate - utf-8 \uDC00 tcl8 edb080 -1 {} High-surrogate - utf-8 \uDC00 strict {} 0 {} High-surrogate -} +source [file join [file dirname [info script]] encodingVectors.tcl] -# Generated tests comparing against ICU -# TODO - commented out for now as generating a lot of mismatches. -# source [file join [file dirname [info script]] icuUcmTests.tcl] # Maps utf-{16,32}{le,be} to utf-16, utf-32 and # others to "". Used to test utf-16, utf-32 based diff --git a/tests/encodingVectors.tcl b/tests/encodingVectors.tcl new file mode 100644 index 0000000..986e221 --- /dev/null +++ b/tests/encodingVectors.tcl @@ -0,0 +1,655 @@ +# This file contains test vectors for verifying various encodings. They are +# stored in a common file so that they can be sourced into the various test +# modules that are dependent on encodings. This file contains statically defined +# test vectors. In addition, it sources the ICU-generated test vectors from +# icuUcmTests.tcl. +# +# Note that sourcing the file will reinitialize any existing encoding test +# vectors. +# + +# List of defined encoding profiles +set encProfiles {tcl8 strict replace} +set encDefaultProfile tcl8; # Should reflect the default from implementation + +# encValidStrings - Table of valid strings. +# +# Each row is {ENCODING STR BYTES CTRL COMMENT} +# The pair {ENCODING STR} should be unique for generated test ids to be unique. +# STR is a string that can be encoded in the encoding ENCODING resulting +# in the byte sequence BYTES. The CTRL field is a list that controls test +# generation.
It may contain zero or more of `solo`, `lead`, `tail` and +# `middle` indicating that the generated tests should include the string +# by itself, as the lead of a longer string, as the tail of a longer string +# and in the middle of a longer string. If CTRL is empty, it is treated as +# containing all four of the above. The CTRL field may also contain the +# words knownBug or knownW3C which will cause the test generation for that +# vector to be skipped. +# +# utf-16, utf-32 missing because they are automatically +# generated based on le/be versions. +set encValidStrings {}; # Reset the table + +lappend encValidStrings {*}{ + ascii \u0000 00 {} {Lowest ASCII} + ascii \u007F 7F knownBug {Highest ASCII} + ascii \u007D 7D {} {Brace - just to verify test scripts are escaped correctly} + ascii \u007B 7B {} {Terminating brace - just to verify test scripts are escaped correctly} + + utf-8 \u0000 00 {} {Unicode Table 3.7 Row 1} + utf-8 \u007F 7F {} {Unicode Table 3.7 Row 1} + utf-8 \u0080 C280 {} {Unicode Table 3.7 Row 2} + utf-8 \u07FF DFBF {} {Unicode Table 3.7 Row 2} + utf-8 \u0800 E0A080 {} {Unicode Table 3.7 Row 3} + utf-8 \u0FFF E0BFBF {} {Unicode Table 3.7 Row 3} + utf-8 \u1000 E18080 {} {Unicode Table 3.7 Row 4} + utf-8 \uCFFF ECBFBF {} {Unicode Table 3.7 Row 4} + utf-8 \uD000 ED8080 {} {Unicode Table 3.7 Row 5} + utf-8 \uD7FF ED9FBF {} {Unicode Table 3.7 Row 5} + utf-8 \uE000 EE8080 {} {Unicode Table 3.7 Row 6} + utf-8 \uFFFF EFBFBF {} {Unicode Table 3.7 Row 6} + utf-8 \U10000 F0908080 {} {Unicode Table 3.7 Row 7} + utf-8 \U3FFFF F0BFBFBF {} {Unicode Table 3.7 Row 7} + utf-8 \U40000 F1808080 {} {Unicode Table 3.7 Row 8} + utf-8 \UFFFFF F3BFBFBF {} {Unicode Table 3.7 Row 8} + utf-8 \U100000 F4808080 {} {Unicode Table 3.7 Row 9} + utf-8 \U10FFFF F48FBFBF {} {Unicode Table 3.7 Row 9} + utf-8 A\u03A9\u8A9E\U00010384 41CEA9E8AA9EF0908E84 {} {Unicode 2.5} + + utf-16le \u0000 0000 {} {Lowest code unit} + utf-16le \uD7FF FFD7 {} {Below high surrogate range} + utf-16le 
\uE000 00E0 {} {Above low surrogate range} + utf-16le \uFFFF FFFF {} {Highest code unit} + utf-16le \U010000 00D800DC {} {First surrogate pair} + utf-16le \U10FFFF FFDBFFDF {} {First surrogate pair} + utf-16le A\u03A9\u8A9E\U00010384 4100A9039E8A00D884DF {} {Unicode 2.5} + + utf-16be \u0000 0000 {} {Lowest code unit} + utf-16be \uD7FF D7FF {} {Below high surrogate range} + utf-16be \uE000 E000 {} {Above low surrogate range} + utf-16be \uFFFF FFFF {} {Highest code unit} + utf-16be \U010000 D800DC00 {} {First surrogate pair} + utf-16be \U10FFFF DBFFDFFF {} {First surrogate pair} + utf-16be A\u03A9\u8A9E\U00010384 004103A98A9ED800DF84 {} {Unicode 2.5} + + utf-32le \u0000 00000000 {} {Lowest code unit} + utf-32le \uFFFF FFFF0000 {} {Highest BMP} + utf-32le \U010000 00000100 {} {First supplementary} + utf-32le \U10FFFF ffff1000 {} {Last supplementary} + utf-32le A\u03A9\u8A9E\U00010384 41000000A90300009E8A000084030100 {} {Unicode 2.5} + + utf-32be \u0000 00000000 {} {Lowest code unit} + utf-32be \uFFFF 0000FFFF {} {Highest BMP} + utf-32be \U010000 00010000 {} {First supplementary} + utf-32be \U10FFFF 0010FFFF {} {Last supplementary} + utf-32be A\u03A9\u8A9E\U00010384 00000041000003A900008A9E00010384 {} {Unicode 2.5} +} + +# encInvalidBytes - Table of invalid byte sequences +# These are byte sequences that should not appear for an encoding. Each row is +# of the form +# {ENCODING BYTES PROFILE EXPECTEDRESULT FAILINDEX CTRL COMMENT} +# The triple {ENCODING BYTES PROFILE} should be unique for test ids to be +# unique. BYTES is a byte sequence that is invalid. EXPECTEDRESULT is the +# expected string when the bytes are decoded using the PROFILE profile. +# FAILINDEX gives the expected index of the invalid byte under that profile. The +# CTRL field is a list that controls test generation. It may contain zero or +# more of `solo`, `lead`, `tail` and `middle` indicating that the generated tests +# should include the bytes by themselves, or as the lead, tail or middle of a longer sequence. If empty, it is treated +# as containing all four of the above.
The CTRL field may also contain the words +# knownBug or knownW3C which will cause the test generation for that vector to +# be skipped. +# +# utf-16, utf-32 missing because they are automatically generated based on le/be +# versions. +set encInvalidBytes {}; # Reset the table + +# ascii - Any byte above 127 is invalid and is mapped +# to the same numeric code point except for the range +# 80-9F which is treated as cp1252. +# This tests the TableToUtfProc code path. +lappend encInvalidBytes {*}{ + ascii 80 tcl8 \u20AC -1 {knownBug} {map to cp1252} + ascii 80 replace \uFFFD -1 {} {Smallest invalid byte} + ascii 80 strict {} 0 {} {Smallest invalid byte} + ascii 81 tcl8 \u0081 -1 {knownBug} {map to cp1252} + ascii 82 tcl8 \u201A -1 {knownBug} {map to cp1252} + ascii 83 tcl8 \u0192 -1 {knownBug} {map to cp1252} + ascii 84 tcl8 \u201E -1 {knownBug} {map to cp1252} + ascii 85 tcl8 \u2026 -1 {knownBug} {map to cp1252} + ascii 86 tcl8 \u2020 -1 {knownBug} {map to cp1252} + ascii 87 tcl8 \u2021 -1 {knownBug} {map to cp1252} + ascii 88 tcl8 \u0276 -1 {knownBug} {map to cp1252} + ascii 89 tcl8 \u2030 -1 {knownBug} {map to cp1252} + ascii 8A tcl8 \u0160 -1 {knownBug} {map to cp1252} + ascii 8B tcl8 \u2039 -1 {knownBug} {map to cp1252} + ascii 8C tcl8 \u0152 -1 {knownBug} {map to cp1252} + ascii 8D tcl8 \u008D -1 {knownBug} {map to cp1252} + ascii 8E tcl8 \u017D -1 {knownBug} {map to cp1252} + ascii 8F tcl8 \u008F -1 {knownBug} {map to cp1252} + ascii 90 tcl8 \u0090 -1 {knownBug} {map to cp1252} + ascii 91 tcl8 \u2018 -1 {knownBug} {map to cp1252} + ascii 92 tcl8 \u2019 -1 {knownBug} {map to cp1252} + ascii 93 tcl8 \u201C -1 {knownBug} {map to cp1252} + ascii 94 tcl8 \u201D -1 {knownBug} {map to cp1252} + ascii 95 tcl8 \u2022 -1 {knownBug} {map to cp1252} + ascii 96 tcl8 \u2013 -1 {knownBug} {map to cp1252} + ascii 97 tcl8 \u2014 -1 {knownBug} {map to cp1252} + ascii 98 tcl8 \u02DC -1 {knownBug} {map to cp1252} + ascii 99 tcl8 \u2122 -1 {knownBug} {map to cp1252} + ascii 9A tcl8 \u0161
-1 {knownBug} {map to cp1252} + ascii 9B tcl8 \u203A -1 {knownBug} {map to cp1252} + ascii 9C tcl8 \u0153 -1 {knownBug} {map to cp1252} + ascii 9D tcl8 \u009D -1 {knownBug} {map to cp1252} + ascii 9E tcl8 \u017E -1 {knownBug} {map to cp1252} + ascii 9F tcl8 \u0178 -1 {knownBug} {map to cp1252} + + ascii FF tcl8 \u00FF -1 {} {Largest invalid byte} + ascii FF replace \uFFFD -1 {} {Largest invalid byte} + ascii FF strict {} 0 {} {Largest invalid byte} +} + +# utf-8 - valid sequences based on Table 3.7 in the Unicode +# standard. +# +# Code Points First Second Third Fourth Byte +# U+0000..U+007F 00..7F +# U+0080..U+07FF C2..DF 80..BF +# U+0800..U+0FFF E0 A0..BF 80..BF +# U+1000..U+CFFF E1..EC 80..BF 80..BF +# U+D000..U+D7FF ED 80..9F 80..BF +# U+E000..U+FFFF EE..EF 80..BF 80..BF +# U+10000..U+3FFFF F0 90..BF 80..BF 80..BF +# U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF +# U+100000..U+10FFFF F4 80..8F 80..BF 80..BF +# +# Tests below are based on the "gaps" in the above table. Note ascii test +# values are repeated because internally a different code path is used +# (UtfToUtfProc). +# Note C0, C1, F5:FF are invalid bytes ANYWHERE. 
Exception is C080 +lappend encInvalidBytes {*}{ + utf-8 80 tcl8 \u20AC -1 {} {map to cp1252} + utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte} + utf-8 80 strict {} 0 {} {Smallest invalid byte} + utf-8 81 tcl8 \u0081 -1 {} {map to cp1252} + utf-8 82 tcl8 \u201A -1 {} {map to cp1252} + utf-8 83 tcl8 \u0192 -1 {} {map to cp1252} + utf-8 84 tcl8 \u201E -1 {} {map to cp1252} + utf-8 85 tcl8 \u2026 -1 {} {map to cp1252} + utf-8 86 tcl8 \u2020 -1 {} {map to cp1252} + utf-8 87 tcl8 \u2021 -1 {} {map to cp1252} + utf-8 88 tcl8 \u02C6 -1 {} {map to cp1252} + utf-8 89 tcl8 \u2030 -1 {} {map to cp1252} + utf-8 8A tcl8 \u0160 -1 {} {map to cp1252} + utf-8 8B tcl8 \u2039 -1 {} {map to cp1252} + utf-8 8C tcl8 \u0152 -1 {} {map to cp1252} + utf-8 8D tcl8 \u008D -1 {} {map to cp1252} + utf-8 8E tcl8 \u017D -1 {} {map to cp1252} + utf-8 8F tcl8 \u008F -1 {} {map to cp1252} + utf-8 90 tcl8 \u0090 -1 {} {map to cp1252} + utf-8 91 tcl8 \u2018 -1 {} {map to cp1252} + utf-8 92 tcl8 \u2019 -1 {} {map to cp1252} + utf-8 93 tcl8 \u201C -1 {} {map to cp1252} + utf-8 94 tcl8 \u201D -1 {} {map to cp1252} + utf-8 95 tcl8 \u2022 -1 {} {map to cp1252} + utf-8 96 tcl8 \u2013 -1 {} {map to cp1252} + utf-8 97 tcl8 \u2014 -1 {} {map to cp1252} + utf-8 98 tcl8 \u02DC -1 {} {map to cp1252} + utf-8 99 tcl8 \u2122 -1 {} {map to cp1252} + utf-8 9A tcl8 \u0161 -1 {} {map to cp1252} + utf-8 9B tcl8 \u203A -1 {} {map to cp1252} + utf-8 9C tcl8 \u0153 -1 {} {map to cp1252} + utf-8 9D tcl8 \u009D -1 {} {map to cp1252} + utf-8 9E tcl8 \u017E -1 {} {map to cp1252} + utf-8 9F tcl8 \u0178 -1 {} {map to cp1252} + + utf-8 C0 tcl8 \u00C0 -1 {} {C0 is invalid anywhere} + utf-8 C0 strict {} 0 {} {C0 is invalid anywhere} + utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere} + utf-8 C080 tcl8 \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} + utf-8 C080 strict {} 0 {} {C080 -> invalid} + utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char} + utf-8 C0A2 tcl8 \u00C0\u00A2 -1 {} 
{websec.github.io - A} + utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A} + utf-8 C0A2 strict {} 0 {} {websec.github.io - A} + utf-8 C0A7 tcl8 \u00C0\u00A7 -1 {} {websec.github.io - double quote} + utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote} + utf-8 C0A7 strict {} 0 {} {websec.github.io - double quote} + utf-8 C0AE tcl8 \u00C0\u00AE -1 {} {websec.github.io - full stop} + utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop} + utf-8 C0AE strict {} 0 {} {websec.github.io - full stop} + utf-8 C0AF tcl8 \u00C0\u00AF -1 {} {websec.github.io - solidus} + utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus} + utf-8 C0AF strict {} 0 {} {websec.github.io - solidus} + + utf-8 C1 tcl8 \u00C1 -1 {} {C1 is invalid everywhere} + utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere} + utf-8 C1 strict {} 0 {} {C1 is invalid everywhere} + utf-8 C181 tcl8 \u00C1\u0081 -1 {} {websec.github.io - base test (A)} + utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)} + utf-8 C181 strict {} 0 {} {websec.github.io - base test (A)} + utf-8 C19C tcl8 \u00C1\u0153 -1 {} {websec.github.io - reverse solidus} + utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus} + utf-8 C19C strict {} 0 {} {websec.github.io - reverse solidus} + + utf-8 C2 tcl8 \u00C2 -1 {} {Missing trail byte} + utf-8 C2 replace \uFFFD -1 {} {Missing trail byte} + utf-8 C2 strict {} 0 {} {Missing trail byte} + utf-8 C27F tcl8 \u00C2\x7F -1 {} {Trail byte must be 80:BF} + utf-8 C27F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} + utf-8 C27F strict {} 0 {} {Trail byte must be 80:BF} + utf-8 DF tcl8 \u00DF -1 {} {Missing trail byte} + utf-8 DF replace \uFFFD -1 {} {Missing trail byte} + utf-8 DF strict {} 0 {} {Missing trail byte} + utf-8 DF7F tcl8 \u00DF\x7F -1 {} {Trail byte must be 80:BF} + utf-8 DF7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} + utf-8 DF7F strict {} 0 {} {Trail byte must be 80:BF} + 
utf-8 DFE0A080 tcl8 \u00DF\u0800 -1 {} {Invalid trail byte is start of valid sequence} + utf-8 DFE0A080 replace \uFFFD\u0800 -1 {} {Invalid trail byte is start of valid sequence} + utf-8 DFE0A080 strict {} 0 {} {Invalid trail byte is start of valid sequence} + + utf-8 E0 tcl8 \u00E0 -1 {} {Missing trail byte} + utf-8 E0 replace \uFFFD -1 {} {Missing trail byte} + utf-8 E0 strict {} 0 {} {Missing trail byte} + utf-8 E080 tcl8 \u00E0\u20AC -1 {} {First trail byte must be A0:BF} + utf-8 E080 replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} + utf-8 E080 strict {} 0 {} {First trail byte must be A0:BF} + utf-8 E0819C tcl8 \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus} + utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus} + utf-8 E0819C strict {} 0 {} {websec.github.io - reverse solidus} + utf-8 E09F tcl8 \u00E0\u0178 -1 {} {First trail byte must be A0:BF} + utf-8 E09F replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} + utf-8 E09F strict {} 0 {} {First trail byte must be A0:BF} + utf-8 E0A0 tcl8 \u00E0\u00A0 -1 {} {Missing second trail byte} + utf-8 E0A0 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 E0A0 strict {} 0 {} {Missing second trail byte} + utf-8 E0BF tcl8 \u00E0\u00BF -1 {} {Missing second trail byte} + utf-8 E0BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 E0BF strict {} 0 {} {Missing second trail byte} + utf-8 E0A07F tcl8 \u00E0\u00A0\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 E0A07F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 E0A07F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 E0BF7F tcl8 \u00E0\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 E0BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 E0BF7F strict {} 0 {} {Second trail byte must be 80:BF} + + utf-8 E1 tcl8 \u00E1 -1 {} {Missing trail byte} + utf-8 E1 replace \uFFFD -1 {} {Missing trail byte} + utf-8 E1 
strict {} 0 {} {Missing trail byte} + utf-8 E17F tcl8 \u00E1\x7F -1 {} {Trail byte must be 80:BF} + utf-8 E17F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} + utf-8 E17F strict {} 0 {} {Trail byte must be 80:BF} + utf-8 E181 tcl8 \u00E1\u0081 -1 {} {Missing second trail byte} + utf-8 E181 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 E181 strict {} 0 {} {Missing second trail byte} + utf-8 E1BF tcl8 \u00E1\u00BF -1 {} {Missing second trail byte} + utf-8 E1BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 E1BF strict {} 0 {} {Missing second trail byte} + utf-8 E1807F tcl8 \u00E1\u20AC\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 E1807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 E1807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 E1BF7F tcl8 \u00E1\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 E1BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 E1BF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EC tcl8 \u00EC -1 {} {Missing trail byte} + utf-8 EC replace \uFFFD -1 {} {Missing trail byte} + utf-8 EC strict {} 0 {} {Missing trail byte} + utf-8 EC7F tcl8 \u00EC\x7F -1 {} {Trail byte must be 80:BF} + utf-8 EC7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF} + utf-8 EC7F strict {} 0 {} {Trail byte must be 80:BF} + utf-8 EC81 tcl8 \u00EC\u0081 -1 {} {Missing second trail byte} + utf-8 EC81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EC81 strict {} 0 {} {Missing second trail byte} + utf-8 ECBF tcl8 \u00EC\u00BF -1 {} {Missing second trail byte} + utf-8 ECBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 ECBF strict {} 0 {} {Missing second trail byte} + utf-8 EC807F tcl8 \u00EC\u20AC\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 EC807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EC807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 ECBF7F 
tcl8 \u00EC\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 ECBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 ECBF7F strict {} 0 {} {Second trail byte must be 80:BF} + + utf-8 ED tcl8 \u00ED -1 {} {Missing trail byte} + utf-8 ED replace \uFFFD -1 {} {Missing trail byte} + utf-8 ED strict {} 0 {} {Missing trail byte} + utf-8 ED7F tcl8 \u00ED\u7F -1 {} {First trail byte must be 80:9F} + utf-8 ED7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:9F} + utf-8 ED7F strict {} 0 {} {First trail byte must be 80:9F} + utf-8 EDA0 tcl8 \u00ED\u00A0 -1 {} {First trail byte must be 80:9F} + utf-8 EDA0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:9F} + utf-8 EDA0 strict {} 0 {} {First trail byte must be 80:9F} + utf-8 ED81 tcl8 \u00ED\u0081 -1 {} {Missing second trail byte} + utf-8 ED81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 ED81 strict {} 0 {} {Missing second trail byte} + utf-8 EDBF tcl8 \u00ED\u00BF -1 {} {Missing second trail byte} + utf-8 EDBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EDBF strict {} 0 {} {Missing second trail byte} + utf-8 ED807F tcl8 \u00ED\u20AC\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 ED807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 ED807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 ED9F7F tcl8 \u00ED\u0178\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 ED9F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 ED9F7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EDA080 tcl8 \uD800 -1 {} {High surrogate} + utf-8 EDA080 replace \uFFFD -1 {} {High surrogate} + utf-8 EDA080 strict {} 0 {} {High surrogate} + utf-8 EDAFBF tcl8 \uDBFF -1 {} {High surrogate} + utf-8 EDAFBF replace \uFFFD -1 {} {High surrogate} + utf-8 EDAFBF strict {} 0 {} {High surrogate} + utf-8 EDB080 tcl8 \uDC00 -1 {} {Low surrogate} + utf-8 EDB080 replace \uFFFD -1 {} {Low surrogate} + 
utf-8 EDB080 strict {} 0 {} {Low surrogate} + utf-8 EDBFBF tcl8 \uDFFF -1 {} {Low surrogate} + utf-8 EDBFBF replace \uFFFD -1 {} {Low surrogate} + utf-8 EDBFBF strict {} 0 {} {Low surrogate} + utf-8 EDA080EDB080 tcl8 \U00010000 -1 {} {High low surrogate pair} + utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {} {High low surrogate pair} + utf-8 EDA080EDB080 strict {} 0 {} {High low surrogate pair} + utf-8 EDAFBFEDBFBF tcl8 \U0010FFFF -1 {} {High low surrogate pair} + utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {} {High low surrogate pair} + utf-8 EDAFBFEDBFBF strict {} 0 {} {High low surrogate pair} + + utf-8 EE tcl8 \u00EE -1 {} {Missing trail byte} + utf-8 EE replace \uFFFD -1 {} {Missing trail byte} + utf-8 EE strict {} 0 {} {Missing trail byte} + utf-8 EE7F tcl8 \u00EE\u7F -1 {} {First trail byte must be 80:BF} + utf-8 EE7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:BF} + utf-8 EE7F strict {} 0 {} {First trail byte must be 80:BF} + utf-8 EED0 tcl8 \u00EE\u00D0 -1 {} {First trail byte must be 80:BF} + utf-8 EED0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} + utf-8 EED0 strict {} 0 {} {First trail byte must be 80:BF} + utf-8 EE81 tcl8 \u00EE\u0081 -1 {} {Missing second trail byte} + utf-8 EE81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EE81 strict {} 0 {} {Missing second trail byte} + utf-8 EEBF tcl8 \u00EE\u00BF -1 {} {Missing second trail byte} + utf-8 EEBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EEBF strict {} 0 {} {Missing second trail byte} + utf-8 EE807F tcl8 \u00EE\u20AC\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 EE807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EE807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EEBF7F tcl8 \u00EE\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 EEBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EEBF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EF 
tcl8 \u00EF -1 {} {Missing trail byte} + utf-8 EF replace \uFFFD -1 {} {Missing trail byte} + utf-8 EF strict {} 0 {} {Missing trail byte} + utf-8 EF7F tcl8 \u00EF\u7F -1 {} {First trail byte must be 80:BF} + utf-8 EF7F replace \uFFFD\u7F -1 {} {First trail byte must be 80:BF} + utf-8 EF7F strict {} 0 {} {First trail byte must be 80:BF} + utf-8 EFD0 tcl8 \u00EF\u00D0 -1 {} {First trail byte must be 80:BF} + utf-8 EFD0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} + utf-8 EFD0 strict {} 0 {} {First trail byte must be 80:BF} + utf-8 EF81 tcl8 \u00EF\u0081 -1 {} {Missing second trail byte} + utf-8 EF81 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EF81 strict {} 0 {} {Missing second trail byte} + utf-8 EFBF tcl8 \u00EF\u00BF -1 {} {Missing second trail byte} + utf-8 EFBF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 EFBF strict {} 0 {} {Missing second trail byte} + utf-8 EF807F tcl8 \u00EF\u20AC\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 EF807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EF807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 EFBF7F tcl8 \u00EF\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 EFBF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 EFBF7F strict {} 0 {} {Second trail byte must be 80:BF} + + utf-8 F0 tcl8 \u00F0 -1 {} {Missing trail byte} + utf-8 F0 replace \uFFFD -1 {} {Missing trail byte} + utf-8 F0 strict {} 0 {} {Missing trail byte} + utf-8 F080 tcl8 \u00F0\u20AC -1 {} {First trail byte must be 90:BF} + utf-8 F080 replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} + utf-8 F080 strict {} 0 {} {First trail byte must be 90:BF} + utf-8 F08F tcl8 \u00F0\u8F -1 {} {First trail byte must be 90:BF} + utf-8 F08F replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} + utf-8 F08F strict {} 0 {} {First trail byte must be 90:BF} + utf-8 F0D0 tcl8 \u00F0\u00D0 -1 {} {First trail byte must be 
90:BF} + utf-8 F0D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 90:BF} + utf-8 F0D0 strict {} 0 {} {First trail byte must be 90:BF} + utf-8 F090 tcl8 \u00F0\u0090 -1 {} {Missing second trail byte} + utf-8 F090 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F090 strict {} 0 {} {Missing second trail byte} + utf-8 F0BF tcl8 \u00F0\u00BF -1 {} {Missing second trail byte} + utf-8 F0BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F0BF strict {} 0 {} {Missing second trail byte} + utf-8 F0907F tcl8 \u00F0\u0090\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F0907F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F0907F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F0BF7F tcl8 \u00F0\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F0BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F0BF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F090BF tcl8 \u00F0\u0090\u00BF -1 {} {Missing third trail byte} + utf-8 F090BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F090BF strict {} 0 {} {Missing third trail byte} + utf-8 F0BF81 tcl8 \u00F0\u00BF\u0081 -1 {} {Missing third trail byte} + utf-8 F0BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F0BF81 strict {} 0 {} {Missing third trail byte} + utf-8 F0BF807F tcl8 \u00F0\u00BF\u20AC\x7F -1 {} {Third trail byte must be 80:BF} + utf-8 F0BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F0BF817F strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F090BFD0 tcl8 \u00F0\u0090\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} + utf-8 F090BFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F090BFD0 strict {} 0 {} {Third trail byte must be 80:BF} + + utf-8 F1 tcl8 \u00F1 -1 {} {Missing trail byte} + utf-8 F1 replace \uFFFD -1 {} {Missing trail byte} + utf-8 F1 strict {} 0 {} {Missing trail byte} + utf-8 F17F 
tcl8 \u00F1\u7F -1 {} {First trail byte must be 80:BF} + utf-8 F17F replace \uFFFD -1 {knownW3C} {First trail byte must be 80:BF} + utf-8 F17F strict {} 0 {} {First trail byte must be 80:BF} + utf-8 F1D0 tcl8 \u00F1\u00D0 -1 {} {First trail byte must be 80:BF} + utf-8 F1D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} + utf-8 F1D0 strict {} 0 {} {First trail byte must be 80:BF} + utf-8 F180 tcl8 \u00F1\u20AC -1 {} {Missing second trail byte} + utf-8 F180 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F180 strict {} 0 {} {Missing second trail byte} + utf-8 F1BF tcl8 \u00F1\u00BF -1 {} {Missing second trail byte} + utf-8 F1BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F1BF strict {} 0 {} {Missing second trail byte} + utf-8 F1807F tcl8 \u00F1\u20AC\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F1807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F1807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F1BF7F tcl8 \u00F1\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F1BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F1BF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F180BF tcl8 \u00F1\u20AC\u00BF -1 {} {Missing third trail byte} + utf-8 F180BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F180BF strict {} 0 {} {Missing third trail byte} + utf-8 F1BF81 tcl8 \u00F1\u00BF\u0081 -1 {} {Missing third trail byte} + utf-8 F1BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F1BF81 strict {} 0 {} {Missing third trail byte} + utf-8 F1BF807F tcl8 \u00F1\u00BF\u20AC\x7F -1 {} {Third trail byte must be 80:BF} + utf-8 F1BF817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F1BF817F strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F180BFD0 tcl8 \u00F1\u20AC\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} + utf-8 F180BFD0 replace \uFFFD -1 {knownW3C} {Third trail 
byte must be 80:BF} + utf-8 F180BFD0 strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F3 tcl8 \u00F3 -1 {} {Missing trail byte} + utf-8 F3 replace \uFFFD -1 {} {Missing trail byte} + utf-8 F3 strict {} 0 {} {Missing trail byte} + utf-8 F37F tcl8 \u00F3\x7F -1 {} {First trail byte must be 80:BF} + utf-8 F37F replace \uFFFD -1 {knownW3C} {First trail byte must be 80:BF} + utf-8 F37F strict {} 0 {} {First trail byte must be 80:BF} + utf-8 F3D0 tcl8 \u00F3\u00D0 -1 {} {First trail byte must be 80:BF} + utf-8 F3D0 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:BF} + utf-8 F3D0 strict {} 0 {} {First trail byte must be 80:BF} + utf-8 F380 tcl8 \u00F3\u20AC -1 {} {Missing second trail byte} + utf-8 F380 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F380 strict {} 0 {} {Missing second trail byte} + utf-8 F3BF tcl8 \u00F3\u00BF -1 {} {Missing second trail byte} + utf-8 F3BF replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F3BF strict {} 0 {} {Missing second trail byte} + utf-8 F3807F tcl8 \u00F3\u20AC\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F3807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F3807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F3BF7F tcl8 \u00F3\u00BF\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F3BF7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F3BF7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F380BF tcl8 \u00F3\u20AC\u00BF -1 {} {Missing third trail byte} + utf-8 F380BF replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F380BF strict {} 0 {} {Missing third trail byte} + utf-8 F3BF81 tcl8 \u00F3\u00BF\u0081 -1 {} {Missing third trail byte} + utf-8 F3BF81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F3BF81 strict {} 0 {} {Missing third trail byte} + utf-8 F3BF807F tcl8 \u00F3\u00BF\u20AC\x7F -1 {} {Third trail byte must be 80:BF} + utf-8 F3BF817F replace \uFFFD\x7F -1 
{knownW3C} {Third trail byte must be 80:BF} + utf-8 F3BF817F strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F380BFD0 tcl8 \u00F3\u20AC\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} + utf-8 F380BFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F380BFD0 strict {} 0 {} {Third trail byte must be 80:BF} + + utf-8 F4 tcl8 \u00F4 -1 {} {Missing trail byte} + utf-8 F4 replace \uFFFD -1 {} {Missing trail byte} + utf-8 F4 strict {} 0 {} {Missing trail byte} + utf-8 F47F tcl8 \u00F4\u7F -1 {} {First trail byte must be 80:8F} + utf-8 F47F replace \uFFFD\u7F -1 {knownW3C} {First trail byte must be 80:8F} + utf-8 F47F strict {} 0 {} {First trail byte must be 80:8F} + utf-8 F490 tcl8 \u00F4\u0090 -1 {} {First trail byte must be 80:8F} + utf-8 F490 replace \uFFFD\uFFFD -1 {} {First trail byte must be 80:8F} + utf-8 F490 strict {} 0 {} {First trail byte must be 80:8F} + utf-8 F480 tcl8 \u00F4\u20AC -1 {} {Missing second trail byte} + utf-8 F480 replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F480 strict {} 0 {} {Missing second trail byte} + utf-8 F48F tcl8 \u00F4\u008F -1 {} {Missing second trail byte} + utf-8 F48F replace \uFFFD -1 {knownW3C} {Missing second trail byte} + utf-8 F48F strict {} 0 {} {Missing second trail byte} + utf-8 F4807F tcl8 \u00F4\u20AC\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F4807F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F4807F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F48F7F tcl8 \u00F4\u008F\x7F -1 {} {Second trail byte must be 80:BF} + utf-8 F48F7F replace \uFFFD\u7F -1 {knownW3C} {Second trail byte must be 80:BF} + utf-8 F48F7F strict {} 0 {} {Second trail byte must be 80:BF} + utf-8 F48081 tcl8 \u00F4\u20AC\u0081 -1 {} {Missing third trail byte} + utf-8 F48081 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F48081 strict {} 0 {} {Missing third trail byte} + utf-8 F48F81 tcl8 \u00F4\u008F\u0081 -1 {} {Missing third trail 
byte} + utf-8 F48F81 replace \uFFFD -1 {knownW3C} {Missing third trail byte} + utf-8 F48F81 strict {} 0 {} {Missing third trail byte} + utf-8 F481817F tcl8 \u00F4\u0081\u0081\x7F -1 {} {Third trail byte must be 80:BF} + utf-8 F480817F replace \uFFFD\x7F -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F480817F strict {} 0 {} {Third trail byte must be 80:BF} + utf-8 F48FBFD0 tcl8 \u00F4\u008F\u00BF\u00D0 -1 {} {Third trail byte must be 80:BF} + utf-8 F48FBFD0 replace \uFFFD -1 {knownW3C} {Third trail byte must be 80:BF} + utf-8 F48FBFD0 strict {} 0 {} {Third trail byte must be 80:BF} + + utf-8 F5 tcl8 \u00F5 -1 {} {F5:FF are invalid everywhere} + utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere} + utf-8 F5 strict {} 0 {} {F5:FF are invalid everywhere} + utf-8 FF tcl8 \u00FF -1 {} {F5:FF are invalid everywhere} + utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere} + utf-8 FF strict {} 0 {} {F5:FF are invalid everywhere} + + utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8} + utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3-9} + utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10} + utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3.11} +} + +# utf16-le and utf16-be test cases. Note utf16 cases are automatically generated +# based on these depending on platform endianness. Note truncated tests can only +# happen when the sequence is at the end (including by itself) Thus {solo tail} +# in some cases. 
+lappend encInvalidBytes {*}{ + utf-16le 41 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-16le 41 replace \uFFFD -1 {solo tail} {Truncated} + utf-16le 41 strict {} 0 {solo tail} {Truncated} + utf-16le 00D8 tcl8 \uD800 -1 {} {Missing low surrogate} + utf-16le 00D8 replace \uFFFD -1 {knownBug} {Missing low surrogate} + utf-16le 00D8 strict {} 0 {knownBug} {Missing low surrogate} + utf-16le 00DC tcl8 \uDC00 -1 {} {Missing high surrogate} + utf-16le 00DC replace \uFFFD -1 {knownBug} {Missing high surrogate} + utf-16le 00DC strict {} 0 {knownBug} {Missing high surrogate} + + utf-16be 41 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-16be 41 replace \uFFFD -1 {solo tail} {Truncated} + utf-16be 41 strict {} 0 {solo tail} {Truncated} + utf-16be D800 tcl8 \uD800 -1 {} {Missing low surrogate} + utf-16be D800 replace \uFFFD -1 {knownBug} {Missing low surrogate} + utf-16be D800 strict {} 0 {knownBug} {Missing low surrogate} + utf-16be DC00 tcl8 \uDC00 -1 {} {Missing high surrogate} + utf-16be DC00 replace \uFFFD -1 {knownBug} {Missing high surrogate} + utf-16be DC00 strict {} 0 {knownBug} {Missing high surrogate} +} + +# utf32-le and utf32-be test cases. Note utf32 cases are automatically generated +# based on these depending on platform endianness. Note truncated tests can only +# happen when the sequence is at the end (including by itself) Thus {solo tail} +# in some cases. 
+lappend encInvalidBytes {*}{ + utf-32le 41 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32le 41 replace \uFFFD -1 {solo} {Truncated} + utf-32le 41 strict {} 0 {solo tail} {Truncated} + utf-32le 4100 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32le 4100 replace \uFFFD -1 {solo} {Truncated} + utf-32le 4100 strict {} 0 {solo tail} {Truncated} + utf-32le 410000 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32le 410000 replace \uFFFD -1 {solo} {Truncated} + utf-32le 410000 strict {} 0 {solo tail} {Truncated} + utf-32le 00D80000 tcl8 \uD800 -1 {} {High-surrogate} + utf-32le 00D80000 replace \uFFFD -1 {} {High-surrogate} + utf-32le 00D80000 strict {} 0 {} {High-surrogate} + utf-32le 00DC0000 tcl8 \uDC00 -1 {} {Low-surrogate} + utf-32le 00DC0000 replace \uFFFD -1 {} {Low-surrogate} + utf-32le 00DC0000 strict {} 0 {} {Low-surrogate} + utf-32le 00D8000000DC0000 tcl8 \uD800\uDC00 -1 {} {High-low-surrogate-pair} + utf-32le 00D8000000DC0000 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair} + utf-32le 00D8000000DC0000 strict {} 0 {} {High-low-surrogate-pair} + utf-32le 00001100 tcl8 \UFFFD -1 {} {Out of range} + utf-32le 00001100 replace \UFFFD -1 {} {Out of range} + utf-32le 00001100 strict {} 0 {} {Out of range} + utf-32le FFFFFFFF tcl8 \UFFFD -1 {} {Out of range} + utf-32le FFFFFFFF replace \UFFFD -1 {} {Out of range} + utf-32le FFFFFFFF strict {} 0 {} {Out of range} + + utf-32be 41 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32be 41 replace \uFFFD -1 {solo tail} {Truncated} + utf-32be 41 strict {} 0 {solo tail} {Truncated} + utf-32be 0041 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32be 0041 replace \uFFFD -1 {solo} {Truncated} + utf-32be 0041 strict {} 0 {solo tail} {Truncated} + utf-32be 000041 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32be 000041 replace \uFFFD -1 {solo} {Truncated} + utf-32be 000041 strict {} 0 {solo tail} {Truncated} + utf-32be 0000D800 tcl8 \uD800 -1 {} {High-surrogate} + utf-32be 0000D800 replace \uFFFD -1 {} {High-surrogate} + utf-32be 
0000D800 strict {} 0 {} {High-surrogate} + utf-32be 0000DC00 tcl8 \uDC00 -1 {} {Low-surrogate} + utf-32be 0000DC00 replace \uFFFD -1 {} {Low-surrogate} + utf-32be 0000DC00 strict {} 0 {} {Low-surrogate} + utf-32be 0000D8000000DC00 tcl8 \uD800\uDC00 -1 {} {High-low-surrogate-pair} + utf-32be 0000D8000000DC00 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair} + utf-32be 0000D8000000DC00 strict {} 0 {} {High-low-surrogate-pair} + utf-32be 00110000 tcl8 \UFFFD -1 {} {Out of range} + utf-32be 00110000 replace \UFFFD -1 {} {Out of range} + utf-32be 00110000 strict {} 0 {} {Out of range} + utf-32be FFFFFFFF tcl8 \UFFFD -1 {} {Out of range} + utf-32be FFFFFFFF replace \UFFFD -1 {} {Out of range} + utf-32be FFFFFFFF strict {} 0 {} {Out of range} +} + +# Strings that cannot be encoded for specific encoding / profiles +# +# should be unique for test ids to be unique. +# See earlier comments about CTRL field. +# +# Note utf-16, utf-32 missing because they are automatically +# generated based on le/be versions. +# TODO - out of range code point (note cannot be generated by \U notation) +lappend encUnencodableStrings {*}{ + ascii \u00e0 tcl8 3f -1 {} {unencodable} + ascii \u00e0 strict {} 0 {} {unencodable} + + iso8859-1 \u0141 tcl8 3f -1 {} unencodable + iso8859-1 \u0141 strict {} 0 {} unencodable + + utf-8 \uD800 tcl8 eda080 -1 {} High-surrogate + utf-8 \uD800 strict {} 0 {} High-surrogate + utf-8 \uDC00 tcl8 edb080 -1 {} High-surrogate + utf-8 \uDC00 strict {} 0 {} High-surrogate +} + + +# The icuUcmTests.tcl is generated by the tools/ucm2tests.tcl script +# and generates test vectors for the above tables for various encodings +# based on ICU UCM files. +# TODO - commented out for now as generating a lot of mismatches. 
+# source [file join [file dirname [info script]] icuUcmTests.tcl] -- cgit v0.12 From 99a24e7883c680bb555d044a04e458a57be677a1 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Fri, 24 Feb 2023 10:32:37 +0000 Subject: Raise error on invalid flags --- generic/tclEncoding.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index d969779..00ca5e8 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -1275,7 +1275,18 @@ Tcl_ExternalToUtfDStringEx( Tcl_Size dstLen; const char *srcStart = src; - Tcl_DStringInit(dstPtr); /* Must always be initialized before returning */ + /* DO FIRST - Must always be initialized before returning */ + Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? - See TIP 656 */ + Tcl_SetResult(interp, + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_STATIC); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; @@ -1559,7 +1570,18 @@ Tcl_UtfToExternalDStringEx( const char *srcStart = src; Tcl_Size dstLen; + /* DO FIRST - must always be initialized on return */ Tcl_DStringInit(dstPtr); + + if (flags & (TCL_ENCODING_START|TCL_ENCODING_END)) { + /* TODO - what other flags are illegal? 
- See TIP 656 */ + Tcl_SetResult(interp, + "Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.", + TCL_STATIC); + Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL); + return TCL_ERROR; + } + dst = Tcl_DStringValue(dstPtr); dstLen = dstPtr->spaceAvl - 1; -- cgit v0.12 From 58db3d68eb1d0fba5c0e0b3ffff602acbfb2a12a Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Fri, 24 Feb 2023 13:34:15 +0000 Subject: Add teststringobj newunicode command to test invalid input to Tcl_NewUnicodeObj --- generic/tclTestObj.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/generic/tclTestObj.c b/generic/tclTestObj.c index c9a910a..fa91d67 100644 --- a/generic/tclTestObj.c +++ b/generic/tclTestObj.c @@ -1269,7 +1269,7 @@ TeststringobjCmd( static const char *const options[] = { "append", "appendstrings", "get", "get2", "length", "length2", "set", "set2", "setlength", "maxchars", "range", "appendself", - "appendself2", NULL + "appendself2", "newunicode", NULL }; if (objc < 3) { @@ -1513,7 +1513,24 @@ TeststringobjCmd( Tcl_AppendUnicodeToObj(varPtr[varIndex], unicode + length, size - length); Tcl_SetObjResult(interp, varPtr[varIndex]); break; - } + case 13: /* newunicode*/ + unicode = ckalloc((objc - 3) * sizeof(Tcl_UniChar)); + for (i = 0; i < (objc - 3); ++i) { + int val; + if (Tcl_GetIntFromObj(interp, objv[i + 3], &val) != TCL_OK) { + break; + } + unicode[i] = (Tcl_UniChar)val; + } + if (i < (objc-3)) { + ckfree(unicode); + return TCL_ERROR; + } + SetVarToObj(varPtr, varIndex, Tcl_NewUnicodeObj(unicode, objc - 3)); + Tcl_SetObjResult(interp, varPtr[varIndex]); + ckfree(unicode); + break; + } return TCL_OK; } -- cgit v0.12 From 1eac8ab060855f0454c234be78839a46d8a9241e Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 28 Feb 2023 12:25:34 +0000 Subject: Move setting of profile in flags parameter to lower level functions in case they are called directly --- generic/tclCmdAH.c | 11 +++-------- generic/tclEncoding.c | 19 
+++++++++++++++---- generic/tclInt.h | 2 +- generic/tclTestObj.c | 2 +- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 19a5bc3..ff0d00f 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -611,16 +611,11 @@ numArgsError: /* ONLY jump here if nothing needs to be freed!!! */ } switch (optIndex) { case PROFILE: - if (TclEncodingProfileNameToId( - interp, Tcl_GetString(objv[argIndex]), &profile) - != TCL_OK) { + if (TclEncodingProfileNameToId(interp, + Tcl_GetString(objv[argIndex]), + &profile) != TCL_OK) { return TCL_ERROR; } -#ifdef NOTNEEDED - /* TODO - next line probably not needed as the conversion - functions already take care of mapping profile to flags */ - profile = TclEncodingExternalFlagsToInternal(profile); -#endif break; case FAILINDEX: failVarObj = objv[argIndex]; diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 00ca5e8..05d231f 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -1301,7 +1301,6 @@ Tcl_ExternalToUtfDStringEx( srcLen = encodingPtr->lengthProc(src); } - flags = TclEncodingExternalFlagsToInternal(flags); flags |= TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { flags |= ENCODING_INPUT; @@ -1596,7 +1595,6 @@ Tcl_UtfToExternalDStringEx( srcLen = strlen(src); } - flags = TclEncodingExternalFlagsToInternal(flags); flags |= TCL_ENCODING_START | TCL_ENCODING_END; while (1) { result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, @@ -2432,6 +2430,7 @@ BinaryProc( if (dstLen < 0) { dstLen = 0; } + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) { srcLen = *dstCharsPtr; } @@ -2499,6 +2498,7 @@ UtfToUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= 6; } @@ -2721,6 +2721,7 @@ Utf32ToUtfProc( int result, numChars, charLimit = INT_MAX; int ch 
= 0, bytesLeft = srcLen % 4; + flags = TclEncodingSetProfileFlags(flags); flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; @@ -2874,6 +2875,7 @@ UtfToUtf32Proc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -2971,6 +2973,7 @@ Utf16ToUtfProc( int result, numChars, charLimit = INT_MAX; unsigned short ch = 0; + flags = TclEncodingSetProfileFlags(flags); flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; @@ -3110,6 +3113,7 @@ UtfToUtf16Proc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3215,6 +3219,7 @@ UtfToUcs2Proc( int result, numChars, len; Tcl_UniChar ch = 0; + flags = TclEncodingSetProfileFlags(flags); flags |= PTR2INT(clientData); srcStart = src; srcEnd = src + srcLen; @@ -3337,6 +3342,7 @@ TableToUtfProc( const unsigned short *pageZero; TableEncodingData *dataPtr = (TableEncodingData *)clientData; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -3464,6 +3470,7 @@ TableFromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3570,6 +3577,7 @@ Iso88591ToUtfProc( const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -3654,6 +3662,7 @@ Iso88591FromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -3801,6 +3810,7 @@ EscapeToUtfProc( int state, result, numChars, charLimit = 
INT_MAX; const char *dstStart, *dstEnd; + flags = TclEncodingSetProfileFlags(flags); if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -4024,6 +4034,7 @@ EscapeFromUtfProc( srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; + flags = TclEncodingSetProfileFlags(flags); if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } @@ -4463,7 +4474,7 @@ TclEncodingProfileIdToName( /* *------------------------------------------------------------------------ * - * TclEncodingExternalFlagsToInternal -- + * TclEncodingSetProfileFlags -- * * Maps the flags supported in the encoding C API's to internal flags. * @@ -4482,7 +4493,7 @@ TclEncodingProfileIdToName( * *------------------------------------------------------------------------ */ -int TclEncodingExternalFlagsToInternal(int flags) +int TclEncodingSetProfileFlags(int flags) { if (flags & TCL_ENCODING_STOPONERROR) { TCL_ENCODING_PROFILE_SET(flags, TCL_ENCODING_PROFILE_STRICT); diff --git a/generic/tclInt.h b/generic/tclInt.h index 538b177..bf5310b 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2890,7 +2890,7 @@ TclEncodingProfileNameToId(Tcl_Interp *interp, int *profilePtr); MODULE_SCOPE const char *TclEncodingProfileIdToName(Tcl_Interp *interp, int profileId); -MODULE_SCOPE int TclEncodingExternalFlagsToInternal(int flags); +MODULE_SCOPE int TclEncodingSetProfileFlags(int flags); MODULE_SCOPE void TclGetEncodingProfiles(Tcl_Interp *interp); /* diff --git a/generic/tclTestObj.c b/generic/tclTestObj.c index fa91d67..4a2032c 100644 --- a/generic/tclTestObj.c +++ b/generic/tclTestObj.c @@ -1514,7 +1514,7 @@ TeststringobjCmd( Tcl_SetObjResult(interp, varPtr[varIndex]); break; case 13: /* newunicode*/ - unicode = ckalloc((objc - 3) * sizeof(Tcl_UniChar)); + unicode = (unsigned short *) ckalloc((objc - 3) * sizeof(Tcl_UniChar)); for (i = 0; i < (objc - 3); ++i) { int val; if (Tcl_GetIntFromObj(interp, objv[i + 3], &val) != TCL_OK) { -- cgit v0.12 From 
3d177bd8b588eb3f64773a86cabc290208e031a5 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 28 Feb 2023 14:08:19 +0000 Subject: int -> Tcl_Size to match TIP --- generic/tcl.decls | 4 ++-- generic/tclDecls.h | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index a789ef6..f2ba187 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2445,12 +2445,12 @@ declare 657 { # TIP 656 declare 658 { int Tcl_ExternalToUtfDStringEx(Tcl_Interp *interp, Tcl_Encoding encoding, - const char *src, int srcLen, int flags, Tcl_DString *dsPtr, + const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr) } declare 659 { int Tcl_UtfToExternalDStringEx(Tcl_Interp *interp, Tcl_Encoding encoding, - const char *src, int srcLen, int flags, Tcl_DString *dsPtr, + const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr) } diff --git a/generic/tclDecls.h b/generic/tclDecls.h index fbfa8a1..adad630 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1957,12 +1957,14 @@ EXTERN int Tcl_UniCharIsUnicode(int ch); /* 658 */ EXTERN int Tcl_ExternalToUtfDStringEx(Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, - int srcLen, int flags, Tcl_DString *dsPtr, + Tcl_Size srcLen, int flags, + Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 659 */ EXTERN int Tcl_UtfToExternalDStringEx(Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, - int srcLen, int flags, Tcl_DString *dsPtr, + Tcl_Size srcLen, int flags, + Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 660 */ EXTERN int Tcl_AsyncMarkFromSignal(Tcl_AsyncHandler async, @@ -2743,8 +2745,8 @@ typedef struct TclStubs { const char * (*tcl_UtfNext) (const char *src); /* 655 */ const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 656 */ int (*tcl_UniCharIsUnicode) (int ch); /* 657 */ - int (*tcl_ExternalToUtfDStringEx) (Tcl_Interp *interp, Tcl_Encoding encoding, const char 
*src, int srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 658 */ - int (*tcl_UtfToExternalDStringEx) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 659 */ + int (*tcl_ExternalToUtfDStringEx) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 658 */ + int (*tcl_UtfToExternalDStringEx) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, Tcl_Size srcLen, int flags, Tcl_DString *dsPtr, Tcl_Size *errorLocationPtr); /* 659 */ int (*tcl_AsyncMarkFromSignal) (Tcl_AsyncHandler async, int sigNumber); /* 660 */ int (*tclListObjGetElements) (Tcl_Interp *interp, Tcl_Obj *listPtr, size_t *objcPtr, Tcl_Obj ***objvPtr); /* 661 */ int (*tclListObjLength) (Tcl_Interp *interp, Tcl_Obj *listPtr, size_t *lengthPtr); /* 662 */ -- cgit v0.12 From b2cdedcec2bbb94929cef675635c5864db8db8de Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 2 Mar 2023 04:16:44 +0000 Subject: Eliminate TCL_ENCODING_MODIFIED flag --- generic/tcl.h | 13 +++++++------ generic/tclEncoding.c | 33 +++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 3fc53db..a92680d 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2123,12 +2123,12 @@ typedef struct Tcl_EncodingType { * content. Otherwise, the number of chars * produced is controlled only by other limiting * factors. - * TCL_ENCODING_MODIFIED - Convert NULL bytes to \xC0\x80 in stead of - * 0x00. Only valid for "utf-8" and "cesu-8". - * This flag is implicit for external -> internal conversions, - * optional for internal -> external conversions. * TCL_ENCODING_PROFILE_* - Mutually exclusive encoding profile ids. Note * these are bit masks. + * + * NOTE: THESE BIT DEFINITIONS SHOULD NOT OVERLAP WITH INTERNAL USE BITS + * DEFINED IN tclEncoding.c (ENCODING_INPUT et al). 
Be cognizant of this + * when adding bits. */ #define TCL_ENCODING_START 0x01 @@ -2136,8 +2136,9 @@ typedef struct Tcl_EncodingType { #define TCL_ENCODING_STOPONERROR 0x04 #define TCL_ENCODING_NO_TERMINATE 0x08 #define TCL_ENCODING_CHAR_LIMIT 0x10 -#define TCL_ENCODING_MODIFIED 0x20 -/* Reserve top byte for profile values (disjoint) */ +/* Internal use bits, do not define bits in this space. See above comment */ +#define TCL_ENCODING_INTERNAL_USE_MASK 0xFF00 +/* Reserve top byte for profile values (disjoint, not a mask) */ #define TCL_ENCODING_PROFILE_TCL8 0x01000000 #define TCL_ENCODING_PROFILE_STRICT 0x02000000 #define TCL_ENCODING_PROFILE_REPLACE 0x03000000 diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 05d231f..1d336f5 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -548,11 +548,16 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ -/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and - * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */ -#define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ +/* + * NOTE: THESE BIT DEFINITIONS SHOULD NOT OVERLAP WITH INTERNAL USE BITS + * DEFINED IN tcl.h (TCL_ENCODING_* et al). Be cognizant of this + * when adding bits. TODO - should really be defined in a single file. + * + * To prevent conflicting bits, only define bits within 0xff00 mask here. 
+ */ +#define TCL_ENCODING_LE 0x100 /* Used to distinguish LE/BE variants */ #define ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ -#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */ +#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */ void TclInitEncodingSubsystem(void) @@ -565,12 +570,16 @@ TclInitEncodingSubsystem(void) char c; short s; } isLe; + int leFlags; if (encodingsInitialized) { return; } - isLe.s = TCL_ENCODING_LE; + /* Note: This DEPENDS on TCL_ENCODING_LE being defined in least sig byte */ + isLe.s = 1; + leFlags = isLe.c ? TCL_ENCODING_LE : 0; + Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); @@ -611,7 +620,7 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2"; - type.clientData = INT2PTR(isLe.c); + type.clientData = INT2PTR(leFlags); Tcl_CreateEncoding(&type); type.toUtfProc = Utf32ToUtfProc; @@ -625,7 +634,7 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(0); Tcl_CreateEncoding(&type); type.encodingName = "utf-32"; - type.clientData = INT2PTR(isLe.c); + type.clientData = INT2PTR(leFlags); Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; @@ -639,7 +648,7 @@ TclInitEncodingSubsystem(void) type.clientData = INT2PTR(ENCODING_UTF); Tcl_CreateEncoding(&type); type.encodingName = "utf-16"; - type.clientData = INT2PTR(isLe.c|ENCODING_UTF); + type.clientData = INT2PTR(leFlags|ENCODING_UTF); Tcl_CreateEncoding(&type); #ifndef TCL_NO_DEPRECATED @@ -1222,8 +1231,6 @@ Tcl_ExternalToUtfDString( * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags - * - TCL_ENCODING_MODIFIED: enable Tcl internal conversion mapping \xC0\x80 - * to 0x00. 
Only valid for "utf-8" and "cesu-8". * Any other flag bits will cause an error to be returned (for future * compatibility) * @@ -1518,8 +1525,6 @@ Tcl_UtfToExternalDString( * - *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT} * - TCL_ENCODING_STOPONERROR: Backward compatibility. Sets the profile * to TCL_ENCODING_PROFILE_STRICT overriding any specified profile flags - * - TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 instead - * of 0x00. Only valid for "utf-8" and "cesu-8". * * Results: * The return value is one of @@ -2466,7 +2471,7 @@ BinaryProc( static int UtfToUtfProc( - void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ + void *clientData, /* additional flags */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* TCL_ENCODING_* conversion control flags. */ @@ -2536,7 +2541,7 @@ UtfToUtfProc( *dst++ = *src++; } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && - (UCHAR(src[1]) == 0x80) && !(flags & TCL_ENCODING_MODIFIED) && + (UCHAR(src[1]) == 0x80) && (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) || PROFILE_REPLACE(profile))) { /* Special sequence \xC0\x80 */ -- cgit v0.12 From 44fdf09f7bf8a1e0ae30d1eaea83d5cd1d2fdca2 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 2 Mar 2023 06:41:39 +0000 Subject: Bug [e778e3f804]. Fix error message for invalid profile name. --- generic/tclEncoding.c | 28 +++++++++++++++++++--------- tests/encoding.test | 8 ++++++++ tests/io.test | 2 +- tests/ioCmd.test | 4 ++++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 1d336f5..b32db7c 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -188,15 +188,16 @@ static Tcl_Encoding systemEncoding = NULL; Tcl_Encoding tclIdentityEncoding = NULL; /* - * Names of encoding profiles and corresponding integer values + * Names of encoding profiles and corresponding integer values. 
+ * Keep alphabetical order for error messages. */ static struct TclEncodingProfiles { const char *name; int value; } encodingProfiles[] = { - {"tcl8", TCL_ENCODING_PROFILE_TCL8}, - {"strict", TCL_ENCODING_PROFILE_STRICT}, {"replace", TCL_ENCODING_PROFILE_REPLACE}, + {"strict", TCL_ENCODING_PROFILE_STRICT}, + {"tcl8", TCL_ENCODING_PROFILE_TCL8}, }; #define PROFILE_STRICT(flags_) \ ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \ @@ -4418,19 +4419,28 @@ TclEncodingProfileNameToId( int *profilePtr) /* Output */ { size_t i; + size_t numProfiles = sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); - for (i = 0; i < sizeof(encodingProfiles) / sizeof(encodingProfiles[0]); ++i) { + for (i = 0; i < numProfiles; ++i) { if (!strcmp(profileName, encodingProfiles[i].name)) { *profilePtr = encodingProfiles[i].value; return TCL_OK; } } if (interp) { - Tcl_SetObjResult( - interp, - Tcl_ObjPrintf( - "bad profile \"%s\". Must be \"tcl8\" or \"strict\".", - profileName)); + Tcl_Obj *errorObj; + /* This code assumes at least two profiles :-) */ + errorObj = + Tcl_ObjPrintf("bad profile name \"%s\": must be", + profileName); + for (i = 0; i < (numProfiles - 1); ++i) { + Tcl_AppendStringsToObj( + errorObj, " ", encodingProfiles[i].name, ",", NULL); + } + Tcl_AppendStringsToObj( + errorObj, " or ", encodingProfiles[numProfiles-1].name, NULL); + + Tcl_SetObjResult(interp, errorObj); Tcl_SetErrorCode( interp, "TCL", "ENCODING", "PROFILE", profileName, NULL); } diff --git a/tests/encoding.test b/tests/encoding.test index 800d93b..a51b6c0 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -105,6 +105,14 @@ test encoding-3.2 {Tcl_GetEncodingName, non-null} -setup { } -cleanup { fconfigure stdout -encoding $old } -result {jis0208} +test encoding-3.3 {fconfigure -encodingprofile} -setup { + set old [fconfigure stdout -encodingprofile] +} -body { + fconfigure stdout -encodingprofile replace + fconfigure stdout -encodingprofile +} -cleanup { + fconfigure 
stdout -encodingprofile $old +} -result replace test encoding-4.1 {Tcl_GetEncodingNames} -constraints {testencoding} -setup { cd [makeDirectory tmp] diff --git a/tests/io.test b/tests/io.test index 66dee7d..836a9b8 100644 --- a/tests/io.test +++ b/tests/io.test @@ -7622,7 +7622,7 @@ test io-52.20 {TclCopyChannel & encodings} -setup { set out [open $path(kyrillic.txt) w] # Using "-encoding ascii" means reading the "Á" gives an error - fconfigure $in -encoding ascii -strictencoding 1 + fconfigure $in -encoding ascii -encodingprofile strict fconfigure $out -encoding koi8-r -translation lf fcopy $in $out diff --git a/tests/ioCmd.test b/tests/ioCmd.test index 8c9d870..23cd67e 100644 --- a/tests/ioCmd.test +++ b/tests/ioCmd.test @@ -390,6 +390,10 @@ test iocmd-8.22 {fconfigure command / -nocomplainencoding 0, no error if -strict } -result 0 +test iocmd-8.21 {fconfigure -encodingprofile badprofile} -body { + fconfigure stdin -encodingprofile froboz +} -returnCodes error -result {bad profile name "froboz": must be replace, strict, or tcl8} + test iocmd-9.1 {eof command} { list [catch {eof} msg] $msg $::errorCode } {1 {wrong # args: should be "eof channelId"} {TCL WRONGARGS}} -- cgit v0.12 From 56f5c7751c0f9e4da9c1a40ee533ce392a43e4a2 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 9 Mar 2023 10:47:12 +0000 Subject: Fix SetChannelOption parsing of -encoding* to match GetChannelOption --- generic/tclIO.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index 97ca8d0..4a6dbf4 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -8194,7 +8194,7 @@ Tcl_SetChannelOption( } Tcl_SetChannelBufferSize(chan, newBufferSize); return TCL_OK; - } else if (HaveOpt(2, "-encoding")) { + } else if (HaveOpt(8, "-encoding")) { Tcl_Encoding encoding; int profile; @@ -8230,6 +8230,15 @@ Tcl_SetChannelOption( ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); UpdateInterest(chanPtr); return TCL_OK; + 
} else if (HaveOpt(9, "-encodingprofile")) { + int profile; + if (TclEncodingProfileNameToId(interp, newValue, &profile) != TCL_OK) { + return TCL_ERROR; + } + TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, profile); + TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, profile); + ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); + return TCL_OK; } else if (HaveOpt(2, "-eofchar")) { if (!newValue[0] || (!(newValue[0] & 0x80) && !newValue[1])) { if (GotFlag(statePtr, TCL_READABLE)) { @@ -8285,15 +8294,6 @@ Tcl_SetChannelOption( ResetFlag(statePtr, CHANNEL_EOF|CHANNEL_STICKY_EOF|CHANNEL_BLOCKED); statePtr->inputEncodingFlags &= ~TCL_ENCODING_END; return TCL_OK; - } else if (HaveOpt(1, "-encodingprofile")) { - int profile; - if (TclEncodingProfileNameToId(interp, newValue, &profile) != TCL_OK) { - return TCL_ERROR; - } - TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, profile); - TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, profile); - ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); - return TCL_OK; } else if (HaveOpt(1, "-translation")) { const char *readMode, *writeMode; -- cgit v0.12 From 6f85588bab4bad23425a2fea4e953546b8fa7ca3 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sat, 11 Mar 2023 16:43:36 +0000 Subject: Add testencoding Tcl_ExternalToUtf/Tcl_UtfToExternal for raw testing of corresponding C functions --- generic/tclTest.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 2 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index b3df8ec..a398797 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -2016,6 +2016,156 @@ static void SpecialFree( } /* + *------------------------------------------------------------------------ + * + * UtfTransformFn -- + * + * Implements a direct call into Tcl_UtfToExternal and Tcl_ExternalToUtf + * as otherwise there is no script level command that directly exercises + * these functions (i/o 
command cannot test all combinations) + * The arguments at the script level are roughly those of the above + * functions: + * encodingname srcbytes flags state dstlen ?srcreadvar? ?dstwrotevar? ?dstcharsvar? + * + * Results: + * TCL_OK or TCL_ERROR. This any errors running the test, NOT the + * result of Tcl_UtfToExternal or Tcl_ExternalToUtf. + * + * Side effects: + * The result in the interpreter is a list of the return code from the + * Tcl_UtfToExternal/Tcl_ExternalToUtf functions, the encoding state, and + * the encoded binary string. If any of the srcreadvar, dstwrotevar and + * dstcharsvar are specified and not empty, they are treated as names + * of variables where the *srcRead, *dstWrote and *dstChars output + * from the functions are stored. + *------------------------------------------------------------------------ + */ +typedef int +UtfTransformFn(Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, Tcl_Size srcLen, int flags, Tcl_EncodingState *statePtr, + char *dst, Tcl_Size dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); +static int UtfExtWrapper( + Tcl_Interp *interp, UtfTransformFn *transformer, int objc, Tcl_Obj *const objv[]) +{ + Tcl_Encoding encoding; + int encStateValue; /* Assumes Tcl_EncodingState points to integer!!! */ + Tcl_EncodingState encState; + int flags; + Tcl_Size srcLen, bufLen; + const unsigned char *bytes; + unsigned char *bufPtr; + int srcRead, dstLen, dstWrote, dstChars; + Tcl_Obj *srcReadVar, *dstWroteVar, *dstCharsVar; + int result; + + if (objc < 7 || objc > 10) { + Tcl_WrongNumArgs(interp, + 2, + objv, + "encoding srcbytes flags state dstlen ?srcreadvar? ?dstwrotevar? 
?dstcharsvar?"); + return TCL_ERROR; + } + if (Tcl_GetEncodingFromObj(interp, objv[2], &encoding) != TCL_OK) { + return TCL_ERROR; + } + if (Tcl_GetIntFromObj(interp, objv[4], &flags) != TCL_OK) { + return TCL_ERROR; + } + /* Assumes state is integer if not "" */ + if (Tcl_GetIntFromObj(interp, objv[5], &encStateValue) == TCL_OK) { + encState = (Tcl_EncodingState)&encStateValue; + } else if (Tcl_GetCharLength(objv[5]) == 0) { + encState = NULL; + } else { + return TCL_ERROR; + } + if (Tcl_GetIntFromObj(interp, objv[6], &dstLen) != TCL_OK) { + return TCL_ERROR; + } + srcReadVar = NULL; + dstWroteVar = NULL; + dstCharsVar = NULL; + if (objc > 7) { + /* Has caller requested srcRead? */ + if (Tcl_GetCharLength(objv[7])) { + srcReadVar = objv[7]; + } + if (objc > 8) { + /* Ditto for dstWrote */ + if (Tcl_GetCharLength(objv[8])) { + dstWroteVar = objv[8]; + } + if (objc > 9) { + if (Tcl_GetCharLength(objv[9])) { + dstCharsVar = objv[9]; + } + } + } + } + + bufLen = dstLen + 4; /* 4 -> overflow detection */ + bufPtr = ckalloc(bufLen); + memmove(bufPtr + dstLen, "\xAB\xCD\xEF\x00", 4); /* overflow detection */ + bytes = Tcl_GetByteArrayFromObj(objv[3], &srcLen); /* Last! to avoid shimmering */ + result = (*transformer)(interp, encoding, bytes, srcLen, flags, + &encState, bufPtr, dstLen, + srcReadVar ? &srcRead : NULL, + &dstWrote, + dstCharsVar ? 
&dstChars : NULL); + if (memcmp(bufPtr + bufLen - 4, "\xAB\xCD\xEF\x00", 4)) { + Tcl_SetResult(interp, + "Tcl_ExternalToUtf wrote past output buffer", + TCL_STATIC); + result = TCL_ERROR; + } else { + Tcl_Obj *resultObjs[3]; + switch (result) { + case TCL_OK: + resultObjs[0] = Tcl_NewStringObj("ok", -1); + break; + case TCL_CONVERT_MULTIBYTE: + resultObjs[0] = Tcl_NewStringObj("multibyte", -1); + break; + case TCL_CONVERT_SYNTAX: + resultObjs[0] = Tcl_NewStringObj("syntax", -1); + break; + case TCL_CONVERT_UNKNOWN: + resultObjs[0] = Tcl_NewStringObj("unknown", -1); + break; + case TCL_CONVERT_NOSPACE: + resultObjs[0] = Tcl_NewStringObj("nospace", -1); + break; + default: + resultObjs[0] = Tcl_NewIntObj(result); + break; + } + result = TCL_OK; + resultObjs[1] = + encState ? Tcl_NewIntObj(encStateValue) : Tcl_NewObj(); + resultObjs[2] = Tcl_NewByteArrayObj(bufPtr, dstWrote); + if (srcReadVar) { + if (Tcl_ObjSetVar2(interp, srcReadVar, NULL, Tcl_NewIntObj(srcRead), 0) == NULL) { + result = TCL_ERROR; + } + } + if (dstWroteVar) { + if (Tcl_ObjSetVar2(interp, dstWroteVar, NULL, Tcl_NewIntObj(dstWrote), 0) == NULL) { + result = TCL_ERROR; + } + } + if (dstCharsVar) { + if (Tcl_ObjSetVar2(interp, dstCharsVar, NULL, Tcl_NewIntObj(dstChars), 0) == NULL) { + result = TCL_ERROR; + } + } + Tcl_SetObjResult(interp, Tcl_NewListObj(3, resultObjs)); + } + + ckfree(bufPtr); + Tcl_FreeEncoding(encoding); /* Free returned reference */ + return result; +} + +/* *---------------------------------------------------------------------- * * TestencodingCmd -- @@ -2044,10 +2194,10 @@ TestencodingObjCmd( const char *string; TclEncoding *encodingPtr; static const char *const optionStrings[] = { - "create", "delete", "nullength", NULL + "create", "delete", "nullength", "Tcl_ExternalToUtf", "Tcl_UtfToExternal", NULL }; enum options { - ENC_CREATE, ENC_DELETE, ENC_NULLENGTH + ENC_CREATE, ENC_DELETE, ENC_NULLENGTH, ENC_EXTTOUTF, ENC_UTFTOEXT }; if (objc < 2) { @@ -2116,6 +2266,11 @@ 
TestencodingObjCmd( Tcl_SetObjResult(interp, Tcl_NewIntObj(Tcl_GetEncodingNulLength(encoding))); Tcl_FreeEncoding(encoding); + break; + case ENC_EXTTOUTF: + return UtfExtWrapper(interp,Tcl_ExternalToUtf,objc,objv); + case ENC_UTFTOEXT: + return UtfExtWrapper(interp,Tcl_UtfToExternal,objc,objv); } return TCL_OK; } -- cgit v0.12 From 22239fb7d2e4d9fae7bc87076d655170b791c46b Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Sun, 12 Mar 2023 16:47:08 +0000 Subject: Start on Tcl_ExternalToUtf/Tcl_UtfToExternal tests --- generic/tclTest.c | 124 +++++++++++++++++++++++++++++++++++++++++++++--------- tests/utfext.test | 96 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 199 insertions(+), 21 deletions(-) create mode 100644 tests/utfext.test diff --git a/generic/tclTest.c b/generic/tclTest.c index a398797..eab3eab 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -2032,12 +2032,21 @@ static void SpecialFree( * result of Tcl_UtfToExternal or Tcl_ExternalToUtf. * * Side effects: + * * The result in the interpreter is a list of the return code from the * Tcl_UtfToExternal/Tcl_ExternalToUtf functions, the encoding state, and - * the encoded binary string. If any of the srcreadvar, dstwrotevar and + * an encoded binary string of length dstLen. Note the string is the + * entire output buffer, not just the part containing the decoded + * portion. This allows for additional checks at test script level. + * + * If any of the srcreadvar, dstwrotevar and * dstcharsvar are specified and not empty, they are treated as names * of variables where the *srcRead, *dstWrote and *dstChars output * from the functions are stored. + * + * The function also checks internally whether nuls are correctly + * appended as requested but the TCL_ENCODING_NO_TERMINATE flag + * and that no buffer overflows occur. 
*------------------------------------------------------------------------ */ typedef int @@ -2049,13 +2058,15 @@ static int UtfExtWrapper( Tcl_Encoding encoding; int encStateValue; /* Assumes Tcl_EncodingState points to integer!!! */ Tcl_EncodingState encState; - int flags; Tcl_Size srcLen, bufLen; const unsigned char *bytes; unsigned char *bufPtr; int srcRead, dstLen, dstWrote, dstChars; Tcl_Obj *srcReadVar, *dstWroteVar, *dstCharsVar; int result; + int flags; + Tcl_Obj **flagObjs; + int nflags; if (objc < 7 || objc > 10) { Tcl_WrongNumArgs(interp, @@ -2067,9 +2078,48 @@ static int UtfExtWrapper( if (Tcl_GetEncodingFromObj(interp, objv[2], &encoding) != TCL_OK) { return TCL_ERROR; } - if (Tcl_GetIntFromObj(interp, objv[4], &flags) != TCL_OK) { - return TCL_ERROR; + + /* Flags may be specified as list of integers and keywords */ + flags = 0; + if (Tcl_ListObjGetElements(interp, objv[4], &nflags, &flagObjs) != TCL_OK) { + return TCL_ERROR; + } + + struct { + const char *flagKey; + int flag; + } flagMap[] = { + {"start", TCL_ENCODING_START}, + {"end", TCL_ENCODING_END}, + {"stoponerror", TCL_ENCODING_STOPONERROR}, + {"noterminate", TCL_ENCODING_NO_TERMINATE}, + {"charlimit", TCL_ENCODING_CHAR_LIMIT}, + {"profiletcl8", TCL_ENCODING_PROFILE_TCL8}, + {"profilestrict", TCL_ENCODING_PROFILE_STRICT}, + {"profilereplace", TCL_ENCODING_PROFILE_REPLACE}, + {NULL, 0} + }; + int i; + for (i = 0; i < nflags; ++i) { + int flag; + if (Tcl_GetIntFromObj(NULL, flagObjs[i], &flag) == TCL_OK) { + flags |= flag; + } + else { + int idx; + if (Tcl_GetIndexFromObjStruct(interp, + flagObjs[i], + flagMap, + sizeof(flagMap[0]), + "flag", + 0, + &idx) != TCL_OK) { + return TCL_ERROR; + } + flags |= flagMap[idx].flag; + } } + /* Assumes state is integer if not "" */ if (Tcl_GetIntFromObj(interp, objv[5], &encStateValue) == TCL_OK) { encState = (Tcl_EncodingState)&encStateValue; @@ -2097,27 +2147,47 @@ static int UtfExtWrapper( if (objc > 9) { if (Tcl_GetCharLength(objv[9])) { dstCharsVar = 
objv[9]; - } + } } } } + if (flags & TCL_ENCODING_CHAR_LIMIT) { + /* Caller should have specified the dest char limit */ + Tcl_Obj *valueObj; + if (dstCharsVar == NULL || + (valueObj = Tcl_ObjGetVar2(interp, dstCharsVar, NULL, 0)) == NULL + ) { + Tcl_SetResult(interp, + "dstCharsVar must be specified with integer value if " + "TCL_ENCODING_CHAR_LIMIT set in flags.", TCL_STATIC); + return TCL_ERROR; + } + if (Tcl_GetIntFromObj(interp, dstCharsVar, &dstChars) != TCL_OK) { + return TCL_ERROR; + } + } else { + dstChars = 0; /* Only used for output */ + } bufLen = dstLen + 4; /* 4 -> overflow detection */ bufPtr = ckalloc(bufLen); - memmove(bufPtr + dstLen, "\xAB\xCD\xEF\x00", 4); /* overflow detection */ + memset(bufPtr, 0xFF, dstLen); /* Need to check nul terminator */ + memmove(bufPtr + dstLen, "\xAB\xCD\xEF\xAB", 4); /* overflow detection */ bytes = Tcl_GetByteArrayFromObj(objv[3], &srcLen); /* Last! to avoid shimmering */ result = (*transformer)(interp, encoding, bytes, srcLen, flags, &encState, bufPtr, dstLen, srcReadVar ? &srcRead : NULL, &dstWrote, dstCharsVar ? &dstChars : NULL); - if (memcmp(bufPtr + bufLen - 4, "\xAB\xCD\xEF\x00", 4)) { + if (memcmp(bufPtr + bufLen - 4, "\xAB\xCD\xEF\xAB", 4)) { Tcl_SetResult(interp, "Tcl_ExternalToUtf wrote past output buffer", TCL_STATIC); result = TCL_ERROR; - } else { + } else if (result != TCL_ERROR) { + Tcl_Obj *resultObjs[3]; + switch (result) { case TCL_OK: resultObjs[0] = Tcl_NewStringObj("ok", -1); @@ -2141,22 +2211,34 @@ static int UtfExtWrapper( result = TCL_OK; resultObjs[1] = encState ? 
Tcl_NewIntObj(encStateValue) : Tcl_NewObj(); - resultObjs[2] = Tcl_NewByteArrayObj(bufPtr, dstWrote); + resultObjs[2] = Tcl_NewByteArrayObj(bufPtr, dstLen); if (srcReadVar) { - if (Tcl_ObjSetVar2(interp, srcReadVar, NULL, Tcl_NewIntObj(srcRead), 0) == NULL) { - result = TCL_ERROR; - } - } + if (Tcl_ObjSetVar2(interp, + srcReadVar, + NULL, + Tcl_NewIntObj(srcRead), + TCL_LEAVE_ERR_MSG) == NULL) { + result = TCL_ERROR; + } + } if (dstWroteVar) { - if (Tcl_ObjSetVar2(interp, dstWroteVar, NULL, Tcl_NewIntObj(dstWrote), 0) == NULL) { - result = TCL_ERROR; - } - } + if (Tcl_ObjSetVar2(interp, + dstWroteVar, + NULL, + Tcl_NewIntObj(dstWrote), + TCL_LEAVE_ERR_MSG) == NULL) { + result = TCL_ERROR; + } + } if (dstCharsVar) { - if (Tcl_ObjSetVar2(interp, dstCharsVar, NULL, Tcl_NewIntObj(dstChars), 0) == NULL) { - result = TCL_ERROR; - } - } + if (Tcl_ObjSetVar2(interp, + dstCharsVar, + NULL, + Tcl_NewIntObj(dstChars), + TCL_LEAVE_ERR_MSG) == NULL) { + result = TCL_ERROR; + } + } Tcl_SetObjResult(interp, Tcl_NewListObj(3, resultObjs)); } diff --git a/tests/utfext.test b/tests/utfext.test new file mode 100644 index 0000000..61e36b8 --- /dev/null +++ b/tests/utfext.test @@ -0,0 +1,96 @@ +# This file contains a collection of tests for Tcl_ExternalToUtf and +# Tcl_UtfToExternal. Sourcing this file into Tcl runs the tests and generates +# errors. No output means no errors found. +# +# Copyright (c) 2023 Ashok P. Nadkarni +# +# See the file "license.terms" for information on usage and redistribution +# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
+ +if {"::tcltest" ni [namespace children]} { + package require tcltest 2.5 + namespace import -force ::tcltest::* +} + +::tcltest::loadTestedCommands +catch [list package require -exact tcl::test [info patchlevel]] + +testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint testencoding [llength [info commands testencoding]] + +# Maps encoded bytes string to utf-8 equivalents, both in hex +# encoding utf-8 encdata +lappend utfExtMap {*}{ + ascii 414243 414243 +} + +if {[info commands printable] eq ""} { + proc printable {s} { + set print "" + foreach c [split $s ""] { + set i [scan $c %c] + if {[string is print $c] && ($i <= 127)} { + append print $c + } elseif {$i <= 0xff} { + append print \\x[format %02X $i] + } elseif {$i <= 0xffff} { + append print \\u[format %04X $i] + } else { + append print \\U[format %08X $i] + } + } + return $print + } +} + +# Simple test with basic flags +proc testbasic {direction enc hexin hexout {flags {start end}}} { + if {$direction eq "toutf"} { + set cmd Tcl_ExternalToUtf + } else { + set cmd Tcl_UtfToExternal + } + set in [binary decode hex $hexin] + set out [binary decode hex $hexout] + set dstlen 40 ;# Should be enough for all encoding tests + + # The C wrapper fills entire destination buffer with FF. 
+ # Anything beyond expected output should have FF's + set filler [string repeat \xFF $dstlen] + set result [string range "$out$filler" 0 $dstlen-1] + test $cmd-$enc-$hexin-[join $flags -] "$cmd - $enc - $hexin - $flags" -body \ + [list testencoding $cmd $enc $in $flags {} $dstlen] \ + -result [list ok {} $result] + foreach profile [encoding profiles] { + set flags2 [linsert $flags end profile$profile] + test $cmd-$enc-$hexin-[join $flags2 -] "$cmd - $enc - $hexin - $flags" -body \ + [list testencoding $cmd $enc $in $flags2 {} $dstlen] \ + -result [list ok {} $result] + } +} + +# +# Basic tests +foreach {enc utfhex hex} $utfExtMap { + # Basic test - TCL_ENCODING_START|TCL_ENCODING_END + # Note by default output should be terminated with \0 + testbasic toutf $enc $hex ${utfhex}00 {start end} + testbasic fromutf $enc $utfhex ${hex}00 {start end} + + # Test TCL_ENCODING_NO_TERMINATE + testbasic toutf $enc $hex $utfhex {start end noterminate} + # knownBug - noterminate not obeyed by fromutf + # testbasic fromutf $enc $utfhex $hex {start end noterminate} +} + +# Test for insufficient space +test xx-bufferoverflow {buffer overflow Tcl_ExternalToUtf} -body { + testencoding Tcl_UtfToExternal unicode A {start end} {} 1 +} -result {nospace {} {}} + +::tcltest::cleanupTests +return + +# Local Variables: +# mode: tcl +# End: -- cgit v0.12 From 95158a2d57b3724c868c22025657b56c2812f4d5 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Mon, 13 Mar 2023 16:32:55 +0000 Subject: Fix passing of encoding state in testencoding Tcl_UtfToExternal --- generic/tclTest.c | 30 ++++++++++++++++-------------- tests/utfext.test | 5 +++++ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index eab3eab..6860e53 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -2031,19 +2031,19 @@ static void SpecialFree( * TCL_OK or TCL_ERROR. This any errors running the test, NOT the * result of Tcl_UtfToExternal or Tcl_ExternalToUtf. 
* - * Side effects: + * Side effects: * * The result in the interpreter is a list of the return code from the * Tcl_UtfToExternal/Tcl_ExternalToUtf functions, the encoding state, and * an encoded binary string of length dstLen. Note the string is the * entire output buffer, not just the part containing the decoded * portion. This allows for additional checks at test script level. - * - * If any of the srcreadvar, dstwrotevar and + * + * If any of the srcreadvar, dstwrotevar and * dstcharsvar are specified and not empty, they are treated as names * of variables where the *srcRead, *dstWrote and *dstChars output * from the functions are stored. - * + * * The function also checks internally whether nuls are correctly * appended as requested but the TCL_ENCODING_NO_TERMINATE flag * and that no buffer overflows occur. @@ -2056,8 +2056,7 @@ static int UtfExtWrapper( Tcl_Interp *interp, UtfTransformFn *transformer, int objc, Tcl_Obj *const objv[]) { Tcl_Encoding encoding; - int encStateValue; /* Assumes Tcl_EncodingState points to integer!!! 
*/ - Tcl_EncodingState encState; + Tcl_EncodingState encState, *encStatePtr; Tcl_Size srcLen, bufLen; const unsigned char *bytes; unsigned char *bufPtr; @@ -2121,13 +2120,16 @@ static int UtfExtWrapper( } /* Assumes state is integer if not "" */ - if (Tcl_GetIntFromObj(interp, objv[5], &encStateValue) == TCL_OK) { - encState = (Tcl_EncodingState)&encStateValue; + Tcl_WideInt wide; + if (Tcl_GetWideIntFromObj(interp, objv[5], &wide) == TCL_OK) { + encState = (Tcl_EncodingState) wide; + encStatePtr = &encState; } else if (Tcl_GetCharLength(objv[5]) == 0) { - encState = NULL; + encStatePtr = NULL; } else { return TCL_ERROR; } + if (Tcl_GetIntFromObj(interp, objv[6], &dstLen) != TCL_OK) { return TCL_ERROR; } @@ -2162,7 +2164,7 @@ static int UtfExtWrapper( "TCL_ENCODING_CHAR_LIMIT set in flags.", TCL_STATIC); return TCL_ERROR; } - if (Tcl_GetIntFromObj(interp, dstCharsVar, &dstChars) != TCL_OK) { + if (Tcl_GetIntFromObj(interp, valueObj, &dstChars) != TCL_OK) { return TCL_ERROR; } } else { @@ -2170,12 +2172,12 @@ static int UtfExtWrapper( } bufLen = dstLen + 4; /* 4 -> overflow detection */ - bufPtr = ckalloc(bufLen); + bufPtr = (unsigned char *) ckalloc(bufLen); memset(bufPtr, 0xFF, dstLen); /* Need to check nul terminator */ memmove(bufPtr + dstLen, "\xAB\xCD\xEF\xAB", 4); /* overflow detection */ bytes = Tcl_GetByteArrayFromObj(objv[3], &srcLen); /* Last! to avoid shimmering */ - result = (*transformer)(interp, encoding, bytes, srcLen, flags, - &encState, bufPtr, dstLen, + result = (*transformer)(interp, encoding, (const char *)bytes, srcLen, flags, + encStatePtr, (char *) bufPtr, dstLen, srcReadVar ? &srcRead : NULL, &dstWrote, dstCharsVar ? &dstChars : NULL); @@ -2210,7 +2212,7 @@ static int UtfExtWrapper( } result = TCL_OK; resultObjs[1] = - encState ? Tcl_NewIntObj(encStateValue) : Tcl_NewObj(); + encStatePtr ? 
Tcl_NewWideIntObj((Tcl_WideInt)encState) : Tcl_NewObj(); resultObjs[2] = Tcl_NewByteArrayObj(bufPtr, dstLen); if (srcReadVar) { if (Tcl_ObjSetVar2(interp, diff --git a/tests/utfext.test b/tests/utfext.test index 61e36b8..6cf3dd7 100644 --- a/tests/utfext.test +++ b/tests/utfext.test @@ -88,6 +88,11 @@ test xx-bufferoverflow {buffer overflow Tcl_ExternalToUtf} -body { testencoding Tcl_UtfToExternal unicode A {start end} {} 1 } -result {nospace {} {}} +# Another bug - char limit not obeyed +# % set cv 2 +# % testencoding Tcl_ExternalToUtf utf-8 abcdefgh {start end noterminate charlimit} {} 20 rv wv cv +# nospace {} abcÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ + ::tcltest::cleanupTests return -- cgit v0.12 From a3c59e320df775f0d6849e5d3163292280b3b386 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Thu, 16 Mar 2023 03:08:12 +0000 Subject: Change -encodingprofile to -profile --- generic/tclIO.c | 58 +++++++++++++++++++++++++-------------------------- tests/chanio.test | 6 +++--- tests/encoding.test | 10 ++++----- tests/io.test | 44 +++++++++++++++++++------------------- tests/ioCmd.test | 26 +++++++++++------------ tests/winConsole.test | 14 ++++++------- tests/zlib.test | 4 ++-- 7 files changed, 81 insertions(+), 81 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index f24eaa0..dbdbda5 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -7810,7 +7810,7 @@ Tcl_BadChannelOption( { if (interp != NULL) { const char *genericopt = - "blocking buffering buffersize encoding encodingprofile eofchar translation"; + "blocking buffering buffersize encoding eofchar profile translation"; const char **argv; int argc, i; Tcl_DString ds; @@ -7951,7 +7951,7 @@ Tcl_GetChannelOption( return TCL_OK; } } - if (len == 0 || HaveOpt(8, "-encoding")) { + if (len == 0 || HaveOpt(2, "-encoding")) { if (len == 0) { Tcl_DStringAppendElement(dsPtr, "-encoding"); } @@ -7965,23 +7965,6 @@ Tcl_GetChannelOption( return TCL_OK; } } - if (len == 0 || HaveOpt(9, "-encodingprofile")) { - int profile; - const char 
*profileName; - if (len == 0) { - Tcl_DStringAppendElement(dsPtr, "-encodingprofile"); - } - /* Note currently input and output profiles are same */ - profile = TCL_ENCODING_PROFILE_GET(statePtr->inputEncodingFlags); - profileName = TclEncodingProfileIdToName(interp, profile); - if (profileName == NULL) { - return TCL_ERROR; - } - Tcl_DStringAppendElement(dsPtr, profileName); - if (len > 0) { - return TCL_OK; - } - } if (len == 0 || HaveOpt(2, "-eofchar")) { if (len == 0) { Tcl_DStringAppendElement(dsPtr, "-eofchar"); @@ -8025,6 +8008,23 @@ Tcl_GetChannelOption( return TCL_OK; } } + if (len == 0 || HaveOpt(1, "-profile")) { + int profile; + const char *profileName; + if (len == 0) { + Tcl_DStringAppendElement(dsPtr, "-profile"); + } + /* Note currently input and output profiles are same */ + profile = TCL_ENCODING_PROFILE_GET(statePtr->inputEncodingFlags); + profileName = TclEncodingProfileIdToName(interp, profile); + if (profileName == NULL) { + return TCL_ERROR; + } + Tcl_DStringAppendElement(dsPtr, profileName); + if (len > 0) { + return TCL_OK; + } + } if (len == 0 || HaveOpt(1, "-translation")) { if (len == 0) { Tcl_DStringAppendElement(dsPtr, "-translation"); @@ -8194,7 +8194,7 @@ Tcl_SetChannelOption( } Tcl_SetChannelBufferSize(chan, newBufferSize); return TCL_OK; - } else if (HaveOpt(8, "-encoding")) { + } else if (HaveOpt(2, "-encoding")) { Tcl_Encoding encoding; int profile; @@ -8230,15 +8230,6 @@ Tcl_SetChannelOption( ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); UpdateInterest(chanPtr); return TCL_OK; - } else if (HaveOpt(9, "-encodingprofile")) { - int profile; - if (TclEncodingProfileNameToId(interp, newValue, &profile) != TCL_OK) { - return TCL_ERROR; - } - TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, profile); - TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, profile); - ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); - return TCL_OK; } else if (HaveOpt(2, "-eofchar")) { if (!newValue[0] || 
(!(newValue[0] & 0x80) && !newValue[1])) { if (GotFlag(statePtr, TCL_READABLE)) { @@ -8294,6 +8285,15 @@ Tcl_SetChannelOption( ResetFlag(statePtr, CHANNEL_EOF|CHANNEL_STICKY_EOF|CHANNEL_BLOCKED); statePtr->inputEncodingFlags &= ~TCL_ENCODING_END; return TCL_OK; + } else if (HaveOpt(1, "-profile")) { + int profile; + if (TclEncodingProfileNameToId(interp, newValue, &profile) != TCL_OK) { + return TCL_ERROR; + } + TCL_ENCODING_PROFILE_SET(statePtr->inputEncodingFlags, profile); + TCL_ENCODING_PROFILE_SET(statePtr->outputEncodingFlags, profile); + ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); + return TCL_OK; } else if (HaveOpt(1, "-translation")) { const char *readMode, *writeMode; diff --git a/tests/chanio.test b/tests/chanio.test index 6da6305..d2008e6 100644 --- a/tests/chanio.test +++ b/tests/chanio.test @@ -254,7 +254,7 @@ test chan-io-3.3 {WriteChars: compatibility with WriteBytes: flush on line} -bod test chan-io-3.4 {WriteChars: loop over stage buffer} -body { # stage buffer maps to more than can be queued at once. set f [open $path(test1) w] - chan configure $f -encoding jis0208 -buffersize 16 -encodingprofile tcl8 + chan configure $f -encoding jis0208 -buffersize 16 -profile tcl8 chan puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] chan close $f @@ -267,7 +267,7 @@ test chan-io-3.5 {WriteChars: saved != 0} -body { # be moved to beginning of next channel buffer to preserve requested # buffersize. set f [open $path(test1) w] - chan configure $f -encoding jis0208 -buffersize 17 -encodingprofile tcl8 + chan configure $f -encoding jis0208 -buffersize 17 -profile tcl8 chan puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] chan close $f @@ -300,7 +300,7 @@ test chan-io-3.7 {WriteChars: (bufPtr->nextAdded > bufPtr->length)} -body { # on flush. The truncated bytes are moved to the beginning of the next # channel buffer. 
set f [open $path(test1) w] - chan configure $f -encoding jis0208 -buffersize 17 -encodingprofile tcl8 + chan configure $f -encoding jis0208 -buffersize 17 -profile tcl8 chan puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] chan close $f diff --git a/tests/encoding.test b/tests/encoding.test index 1af5a26..31f966c 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -105,13 +105,13 @@ test encoding-3.2 {Tcl_GetEncodingName, non-null} -setup { } -cleanup { fconfigure stdout -encoding $old } -result {jis0208} -test encoding-3.3 {fconfigure -encodingprofile} -setup { - set old [fconfigure stdout -encodingprofile] +test encoding-3.3 {fconfigure -profile} -setup { + set old [fconfigure stdout -profile] } -body { - fconfigure stdout -encodingprofile replace - fconfigure stdout -encodingprofile + fconfigure stdout -profile replace + fconfigure stdout -profile } -cleanup { - fconfigure stdout -encodingprofile $old + fconfigure stdout -profile $old } -result replace test encoding-4.1 {Tcl_GetEncodingNames} -constraints {testencoding} -setup { diff --git a/tests/io.test b/tests/io.test index fc126de..c3c0cdd 100644 --- a/tests/io.test +++ b/tests/io.test @@ -274,7 +274,7 @@ test io-3.4 {WriteChars: loop over stage buffer} -body { # stage buffer maps to more than can be queued at once. set f [open $path(test1) w] - fconfigure $f -encoding jis0208 -buffersize 16 -encodingprofile tcl8 + fconfigure $f -encoding jis0208 -buffersize 16 -profile tcl8 puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] close $f @@ -288,7 +288,7 @@ test io-3.5 {WriteChars: saved != 0} -body { # requested buffersize. 
set f [open $path(test1) w] - fconfigure $f -encoding jis0208 -buffersize 17 -encodingprofile tcl8 + fconfigure $f -encoding jis0208 -buffersize 17 -profile tcl8 puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] close $f @@ -321,7 +321,7 @@ test io-3.7 {WriteChars: (bufPtr->nextAdded > bufPtr->length)} -body { # of the next channel buffer. set f [open $path(test1) w] - fconfigure $f -encoding jis0208 -buffersize 17 -encodingprofile tcl8 + fconfigure $f -encoding jis0208 -buffersize 17 -profile tcl8 puts -nonewline $f "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" set x [list [contents $path(test1)]] close $f @@ -7634,7 +7634,7 @@ test io-52.20 {TclCopyChannel & encodings} -setup { set out [open $path(kyrillic.txt) w] # Using "-encoding ascii" means reading the "Á" gives an error - fconfigure $in -encoding ascii -encodingprofile strict + fconfigure $in -encoding ascii -profile strict fconfigure $out -encoding koi8-r -translation lf fcopy $in $out @@ -7656,7 +7656,7 @@ test io-52.21 {TclCopyChannel & encodings} -setup { # Using "-encoding ascii" means writing the "Á" gives an error fconfigure $in -encoding utf-8 - fconfigure $out -encoding ascii -translation lf -encodingprofile strict + fconfigure $out -encoding ascii -translation lf -profile strict fcopy $in $out } -cleanup { @@ -7676,7 +7676,7 @@ test io-52.22 {TclCopyChannel & encodings} -setup { set out [open $path(kyrillic.txt) w] # Using "-encoding ascii" means reading the "Á" gives an error - fconfigure $in -encoding ascii -encodingprofile strict + fconfigure $in -encoding ascii -profile strict fconfigure $out -encoding koi8-r -translation lf proc ::xxx args { set ::s0 $args @@ -7704,7 +7704,7 @@ test io-52.23 {TclCopyChannel & encodings} -setup { # Using "-encoding ascii" means writing the "Á" gives an error fconfigure $in -encoding utf-8 - fconfigure $out -encoding ascii -translation lf -encodingprofile strict + fconfigure $out -encoding ascii -translation lf -profile strict proc 
::xxx args { set ::s0 $args } @@ -9073,7 +9073,7 @@ test io-75.1 {multibyte encoding error read results in raw bytes} -setup { puts -nonewline $f A\xC0\x40 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -encodingprofile tcl8 -buffering none + fconfigure $f -encoding utf-8 -profile tcl8 -buffering none } -body { set d [read $f] binary scan $d H* hd @@ -9083,10 +9083,10 @@ test io-75.1 {multibyte encoding error read results in raw bytes} -setup { removeFile io-75.1 } -result 41c040 -test io-75.2 {unrepresentable character write passes and is replaced by ? (-encodingprofile tcl8)} -setup { +test io-75.2 {unrepresentable character write passes and is replaced by ? (-profile tcl8)} -setup { set fn [makeFile {} io-75.2] set f [open $fn w+] - fconfigure $f -encoding iso8859-1 -encodingprofile tcl8 + fconfigure $f -encoding iso8859-1 -profile tcl8 } -body { puts -nonewline $f A\u2022 flush $f @@ -9100,14 +9100,14 @@ test io-75.2 {unrepresentable character write passes and is replaced by ? (-enco # Incomplete sequence test. # This error may IMHO only be detected with the close. # But the read already returns the incomplete sequence. -test io-75.3 {incomplete multibyte encoding read is ignored (-encodingprofile tcl8)} -setup { +test io-75.3 {incomplete multibyte encoding read is ignored (-profile tcl8)} -setup { set fn [makeFile {} io-75.3] set f [open $fn w+] fconfigure $f -encoding binary puts -nonewline $f "A\xC0" flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -encodingprofile tcl8 + fconfigure $f -encoding utf-8 -buffering none -profile tcl8 } -body { set d [read $f] close $f @@ -9119,7 +9119,7 @@ test io-75.3 {incomplete multibyte encoding read is ignored (-encodingprofile tc # As utf-8 has a special treatment in multi-byte decoding, also test another # one. 
-test io-75.4 {shiftjis encoding error read results in raw bytes (-encodingprofile tcl8)} -setup { +test io-75.4 {shiftjis encoding error read results in raw bytes (-profile tcl8)} -setup { set fn [makeFile {} io-75.4] set f [open $fn w+] fconfigure $f -encoding binary @@ -9128,7 +9128,7 @@ test io-75.4 {shiftjis encoding error read results in raw bytes (-encodingprofil puts -nonewline $f A\x81\xFFA flush $f seek $f 0 - fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -encodingprofile tcl8 + fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -profile tcl8 } -body { set d [read $f] binary scan $d H* hd @@ -9138,14 +9138,14 @@ test io-75.4 {shiftjis encoding error read results in raw bytes (-encodingprofil removeFile io-75.4 } -result 4181ff41 -test io-75.5 {invalid utf-8 encoding read is ignored (-encodingprofile tcl8)} -setup { +test io-75.5 {invalid utf-8 encoding read is ignored (-profile tcl8)} -setup { set fn [makeFile {} io-75.5] set f [open $fn w+] fconfigure $f -encoding binary puts -nonewline $f A\x81 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -encodingprofile tcl8 + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -profile tcl8 } -body { set d [read $f] close $f @@ -9155,7 +9155,7 @@ test io-75.5 {invalid utf-8 encoding read is ignored (-encodingprofile tcl8)} -s removeFile io-75.5 } -result 4181 -test io-75.8 {invalid utf-8 encoding eof handling (-encodingprofile strict)} -setup { +test io-75.8 {invalid utf-8 encoding eof handling (-profile strict)} -setup { set fn [makeFile {} io-75.8] set f [open $fn w+] fconfigure $f -encoding binary @@ -9163,7 +9163,7 @@ test io-75.8 {invalid utf-8 encoding eof handling (-encodingprofile strict)} -se puts -nonewline $f A\x1A\x81 flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar \x1A -translation lf -encodingprofile strict + fconfigure $f -encoding utf-8 -buffering 
none -eofchar \x1A -translation lf -profile strict } -body { set d [read $f] binary scan $d H* hd @@ -9178,7 +9178,7 @@ test io-75.8 {invalid utf-8 encoding eof handling (-encodingprofile strict)} -se test io-75.9 {unrepresentable character write passes and is replaced by ?} -setup { set fn [makeFile {} io-75.9] set f [open $fn w+] - fconfigure $f -encoding iso8859-1 -encodingprofile strict + fconfigure $f -encoding iso8859-1 -profile strict } -body { catch {puts -nonewline $f "A\u2022"} msg flush $f @@ -9222,7 +9222,7 @@ test io-75.11 {shiftjis encoding error read results in raw bytes} -setup { puts -nonewline $f A\x81\xFFA flush $f seek $f 0 - fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -encodingprofile strict + fconfigure $f -encoding shiftjis -buffering none -eofchar "" -translation lf -profile strict } -body { set d [read $f] binary scan $d H* hd @@ -9249,7 +9249,7 @@ test io-75.12 {invalid utf-8 encoding read is ignored} -setup { } -cleanup { removeFile io-75.12 } -result 4181 -test io-75.13 {invalid utf-8 encoding read is not ignored (-encodingprofile strict)} -setup { +test io-75.13 {invalid utf-8 encoding read is not ignored (-profile strict)} -setup { set fn [makeFile {} io-75.13] set f [open $fn w+] fconfigure $f -encoding binary @@ -9257,7 +9257,7 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-encodingprofile stri puts -nonewline $f "A\x81" flush $f seek $f 0 - fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -encodingprofile strict + fconfigure $f -encoding utf-8 -buffering none -eofchar "" -translation lf -profile strict } -body { set d [read $f] binary scan $d H* hd diff --git a/tests/ioCmd.test b/tests/ioCmd.test index 23cd67e..aeb9f87 100644 --- a/tests/ioCmd.test +++ b/tests/ioCmd.test @@ -207,7 +207,7 @@ test iocmd-7.5 {close command} -setup { proc expectedOpts {got extra} { set basicOpts { - -blocking -buffering -buffersize -encoding -encodingprofile -eofchar -translation 
+ -blocking -buffering -buffersize -encoding -eofchar -profile -translation } set opts [list {*}$basicOpts {*}$extra] lset opts end [string cat "or " [lindex $opts end]] @@ -240,33 +240,33 @@ test iocmd-8.7 {fconfigure command} -setup { file delete $path(test1) } -body { set f1 [open $path(test1) w] - fconfigure $f1 -translation lf -eofchar {} -encoding utf-16 -encodingprofile tcl8 + fconfigure $f1 -translation lf -eofchar {} -encoding utf-16 -profile tcl8 fconfigure $f1 } -cleanup { catch {close $f1} -} -result {-blocking 1 -buffering full -buffersize 4096 -encoding utf-16 -encodingprofile tcl8 -eofchar {} -translation lf} +} -result {-blocking 1 -buffering full -buffersize 4096 -encoding utf-16 -eofchar {} -profile tcl8 -translation lf} test iocmd-8.8 {fconfigure command} -setup { file delete $path(test1) set x {} } -body { set f1 [open $path(test1) w] fconfigure $f1 -translation lf -buffering line -buffersize 3030 \ - -eofchar {} -encoding utf-16 -encodingprofile tcl8 + -eofchar {} -encoding utf-16 -profile tcl8 lappend x [fconfigure $f1 -buffering] lappend x [fconfigure $f1] } -cleanup { catch {close $f1} -} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding utf-16 -encodingprofile tcl8 -eofchar {} -translation lf}} +} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding utf-16 -eofchar {} -profile tcl8 -translation lf}} test iocmd-8.9 {fconfigure command} -setup { file delete $path(test1) } -body { set f1 [open $path(test1) w] fconfigure $f1 -translation binary -buffering none -buffersize 4040 \ - -eofchar {} -encoding binary -encodingprofile tcl8 + -eofchar {} -encoding binary -profile tcl8 fconfigure $f1 } -cleanup { catch {close $f1} -} -result {-blocking 1 -buffering none -buffersize 4040 -encoding binary -encodingprofile tcl8 -eofchar {} -translation lf} +} -result {-blocking 1 -buffering none -buffersize 4040 -encoding binary -eofchar {} -profile tcl8 -translation lf} test iocmd-8.10 {fconfigure command} -returnCodes 
error -body { fconfigure a b } -result {can not find channel named "a"} @@ -378,7 +378,7 @@ test iocmd-8.21 {fconfigure command / -nocomplainencoding 0 error} -constraints } -returnCodes error -result "bad value for -nocomplainencoding: only true allowed" test iocmd-8.22 {fconfigure command / -nocomplainencoding 0, no error if -strictencoding already defined} -setup { set console stdin - set oldprofile [fconfigure $console -encodingprofile] + set oldprofile [fconfigure $console -profile] } -constraints { obsolete } -body { @@ -390,8 +390,8 @@ test iocmd-8.22 {fconfigure command / -nocomplainencoding 0, no error if -strict } -result 0 -test iocmd-8.21 {fconfigure -encodingprofile badprofile} -body { - fconfigure stdin -encodingprofile froboz +test iocmd-8.21 {fconfigure -profile badprofile} -body { + fconfigure stdin -profile froboz } -returnCodes error -result {bad profile name "froboz": must be replace, strict, or tcl8} test iocmd-9.1 {eof command} { @@ -1387,7 +1387,7 @@ test iocmd-25.1 {chan configure, cgetall, standard options} -match glob -body { close $c rename foo {} set res -} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding * -encodingprofile * -eofchar {{} {}} -translation {auto *}}} +} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding * -eofchar {{} {}} -profile * -translation {auto *}}} test iocmd-25.2 {chan configure, cgetall, no options} -match glob -body { set res {} proc foo {args} {oninit cget cgetall; onfinal; track; return ""} @@ -1396,7 +1396,7 @@ test iocmd-25.2 {chan configure, cgetall, no options} -match glob -body { close $c rename foo {} set res -} -result {{cgetall rc*} {-blocking 1 -buffering full -buffersize 4096 -encoding * -encodingprofile * -eofchar {{} {}} -translation {auto *}}} +} -result {{cgetall rc*} {-blocking 1 -buffering full -buffersize 4096 -encoding * -eofchar {{} {}} -profile * -translation {auto *}}} test iocmd-25.3 {chan configure, cgetall, regular result} -match glob -body { set res {} 
proc foo {args} { @@ -1408,7 +1408,7 @@ test iocmd-25.3 {chan configure, cgetall, regular result} -match glob -body { close $c rename foo {} set res -} -result {{cgetall rc*} {-blocking 1 -buffering full -buffersize 4096 -encoding * -encodingprofile * -eofchar {{} {}} -translation {auto *} -bar foo -snarf x}} +} -result {{cgetall rc*} {-blocking 1 -buffering full -buffersize 4096 -encoding * -eofchar {{} {}} -profile * -translation {auto *} -bar foo -snarf x}} test iocmd-25.4 {chan configure, cgetall, bad result, list of uneven length} -match glob -body { set res {} proc foo {args} { diff --git a/tests/winConsole.test b/tests/winConsole.test index 62dfbf3..f030444 100644 --- a/tests/winConsole.test +++ b/tests/winConsole.test @@ -198,7 +198,7 @@ test console-fconfigure-get-1.0 { Console get stdin configuration } -constraints {win interactive} -body { lsort [dict keys [fconfigure stdin]] -} -result {-blocking -buffering -buffersize -encoding -encodingprofile -eofchar -inputmode -translation} +} -result {-blocking -buffering -buffersize -encoding -eofchar -inputmode -profile -translation} set testnum 0 foreach {opt result} { @@ -224,7 +224,7 @@ test console-fconfigure-get-1.[incr testnum] { fconfigure -winsize } -constraints {win interactive} -body { fconfigure stdin -winsize -} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, -translation, or -inputmode} -returnCodes error +} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -eofchar, -profile, -translation, or -inputmode} -returnCodes error ## fconfigure get stdout/stderr foreach chan {stdout stderr} major {2 3} { @@ -232,7 +232,7 @@ foreach chan {stdout stderr} major {2 3} { win interactive } -body { lsort [dict keys [fconfigure $chan]] - } -result {-blocking -buffering -buffersize -encoding -encodingprofile -eofchar -translation -winsize} + } -result {-blocking -buffering -buffersize -encoding 
-eofchar -profile -translation -winsize} set testnum 0 foreach {opt result} { -blocking 1 @@ -260,7 +260,7 @@ foreach chan {stdout stderr} major {2 3} { fconfigure -inputmode } -constraints {win interactive} -body { fconfigure $chan -inputmode - } -result {bad option "-inputmode": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, -translation, or -winsize} -returnCodes error + } -result {bad option "-inputmode": should be one of -blocking, -buffering, -buffersize, -encoding, -eofchar, -profile, -translation, or -winsize} -returnCodes error } @@ -330,7 +330,7 @@ test console-fconfigure-set-1.3 { fconfigure stdin -winsize } -constraints {win interactive} -body { fconfigure stdin -winsize {10 30} -} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, -translation, or -inputmode} -returnCodes error +} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -eofchar, -profile, -translation, or -inputmode} -returnCodes error ## fconfigure set stdout,stderr @@ -338,13 +338,13 @@ test console-fconfigure-set-2.0 { fconfigure stdout -winsize } -constraints {win interactive} -body { fconfigure stdout -winsize {10 30} -} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, or -translation} -returnCodes error +} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -eofchar, -profile, or -translation} -returnCodes error test console-fconfigure-set-3.0 { fconfigure stderr -winsize } -constraints {win interactive} -body { fconfigure stderr -winsize {10 30} -} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, -encodingprofile, -eofchar, or -translation} -returnCodes error +} -result {bad option "-winsize": should be one of -blocking, -buffering, -buffersize, -encoding, 
-eofchar, -profile, or -translation} -returnCodes error # Multiple threads diff --git a/tests/zlib.test b/tests/zlib.test index 0566b8b..42d9e9c 100644 --- a/tests/zlib.test +++ b/tests/zlib.test @@ -292,7 +292,7 @@ test zlib-8.6 {transformation and fconfigure} -setup { } -cleanup { catch {close $fd} removeFile $file -} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf -checksum 1 -dictionary {}} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf}} +} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -profile tcl8 -translation lf} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -profile tcl8 -translation lf -checksum 1 -dictionary {}} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -profile tcl8 -translation lf}} test zlib-8.7 {transformation and fconfigure} -setup { set file [makeFile {} test.gz] set fd [open $file wb] @@ -302,7 +302,7 @@ test zlib-8.7 {transformation and fconfigure} -setup { } -cleanup { catch {close $fd} removeFile $file -} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf -checksum 0} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -encodingprofile {} -eofchar {} -translation lf}} +} -result {{-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -profile tcl8 -translation lf} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -profile tcl8 -translation lf -checksum 0} {-blocking 1 -buffering full -buffersize 4096 -encoding binary -eofchar {} -profile tcl8 -translation lf}} # Input is
headers from fetching SPDY draft # Dictionary is that which is proposed _in_ SPDY draft set spdyHeaders "HTTP/1.0 200 OK\r\nContent-Type: text/html; charset=utf-8\r\nX-Robots-Tag: noarchive\r\nLast-Modified: Tue, 05 Jun 2012 02:43:25 GMT\r\nETag: \"1338864205129|#public|0|en|||0\"\r\nExpires: Tue, 05 Jun 2012 16:17:11 GMT\r\nDate: Tue, 05 Jun 2012 16:17:06 GMT\r\nCache-Control: public, max-age=5\r\nX-Content-Type-Options: nosniff\r\nX-XSS-Protection: 1; mode=block\r\nServer: GSE\r\n" -- cgit v0.12