From 2ff0d1c5c1dfa32d96b3d627878eedb04c72b18f Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" <nijtmans@users.sourceforge.net> Date: Thu, 9 Mar 2023 16:20:30 +0000 Subject: Bug-fix for Utf32ToUtfProc, in case TCL_UTF_MAX=3 --- generic/tclEncoding.c | 55 ++++++++++++++++++++++++++++++++++++--------- library/tcltest/tcltest.tcl | 16 ++++++------- tests/io.test | 3 ++- 3 files changed, 54 insertions(+), 20 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 267a667..dacc263 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2712,7 +2712,7 @@ Utf32ToUtfProc( const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; - int ch, bytesLeft = srcLen % 4; + int ch = 0, bytesLeft = srcLen % 4; flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { @@ -2729,6 +2729,21 @@ Utf32ToUtfProc( srcLen -= bytesLeft; } +#if TCL_UTF_MAX < 4 + /* + * If last code point is a high surrogate, we cannot handle that yet, + * unless we are at the end. + */ + + if (!(flags & TCL_ENCODING_END) && (srcLen >= 4) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?3:2)] & 0xFC) == 0xD8) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?2:3)]) == 0) && + ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:4)]) == 0)) { + result = TCL_CONVERT_MULTIBYTE; + srcLen-= 4; + } +#endif + srcStart = src; srcEnd = src + srcLen; @@ -2741,15 +2756,27 @@ Utf32ToUtfProc( break; } +#if TCL_UTF_MAX < 4 + int prev = ch; +#endif if (flags & TCL_ENCODING_LE) { ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF); } else { ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); } +#if TCL_UTF_MAX < 4 + if (((prev & ~0x3FF) == 0xD800) && ((ch & ~0x3FF) != 0xDC00)) { + /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } +#endif if ((unsigned)ch > 0x10FFFF || SURROGATE(ch)) { if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; +#if TCL_UTF_MAX < 4 + ch = 0; +#endif break; } if (PROFILE_REPLACE(flags)) { @@ -2770,6 +2797,12 @@ Utf32ToUtfProc( src += 4; } +#if TCL_UTF_MAX < 4 + if ((ch & ~0x3FF) == 0xD800) { + /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ + dst += Tcl_UniCharToUtf(-1, dst); + } +#endif /* @@ -2780,16 +2813,16 @@ Utf32ToUtfProc( if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; } else { - if (PROFILE_STRICT(flags)) { - result = TCL_CONVERT_SYNTAX; - } else { - /* PROFILE_REPLACE or PROFILE_TCL8 */ - result = TCL_OK; - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - numChars++; - src += bytesLeft; /* Go past truncated code unit */ - } - } + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src += bytesLeft; /* Go past truncated code unit */ + } + } } *srcReadPtr = src - srcStart; diff --git a/library/tcltest/tcltest.tcl b/library/tcltest/tcltest.tcl index 7344f9f..1ba5d9f 100644 --- a/library/tcltest/tcltest.tcl +++ b/library/tcltest/tcltest.tcl @@ -400,7 +400,7 @@ namespace eval tcltest { default { set outputChannel [open $filename a] if {[package vsatisfies [package provide Tcl] 8.7-]} { - fconfigure $outputChannel -encoding utf-8 + fconfigure $outputChannel -profile tcl8 -encoding utf-8 } set ChannelsWeOpened($outputChannel) 1 @@ -447,7 +447,7 @@ namespace eval tcltest { default { set errorChannel [open $filename a] if {[package vsatisfies [package provide Tcl] 8.7-]} { - fconfigure $errorChannel -encoding utf-8 + fconfigure $errorChannel -profile tcl8 -encoding utf-8 } set ChannelsWeOpened($errorChannel) 1 @@ -792,7 +792,7 @@ namespace eval tcltest { if {$Option(-loadfile) eq {}} {return} set tmp [open $Option(-loadfile) r] if {[package vsatisfies [package provide Tcl] 8.7-]} { - fconfigure $tmp -encoding utf-8 + fconfigure $tmp -profile tcl8 -encoding utf-8 } loadScript [read $tmp] close $tmp @@ -1340,7 +1340,7 @@ proc tcltest::DefineConstraintInitializers {} { set code 0 if {![catch {set f [open "|[list [interpreter]]" w]}]} { if {[package vsatisfies [package provide Tcl] 8.7-]} { - fconfigure $f -encoding utf-8 + fconfigure $f -profile tcl8 -encoding utf-8 } if {![catch {puts $f exit}]} { if {![catch {close $f}]} { @@ -2190,7 +2190,7 @@ proc tcltest::test {name description args} { if {[file readable $testFile]} { set testFd [open $testFile r] if {[package vsatisfies [package provide Tcl] 8.7-]} { - fconfigure $testFd -encoding utf-8 + fconfigure $testFd -profile tcl8 -encoding utf-8 } set testLine [expr {[lsearch -regexp \ [split [read $testFd] "\n"] \ @@ -2901,7 +2901,7 @@ proc tcltest::runAllTests { {shell ""} } { incr numTestFiles set pipeFd [open $cmd "r"] if {[package vsatisfies [package provide Tcl] 8.7-]} { - fconfigure $pipeFd -encoding utf-8 + fconfigure $pipeFd -profile tcl8 -encoding utf-8 } while {[gets $pipeFd line] >= 0} { if {[regexp [join { @@ -3101,7 +3101,7 @@ proc tcltest::makeFile {contents name {directory ""}} { set fd [open $fullName w] fconfigure $fd -translation lf if {[package vsatisfies [package provide Tcl] 8.7-]} { - fconfigure $fd -encoding utf-8 + fconfigure $fd -profile tcl8 -encoding utf-8 } if {[string index $contents end] eq "\n"} { puts -nonewline $fd $contents @@ -3252,7 +3252,7 @@ proc tcltest::viewFile {name {directory ""}} { set fullName [file join $directory $name] set f [open $fullName] if {[package vsatisfies [package provide Tcl] 8.7-]} { - fconfigure $f -encoding utf-8 + fconfigure $f -profile tcl8 -encoding utf-8 } set data [read -nonewline $f] close $f diff --git a/tests/io.test b/tests/io.test index 8dde2b2..a8ec7e5 100644 --- a/tests/io.test +++ b/tests/io.test @@ -9292,10 +9292,10 @@ test io-75.12 {invalid utf-8 encoding read is ignored} -setup { fconfigure $f -encoding utf-8 -profile tcl8 -buffering none -eofchar "" -translation lf } -body { set d [read $f] - close $f binary scan $d H* hd set hd } -cleanup { + close $f removeFile io-75.12 } -result 4181 test io-75.13 {invalid utf-8 encoding read is not ignored (-profile strict)} -setup { @@ -9310,6 +9310,7 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-profile strict)} -se } -body { read $f } -cleanup { + close $f removeFile io-75.13 } -match glob -returnCodes 1 -result {error reading "*": illegal byte sequence} -- cgit v0.12