From 9dabd6f1accf6870c461a9ed5ae931aee09de1fe Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 19 Dec 2022 15:40:40 +0000 Subject: When changing the -nocomplainencoding or -strictencoding settings of a channel, reset the CHANNEL_NEED_MORE_DATA and CHANNEL_ENCODING_ERROR flags, so we can continue reading the channel and find out what's going on. --- generic/tclIO.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/generic/tclIO.c b/generic/tclIO.c index 730d963..e6e3560 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -8364,6 +8364,7 @@ Tcl_SetChannelOption( } #endif } + ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); return TCL_OK; } else if (HaveOpt(1, "-strictencoding")) { int newMode; @@ -8377,6 +8378,7 @@ Tcl_SetChannelOption( } else { ResetFlag(statePtr, CHANNEL_ENCODING_STRICT); } + ResetFlag(statePtr, CHANNEL_NEED_MORE_DATA|CHANNEL_ENCODING_ERROR); return TCL_OK; } else if (HaveOpt(1, "-translation")) { const char *readMode, *writeMode; -- cgit v0.12 From 5db223f8ece1c31149d24e5ebddb1d2190aa62f6 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 19 Dec 2022 15:43:22 +0000 Subject: Add two testcases, showing how we can distinguish Invalid Byte sequences, Surrogates, and Noncharacters when an exception occurred. --- tests/io.test | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/tests/io.test b/tests/io.test index d10e1e4..eb49b44 100644 --- a/tests/io.test +++ b/tests/io.test @@ -9201,6 +9201,84 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} - removeFile io-75.13 } -match glob -result {41 1 {error reading "*": illegal byte sequence}} +# Testcase for Rolf's use-case (detecting Invalid byte sequence, but allowing noncharacter) +test io-75.14 {How to use -strict, but allow non-characters} -setup { + set fn [makeFile {} io-75.14] + set f [open $fn w+] + fconfigure $f -encoding binary + # Noncharacter followed by a single + puts -nonewline $f pre\xEF\xBF\xBE\x81post + flush $f + seek $f 0 + fconfigure stdout -nocomplainencoding 1 + catch {fconfigure $f -nocomplainencoding 0};# Only needed on Tcl 9 + fconfigure $f -encoding utf-8 -buffering none -translation lf -strictencoding 1 +} -body { + set hd {} + catch { + while {![eof $f]} { + if {[catch { + append hd [read $f] + }]} { + fconfigure $f -nocomplainencoding 1 -strictencoding 0 + set char [read $f 1] + if {[string is unicode $char]} { + error "InvalidByteSequence" + } elseif {$char >= "\uD800" && $char < "\uE000"} { + error "Surrogate" + } else { + append hd $char + } + catch {fconfigure $f -nocomplainencoding 0};# Only needed on Tcl 9 + fconfigure $f -strictencoding 1 -encoding utf-8 + } + } + } msg + close $f + append hd +$msg +} -cleanup { + removeFile io-75.14 +} -result "pre\uFFFE+InvalidByteSequence" + +# Testcase for Rolf's use-case (detecting Surrogate, but allowing noncharacter) +test io-75.15 {How to use -strict, but allow non-characters} -setup { + set fn [makeFile {} io-75.14] + set f [open $fn w+] + fconfigure $f -encoding utf-8 -nocomplainencoding 1 + # Noncharacter followed by a single + puts -nonewline $f pre\uFFFE\uD800post + flush $f + seek $f 0 + fconfigure stdout -nocomplainencoding 1 + catch {fconfigure $f -nocomplainencoding 0};# Only needed on Tcl 9 + fconfigure $f -buffering none -translation lf -strictencoding 1 +} -body { + set hd {} + catch { + while {![eof $f]} { + if {[catch { + append hd [read $f] + }]} { + fconfigure $f -nocomplainencoding 1 -strictencoding 0 + set char [read $f 1] + if {[string is unicode $char]} { + error "Invalid Byte Sequence" + } elseif {$char >= "\uD800" && $char < "\uE000"} { + error "Surrogate" + } else { + append hd $char + } + catch {fconfigure $f -nocomplainencoding 0};# Only needed on Tcl 9 + fconfigure $f -strictencoding 1 + } + } + } msg + close $f + append hd +$msg +} -cleanup { + removeFile io-75.15 +} -result "pre\uFFFE+Surrogate" + # ### ### ### ######### ######### ######### -- cgit v0.12