From 4813d492cacd8473e3266e284b3d3714f49602ae Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 29 Nov 2022 14:51:03 +0000 Subject: Proposed fix for [084ab982fe]: Use -strict to disable noncharacters --- generic/tclEncoding.c | 27 ++++++++++++++------------- tests/encoding.test | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index eb217b4..5be6a2e 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -562,7 +562,7 @@ TclInitEncodingSubsystem(void) type.nullSize = 1; type.clientData = INT2PTR(TCL_ENCODING_UTF); Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(0); + type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN); type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); @@ -571,13 +571,13 @@ TclInitEncodingSubsystem(void) type.freeProc = NULL; type.nullSize = 2; type.encodingName = "ucs-2le"; - type.clientData = INT2PTR(TCL_ENCODING_LE); + type.clientData = INT2PTR(TCL_ENCODING_LE|TCL_ENCODING_NOCOMPLAIN); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2be"; - type.clientData = INT2PTR(0); + type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN); Tcl_CreateEncoding(&type); type.encodingName = "ucs-2"; - type.clientData = INT2PTR(isLe.c); + type.clientData = INT2PTR(isLe.c|TCL_ENCODING_NOCOMPLAIN); Tcl_CreateEncoding(&type); type.toUtfProc = Utf32ToUtfProc; @@ -2468,15 +2468,16 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; - } else if (!Tcl_UniCharIsUnicode(ch)) { - if (STOPONERROR) { - result = TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; - } - if (!(flags & TCL_ENCODING_MODIFIED)) { - ch = 0xFFFD; - } + } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && !Tcl_UniCharIsUnicode(ch) + && (((ch & ~0x7FF) == 0xD800) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) + && (flags & TCL_ENCODING_MODIFIED) && !Tcl_UniCharIsUnicode(ch)) { + result = TCL_CONVERT_SYNTAX; + src = saveSrc; + break; } dst += Tcl_UniCharToUtf(ch, dst); } diff --git a/tests/encoding.test b/tests/encoding.test index 9aa123d..1125397 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -688,6 +688,42 @@ test encoding-24.27 {Parse invalid utf-8 with -strict} -body { test encoding-24.28 {Parse invalid utf-8 with -strict} -body { encoding convertfrom -strict utf-8 "\xFF\x00\x00" } -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xFF'} +test encoding-24.29 {Parse invalid utf-8} -body { + encoding convertfrom utf-8 \xEF\xBF\xBF +} -result \uFFFF +test encoding-24.30 {Parse invalid utf-8 with -strict} -body { + encoding convertfrom -strict utf-8 \xEF\xBF\xBF +} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xEF'} +test encoding-24.31 {Parse invalid utf-8 with -nocomplain} -body { + encoding convertfrom -nocomplain utf-8 \xEF\xBF\xBF +} -result \uFFFF +test encoding-24.32 {Try to generate invalid utf-8} -body { + encoding convertto utf-8 \uFFFF +} -result \xEF\xBF\xBF +test encoding-24.33 {Try to generate invalid utf-8 with -strict} -body { + encoding convertto -strict utf-8 \uFFFF +} -returnCodes 1 -result {unexpected character at index 0: 'U+00FFFF'} +test encoding-24.34 {Try to generate invalid utf-8 with -nocomplain} -body { + encoding convertto -nocomplain utf-8 \uFFFF +} -result \xEF\xBF\xBF +test encoding-24.35 {Parse invalid utf-8} -constraints deprecated -body { + encoding convertfrom utf-8 \xED\xA0\x80 +} -result \uD800 +test encoding-24.36 {Parse invalid utf-8 with -strict} -body { + encoding convertfrom -strict utf-8 \xED\xA0\x80 +} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xED'} +test encoding-24.37 {Parse invalid utf-8 with -nocomplain} -body { + encoding convertfrom -nocomplain utf-8 \xED\xA0\x80 +} -result \uD800 +test encoding-24.38 {Try to generate invalid utf-8} -constraints deprecated -body { + encoding convertto utf-8 \uD800 +} -result \xED\xA0\x80 +test encoding-24.39 {Try to generate invalid utf-8 with -strict} -body { + encoding convertto -strict utf-8 \uD800 +} -returnCodes 1 -result {unexpected character at index 0: 'U+00D800'} +test encoding-24.40 {Try to generate invalid utf-8 with -nocomplain} -body { + encoding convertto -nocomplain utf-8 \uD800 +} -result \xED\xA0\x80 file delete [file join [temporaryDirectory] iso2022.txt] -- cgit v0.12