From 393743bb7088f57b28cd5f98d2c9f70189807a2e Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 12 Sep 2022 20:41:29 +0000 Subject: Start TIP #346 implementation: For now only \xC0\x80 --- generic/tcl.h | 1 + generic/tclCmdAH.c | 22 ++++++++++++++++------ generic/tclEncoding.c | 8 ++++++-- tests/cmdAH.test | 24 ++++++++++++------------ tests/encoding.test | 4 ++-- tests/safe.test | 8 ++++---- 6 files changed, 41 insertions(+), 26 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index f17d43e..acff803 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2118,6 +2118,7 @@ typedef struct Tcl_EncodingType { #define TCL_ENCODING_CHAR_LIMIT 0x10 #define TCL_ENCODING_MODIFIED 0x20 #define TCL_ENCODING_NOCOMPLAIN 0x40 +#define TCL_ENCODING_STRICT 0x44 /* * The following definitions are the error codes returned by the conversion diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 28fc210..572a995 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -564,8 +564,10 @@ EncodingConvertfromObjCmd( * 2) encoding data -> objc = 3 * 3) -nocomplain data -> objc = 3 * 4) -nocomplain encoding data -> objc = 4 - * 5) -failindex val data -> objc = 4 - * 6) -failindex val encoding data -> objc = 5 + * 5) -strict data -> objc = 3 + * 6) -strict encoding data -> objc = 4 + * 7) -failindex val data -> objc = 4 + * 8) -failindex val encoding data -> objc = 5 */ if (objc == 2) { @@ -579,6 +581,10 @@ EncodingConvertfromObjCmd( && !strncmp(bytesPtr, "-nocomplain", strlen(bytesPtr))) { flags = TCL_ENCODING_NOCOMPLAIN; objcUnprocessed--; + } else if (bytesPtr[0] == '-' && bytesPtr[1] == 's' + && !strncmp(bytesPtr, "-strict", strlen(bytesPtr))) { + flags = TCL_ENCODING_STRICT; + objcUnprocessed--; } else if (bytesPtr[0] == '-' && bytesPtr[1] == 'f' && !strncmp(bytesPtr, "-failindex", strlen(bytesPtr))) { /* at least two additional arguments needed */ @@ -603,7 +609,7 @@ EncodingConvertfromObjCmd( } } else { encConvFromError: - Tcl_WrongNumArgs(interp, 1, objv, "?-nocomplain? ?-failindex var? ?encoding? data"); + Tcl_WrongNumArgs(interp, 1, objv, "?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"); return TCL_ERROR; } @@ -621,7 +627,7 @@ EncodingConvertfromObjCmd( } result = Tcl_ExternalToUtfDStringEx(encoding, bytesPtr, length, flags, &ds); - if (!(flags & TCL_ENCODING_NOCOMPLAIN) && (result != TCL_INDEX_NONE)) { + if ((!(flags & TCL_ENCODING_NOCOMPLAIN) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) && (result != TCL_INDEX_NONE)) { if (failVarObj != NULL) { if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == NULL) { return TCL_ERROR; @@ -714,6 +720,10 @@ EncodingConverttoObjCmd( && !strncmp(stringPtr, "-nocomplain", strlen(stringPtr))) { flags = TCL_ENCODING_NOCOMPLAIN; objcUnprocessed--; + } else if (stringPtr[0] == '-' && stringPtr[1] == 's' + && !strncmp(stringPtr, "-strict", strlen(stringPtr))) { + flags = TCL_ENCODING_STRICT; + objcUnprocessed--; } else if (stringPtr[0] == '-' && stringPtr[1] == 'f' && !strncmp(stringPtr, "-failindex", strlen(stringPtr))) { /* at least two additional arguments needed */ @@ -738,7 +748,7 @@ EncodingConverttoObjCmd( } } else { encConvToError: - Tcl_WrongNumArgs(interp, 1, objv, "?-nocomplain? ?-failindex var? ?encoding? data"); + Tcl_WrongNumArgs(interp, 1, objv, "?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"); return TCL_ERROR; } @@ -749,7 +759,7 @@ EncodingConverttoObjCmd( stringPtr = TclGetStringFromObj(data, &length); result = Tcl_UtfToExternalDStringEx(encoding, stringPtr, length, flags, &ds); - if (!(flags & TCL_ENCODING_NOCOMPLAIN) && (result != TCL_INDEX_NONE)) { + if ((!(flags & TCL_ENCODING_NOCOMPLAIN) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) && (result != TCL_INDEX_NONE)) { if (failVarObj != NULL) { /* I hope, wide int will cover size_t data type */ if (Tcl_ObjSetVar2(interp, failVarObj, NULL, Tcl_NewWideIntObj(result), TCL_LEAVE_ERR_MSG) == NULL) { diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 0ce75b4..9c4b5ce 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2288,7 +2288,7 @@ BinaryProc( */ #if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED) -# define STOPONERROR !(flags & TCL_ENCODING_NOCOMPLAIN) +# define STOPONERROR (!(flags & TCL_ENCODING_NOCOMPLAIN) || (flags & TCL_ENCODING_STOPONERROR)) #else # define STOPONERROR (flags & TCL_ENCODING_STOPONERROR) #endif @@ -2359,10 +2359,14 @@ UtfToUtfProc( *dst++ = *src++; } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && !(flags & TCL_ENCODING_MODIFIED)) { + && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) { /* * Convert 0xC080 to real nulls when we are in output mode. */ + if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { + result = TCL_CONVERT_UNKNOWN; + break; + } *dst++ = 0; src += 2; diff --git a/tests/cmdAH.test b/tests/cmdAH.test index ab1a8e6..64991af 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -178,7 +178,7 @@ test cmdAH-4.2 {Tcl_EncodingObjCmd} -returnCodes error -body { } -result {unknown or ambiguous subcommand "foo": must be convertfrom, convertto, dirs, names, or system} test cmdAH-4.3 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding convertto -} -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-failindex var? ?encoding? data"} +} -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.4 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding convertto foo bar } -result {unknown encoding "foo"} @@ -200,7 +200,7 @@ test cmdAH-4.6 {Tcl_EncodingObjCmd} -setup { } -result 8C test cmdAH-4.7 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding convertfrom -} -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-failindex var? ?encoding? data"} +} -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.8 {Tcl_EncodingObjCmd} -returnCodes error -body { encoding convertfrom foo bar } -result {unknown encoding "foo"} @@ -237,10 +237,10 @@ test cmdAH-4.13 {Tcl_EncodingObjCmd} -setup { test cmdAH-4.14.1 {Syntax error, -nocomplain and -failindex, no encoding} -body { encoding convertfrom -nocomplain -failindex 2 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.14.2 {Syntax error, -nocomplain and -failindex, no encoding} -body { encoding convertto -nocomplain -failindex 2 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.15.1 {Syntax error, -failindex and -nocomplain, no encoding} -body { encoding convertfrom -failindex 2 -nocomplain ABC } -returnCodes 1 -result {unknown encoding "-nocomplain"} @@ -249,19 +249,19 @@ test cmdAH-4.15.2 {Syntax error, -failindex and -nocomplain, no encoding} -body } -returnCodes 1 -result {unknown encoding "-nocomplain"} test cmdAH-4.16.1 {Syntax error, -nocomplain and -failindex, encoding} -body { encoding convertfrom -nocomplain -failindex 2 utf-8 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.16.2 {Syntax error, -nocomplain and -failindex, encoding} -body { encoding convertto -nocomplain -failindex 2 utf-8 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.17.1 {Syntax error, -failindex and -nocomplain, encoding} -body { encoding convertfrom -failindex 2 -nocomplain utf-8 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.17.2 {Syntax error, -failindex and -nocomplain, encoding} -body { encoding convertto -failindex 2 -nocomplain utf-8 ABC -} -returnCodes 1 -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.18.1 {Syntax error, -failindex with no var, no encoding} -body { encoding convertfrom -failindex ABC -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.18.2 {Syntax error, -failindex with no var, no encoding (byte compiled)} -setup { proc encoding_test {} { encoding convertfrom -failindex ABC @@ -269,12 +269,12 @@ test cmdAH-4.18.2 {Syntax error, -failindex with no var, no encoding (byte compi } -body { # Compile and execute encoding_test -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-nocomplain? ?-failindex var? ?encoding? data"} -cleanup { +} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} -cleanup { rename encoding_test "" } test cmdAH-4.18.3 {Syntax error, -failindex with no var, no encoding} -body { encoding convertto -failindex ABC -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test cmdAH-4.18.4 {Syntax error, -failindex with no var, no encoding (byte compiled)} -setup { proc encoding_test {} { encoding convertto -failindex ABC @@ -282,7 +282,7 @@ test cmdAH-4.18.4 {Syntax error, -failindex with no var, no encoding (byte compi } -body { # Compile and execute encoding_test -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-nocomplain? ?-failindex var? ?encoding? data"} -cleanup { +} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} -cleanup { rename encoding_test "" } test cmdAH-4.19.1 {convertrom -failindex with correct data} -body { diff --git a/tests/encoding.test b/tests/encoding.test index 6f11968..c8f409e 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -669,10 +669,10 @@ test encoding-24.21 {Parse with -nocomplain but without providing encoding} { } 1 test encoding-24.22 {Syntax error, two encodings} -body { encoding convertfrom iso8859-1 utf-8 "ZX\uD800" -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertfrom ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test encoding-24.23 {Syntax error, two encodings} -body { encoding convertto iso8859-1 utf-8 "ZX\uD800" -} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-nocomplain? ?-failindex var? ?encoding? data"} +} -returnCodes 1 -result {wrong # args: should be "::tcl::encoding::convertto ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} file delete [file join [temporaryDirectory] iso2022.txt] diff --git a/tests/safe.test b/tests/safe.test index fc7c814..148215a 100644 --- a/tests/safe.test +++ b/tests/safe.test @@ -1269,7 +1269,7 @@ test safe-11.7 {testing safe encoding} -setup { interp eval $i encoding convertfrom } -returnCodes error -cleanup { safe::interpDelete $i -} -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-failindex var? ?encoding? data"} +} -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test safe-11.7.1 {testing safe encoding} -setup { set i [safe::interpCreate] } -body { @@ -1278,7 +1278,7 @@ test safe-11.7.1 {testing safe encoding} -setup { } -match glob -cleanup { unset -nocomplain m o safe::interpDelete $i -} -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-failindex var? ?encoding? data" +} -result {wrong # args: should be "encoding convertfrom ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data" while executing "encoding convertfrom" invoked from within @@ -1291,7 +1291,7 @@ test safe-11.8 {testing safe encoding} -setup { interp eval $i encoding convertto } -returnCodes error -cleanup { safe::interpDelete $i -} -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-failindex var? ?encoding? data"} +} -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data"} test safe-11.8.1 {testing safe encoding} -setup { set i [safe::interpCreate] } -body { @@ -1300,7 +1300,7 @@ test safe-11.8.1 {testing safe encoding} -setup { } -match glob -cleanup { unset -nocomplain m o safe::interpDelete $i -} -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-failindex var? ?encoding? data" +} -result {wrong # args: should be "encoding convertto ?-nocomplain? ?-strict? ?-failindex var? ?encoding? data" while executing "encoding convertto" invoked from within -- cgit v0.12 From a312d4a759c7d925d22e4e4fc8cbbe2d23dc9f27 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 16 Sep 2022 13:44:25 +0000 Subject: Add testcases, and fix a bug found by it --- generic/tclEncoding.c | 9 ++++++--- tests/cmdAH.test | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 9c4b5ce..3d5e474 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2361,13 +2361,16 @@ UtfToUtfProc( } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) { /* - * Convert 0xC080 to real nulls when we are in output mode. + * If in input mode, and -strict is specified: This is an error. */ - if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { + if (flags & TCL_ENCODING_MODIFIED) { result = TCL_CONVERT_UNKNOWN; break; - } + } + /* + * Convert 0xC080 to real nulls when we are in output mode, with or without '-strict'. + */ *dst++ = 0; src += 2; } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 38ca521..69da6c2 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -21,6 +21,7 @@ catch [list package require -exact tcl::test [info patchlevel]] testConstraint testchmod [llength [info commands testchmod]] testConstraint testsetplatform [llength [info commands testsetplatform]] testConstraint testvolumetype [llength [info commands testvolumetype]] +testConstraint testbytestring [llength [info commands testbytestring]] testConstraint time64bit [expr { $::tcl_platform(pointerSize) >= 8 || [llength [info command testsize]] && [testsize st_mtime] >= 8 @@ -349,6 +350,25 @@ test cmdAH-4.21.2 {convertto -failindex with wrong character (byte compiled)} -s } -returnCodes 0 -result {41 1} -cleanup { rename encoding_test "" } +test cmdAH-4.22 {convertfrom -strict} -body { + encoding convertfrom -strict utf-8 A\x00B +} -result A\x00B + +test cmdAH-4.23 {convertfrom -strict} -body { + encoding convertfrom -strict utf-8 A\xC0\x80B +} -returnCodes error -result {unexpected byte sequence starting at index 1: '\xC0'} + +test cmdAH-4.24 {convertto -strict} -body { + encoding convertto -strict utf-8 A\x00B +} -result A\x00B + +test cmdAH-4.25 {convertfrom -strict} -constraints knownBug -body { + encoding convertfrom -strict utf-8 A\x80B +} -returnCodes error -result {unexpected byte sequence starting at index 1: '\x80'} + +test cmdAH-4.26 {convertto -strict} -constraints {testbytestring knownBug} -body { + encoding convertto -strict utf-8 A[testbytestring \x80]B +} -returnCodes error -result {unexpected byte sequence starting at index 1: '\x80'} test cmdAH-5.1 {Tcl_FileObjCmd} -returnCodes error -body { file -- cgit v0.12