diff options
author | apnadkarni <apnmbx-wits@yahoo.com> | 2024-08-19 08:49:26 (GMT) |
---|---|---|
committer | apnadkarni <apnmbx-wits@yahoo.com> | 2024-08-19 08:49:26 (GMT) |
commit | 22a3d17d0df80cc380166516f4f6dadddcfb8bd5 (patch) | |
tree | b1b485316248d05c479266d9b9d777b3f639c20f | |
parent | 33fcf01a622ce460dd628a489b1930e90db72a46 (diff) | |
parent | 9d36cc55cd2b92193b96f1c100c68dcf3f21a826 (diff) | |
download | tcl-22a3d17d0df80cc380166516f4f6dadddcfb8bd5.zip tcl-22a3d17d0df80cc380166516f4f6dadddcfb8bd5.tar.gz tcl-22a3d17d0df80cc380166516f4f6dadddcfb8bd5.tar.bz2 |
Beef up encoding tests for fragmented and split encodings
-rw-r--r-- | generic/tclTest.c | 22 | ||||
-rw-r--r-- | tests/icuUcmTests.tcl | 4 | ||||
-rw-r--r-- | tests/utfext.test | 252 |
3 files changed, 211 insertions, 67 deletions
diff --git a/generic/tclTest.c b/generic/tclTest.c index 9a7fa39..6c25770 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -2073,7 +2073,7 @@ static void SpecialFree( * encodingname srcbytes flags state dstlen ?srcreadvar? ?dstwrotevar? ?dstcharsvar? * * Results: - * TCL_OK or TCL_ERROR. This any errors running the test, NOT the + * TCL_OK or TCL_ERROR. This indicates any errors running the test, NOT the * result of Tcl_UtfToExternal or Tcl_ExternalToUtf. * * Side effects: @@ -2084,10 +2084,9 @@ static void SpecialFree( * entire output buffer, not just the part containing the decoded * portion. This allows for additional checks at test script level. * - * If any of the srcreadvar, dstwrotevar and - * dstcharsvar are specified and not empty, they are treated as names - * of variables where the *srcRead, *dstWrote and *dstChars output - * from the functions are stored. + * If any of the srcreadvar, dstwrotevar and dstcharsvar are specified and + * not empty, they are treated as names of variables where the *srcRead, + * *dstWrote and *dstChars output from the functions are stored. * * The function also checks internally whether nuls are correctly * appended as requested but the TCL_ENCODING_NO_TERMINATE flag @@ -2121,9 +2120,9 @@ static int UtfExtWrapper( {"end", TCL_ENCODING_END}, {"noterminate", TCL_ENCODING_NO_TERMINATE}, {"charlimit", TCL_ENCODING_CHAR_LIMIT}, - {"profiletcl8", TCL_ENCODING_PROFILE_TCL8}, - {"profilestrict", TCL_ENCODING_PROFILE_STRICT}, - {"profilereplace", TCL_ENCODING_PROFILE_REPLACE}, + {"tcl8", TCL_ENCODING_PROFILE_TCL8}, + {"strict", TCL_ENCODING_PROFILE_STRICT}, + {"replace", TCL_ENCODING_PROFILE_REPLACE}, {NULL, 0} }; Tcl_Size i; @@ -2220,9 +2219,10 @@ static int UtfExtWrapper( &dstWrote, dstCharsVar ? &dstChars : NULL); if (memcmp(bufPtr + bufLen - 4, "\xAB\xCD\xEF\xAB", 4)) { - Tcl_SetResult(interp, - "Tcl_ExternalToUtf wrote past output buffer", - TCL_STATIC); + Tcl_SetObjResult(interp, + Tcl_ObjPrintf("%s wrote past output buffer", + transformer == Tcl_ExternalToUtf ? + "Tcl_ExternalToUtf" : "Tcl_UtfToExternal")); result = TCL_ERROR; } else if (result != TCL_ERROR) { Tcl_Obj *resultObjs[3]; diff --git a/tests/icuUcmTests.tcl b/tests/icuUcmTests.tcl index 3b70748..65082e5 100644 --- a/tests/icuUcmTests.tcl +++ b/tests/icuUcmTests.tcl @@ -2,8 +2,8 @@ # This file is automatically generated by ucm2tests.tcl. # Edits will be overwritten on next generation. # -# Generates tests comparing Tcl encodings to ICU. -# The generated file is NOT standalone. It should be sourced into a test script. +# Tests comparing Tcl encodings to ICU. +# This file is NOT standalone. It should be sourced into a test script. proc ucmConvertfromMismatches {enc map} { set mismatches {} diff --git a/tests/utfext.test b/tests/utfext.test index fd82b16..ecff331 100644 --- a/tests/utfext.test +++ b/tests/utfext.test @@ -1,6 +1,8 @@ # This file contains a collection of tests for Tcl_UtfToExternal and -# Tcl_UtfToExternal. Sourcing this file into Tcl runs the tests and generates -# errors. No output means no errors found. +# Tcl_UtfToExternal that exercise various combinations of flags, +# buffer lengths and fragmentation that cannot be tested by +# normal script level commands. There tests are NOT intended to check +# correct encodings; those are elsewhere. # # Copyright (c) 2023 Ashok P. Nadkarni # @@ -18,69 +20,211 @@ catch [list package require -exact tcl::test [info patchlevel]] testConstraint testbytestring [llength [info commands testbytestring]] testConstraint testencoding [llength [info commands testencoding]] -# Maps encoded bytes string to utf-8 equivalents, both in hex -# encoding utf-8 encdata -lappend utfExtMap {*}{ - ascii 414243 414243 -} +namespace eval utftest { + # Format of table, indexed by encoding. The encodings are not exhaustive + # but one of each kind of encoding transform (algorithmic, table-driven, + # stateful, DBCS, MBCS). + # Each element is list of lists. Nested lists have following fields + # 0 comment (no spaces, might be used to generate id's as well) + # The combination of comment and internal hex (2) should be unique. + # 1 hex representation of internal *modified* utf-8 encoding. This is the + # source string for Tcl_UtfToExternal and expected result for + # Tcl_ExternalToUtf. + # 2 hex representation in specified encoding. This is the source string for + # Tcl_ExternalToUtf and expected result for Tcl_UtfToExternal. + # 3 internal fragmentation index - where to split field 1 for fragmentation + # tests. -1 to skip + # 4 external fragmentation index - where to split field 2 for fragmentation + # tests. -1 to skip + # + # cesu-8 tests disabled because of bug [304d30677a] - TODO + # cesu-8 { + # {bmp {41 c3a9 42} {41 c3a9 42} 2 2} + # {nonbmp {41 f09f9880 42} {41 eda0bd edb080 42} 3 3} + # {null {41 c080 42} {41 00 42} 2 -1} + # } + + lappend utfExtMap {*}{ + ascii { + {basic 414243 414243 -1 -1} + } + utf-8 { + {bmp {41 c3a9 42} {41 c3a9 42} 2 2} + {nonbmp-frag-1 {41 f09f9880 42} {41 f09f9880 42} 2 2} + {nonbmp-frag-2 {41 f09f9880 42} {41 f09f9880 42} 3 3} + {nonbmp-frag-3 {41 f09f9880 42} {41 f09f9880 42} 4 4} + {null {41 c080 42} {41 00 42} 2 -1} + } + utf-16le { + {bmp {41 c3a9 42} {4100 e900 4200} 2 3} + {nonbmp {41 f09f9880 42} {4100 3dd8 00de 4200} 4 3} + {split-surrogate {41 f09f9080 42} {4100 3dd8 00dc 4200} 3 4} + {null {41 c080 42} {4100 0000 4200} 2 3} + } + utf-16be { + {bmp {41 c3a9 42} {0041 00e9 0042} 2 3} + {nonbmp {41 f09f9880 42} {0041 d83d de00 0042} 4 3} + {split-surrogate {41 f09f9080 42} {0041 d83d dc00 0042} 3 4} + {null {41 c080 42} {0041 0000 0042} 2 3} + } + utf-32le { + {bmp {41 c3a9 42} {41000000 e9000000 42000000} 2 3} + {nonbmp {41 f09f9880 42} {41000000 00f60100 42000000} 4 6} + {null {41 c080 42} {41000000 00000000 42000000} 2 3} + } + utf-32be { + {bmp {41 c3a9 42} {00000041 000000e9 00000042} 2 3} + {nonbmp {41 f09f9880 42} {00000041 0001f600 00000042} 4 3} + {null {41 c080 42} {00000041 00000000 00000042} 2 3} + } + iso8859-1 { + {basic {41 c3a9 42} 41e942 2 -1} + {null {41 c080 42} 410042 2 -1} + } + iso8859-3 { + {basic {41 c4a0 42} 41d542 2 -1} + {null {41 c080 42} 410042 2 -1} + } + shiftjis { + {basic {41 e4b98e 42} {41 8cc1 42} 3 2} + } + jis0208 { + {basic {e4b98e e590be} {3843 3863} 1 1} + } + iso2022-jp { + {frag-in-leadescape {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 2} + {frag-in-char {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 5} + {frag-in-trailescape {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 8} + } + } -# Simple test with basic flags -proc testbasic {direction enc hexin hexout {flags {start end}}} { - if {$direction eq "toutf"} { - set cmd Tcl_ExternalToUtf - } else { - set cmd Tcl_UtfToExternal + # Return a binary string containing nul terminator for encoding + proc hexnuls {enc} { + return [binary encode hex [encoding convertto $enc \x00]] } - set in [binary decode hex $hexin] - set out [binary decode hex $hexout] - set dstlen 40 ;# Should be enough for all encoding tests # The C wrapper fills entire destination buffer with FF. # Anything beyond expected output should have FF's - set filler [string repeat \xFF $dstlen] - set result [string range "$out$filler" 0 $dstlen-1] - test $cmd-$enc-$hexin-[join $flags -] "$cmd - $enc - $hexin - $flags" -body \ - [list testencoding $cmd $enc $in $flags {} $dstlen] \ - -result [list ok {} $result] -constraints testencoding - foreach profile [encoding profiles] { - set flags2 [linsert $flags end profile$profile] - test $cmd-$enc-$hexin-[join $flags2 -] "$cmd - $enc - $hexin - $flags" -body \ - [list testencoding $cmd $enc $in $flags2 {} $dstlen] \ - -result [list ok {} $result] -constraints testencoding + proc fill {bin buflen} { + return [string range "$bin[string repeat \xFF $buflen]" 0 $buflen-1] } -} -# -# Basic tests -foreach {enc utfhex hex} $utfExtMap { - # Basic test - TCL_ENCODING_START|TCL_ENCODING_END - # Note by default output should be terminated with \0 - testbasic toutf $enc $hex ${utfhex}00 {start end} - testbasic fromutf $enc $utfhex ${hex}00 {start end} - - # Test TCL_ENCODING_NO_TERMINATE - testbasic toutf $enc $hex $utfhex {start end noterminate} - # knownBug - noterminate not obeyed by fromutf - # testbasic fromutf $enc $utfhex $hex {start end noterminate} -} + proc testutf {direction enc comment hexin hexout args} { + set id $comment-[join $hexin ""] + if {$direction eq "toutf"} { + set cmd Tcl_ExternalToUtf + } else { + set cmd Tcl_UtfToExternal + } + set in [binary decode hex $hexin] + set out [binary decode hex $hexout] + set dstlen 40 ;# Should be enough for all encoding tests + + set status ok + set flags [list start end] + set constraints [list testencoding] + set profiles [encoding profiles] + while {[llength $args] > 1} { + set opt [lpop args 0] + switch $opt { + -flags { set flags [lpop args 0] } + -constraints { lappend constraints {*}[lpop args 0] } + -profiles { set profiles [lpop args 0] } + -status { set status [lpop args 0]} + default { + error "Unknown option \"$opt\"" + } + } + } + if {[llength $args]} { + error "No value supplied for option [lindex $args 0]." + } + + set result [list $status {} [fill $out $dstlen]] + + test $cmd-$enc-$id-[join $flags -] "$cmd - $enc - $hexin - $flags" -body \ + [list testencoding $cmd $enc $in $flags {} $dstlen] \ + -result $result -constraints $constraints + foreach profile $profiles { + set flags2 [linsert $flags end $profile] + test $cmd-$enc-$id-[join $flags2 -] "$cmd - $enc - $hexin - $flags2" -body \ + [list testencoding $cmd $enc $in $flags2 {} $dstlen] \ + -result $result -constraints $constraints + } + } + + proc testfragment {direction enc comment hexin hexout fragindex} { -# Test for insufficient space -test xx-bufferoverflow {buffer overflow Tcl_ExternalToUtf} -body { - testencoding Tcl_UtfToExternal utf-16 A {start end} {} 1 -} -result [list nospace {} \xFF] -constraints testencoding + if {$fragindex < 0} { + # Single byte encodings so no question of fragmentation + return + } + set id $comment-[join $hexin ""]-fragment -# Another bug - char limit not obeyed -# % set cv 2 -# % testencoding Tcl_ExternalToUtf utf-8 abcdefgh {start end noterminate charlimit} {} 20 rv wv cv -# nospace {} abcÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ + if {$direction eq "toutf"} { + set cmd Tcl_ExternalToUtf + } else { + set cmd Tcl_UtfToExternal + } -test TableToUtf-bug-5be203d6ca {Bug 5be203d6ca - truncated prefix in table encoding} -body { - set src \x82\x4F\x82\x50\x82 - lassign [testencoding Tcl_ExternalToUtf shiftjis $src {start profiletcl8} 0 16 srcRead dstWritten charsWritten] buf - set result [list [testencoding Tcl_ExternalToUtf shiftjis $src {start profiletcl8} 0 16 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten] - lappend result {*}[list [testencoding Tcl_ExternalToUtf shiftjis [string range $src $srcRead end] {end profiletcl8} 0 10 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten] -} -result [list [list multibyte 0 \xEF\xBC\x90\xEF\xBC\x91\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 4 6 2 [list ok 0 \xC2\x82\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 1 2 1] -constraints testencoding + set in [binary decode hex $hexin] + set infrag [string range $in 0 $fragindex-1] + set out [binary decode hex $hexout] + set dstlen 40 ;# Should be enough for all encoding tests + + set expected_result {} + append expected_result multibyte $fragindex + + test $cmd-$enc-$id "$cmd - $enc - $hexin - frag" -constraints testencoding -body { + set frag1Result [testencoding $cmd $enc [string range $in 0 $fragindex-1] {start} 0 $dstlen frag1Read frag1Written] + lassign $frag1Result frag1Status frag1State frag1Decoded + set frag2Result [testencoding $cmd $enc [string range $in $frag1Read end] {end} $frag1State $dstlen frag2Read frag2Written] + lassign $frag2Result frag2Status frag2State frag2Decoded + set decoded [string cat [string range $frag1Decoded 0 $frag1Written-1] [string range $frag2Decoded 0 $frag2Written-1]] + list $frag1Status [expr {$frag1Read < $fragindex}] \ + $frag2Status [expr {$frag1Read+$frag2Read}] \ + [expr {$frag1Written+$frag2Written}] $decoded + } -result [list multibyte 1 ok [string length $in] [string length $out] $out] + } + + # + # Basic tests + foreach {enc testcases} $utfExtMap { + foreach testcase $testcases { + lassign $testcase {*}{comment utfhex hex internalfragindex externalfragindex} + + # Basic test - TCL_ENCODING_START|TCL_ENCODING_END + # Note by default output should be terminated with \0 + set encnuls [hexnuls $enc] + testutf toutf $enc $comment $hex ${utfhex}00 + testutf fromutf $enc $comment $utfhex $hex$encnuls + + # Test TCL_ENCODING_NO_TERMINATE + testutf toutf $enc $comment $hex $utfhex -flags {start end noterminate} + # noterminate is specific to ExternalToUtf, + # should have no effect in other direction + testutf fromutf $enc $comment $utfhex $hex$encnuls -flags {start end noterminate} + + testfragment toutf $enc $comment $hex $utfhex $externalfragindex + testfragment fromutf $enc $comment $utfhex $hex $internalfragindex + } + } + + # Bug regression tests + test Tcl_UtfToExternal-bug-183a1adcc0 {buffer overflow} -body { + testencoding Tcl_UtfToExternal utf-16 A {start end} {} 1 + } -result [list nospace {} \xFF] -constraints testencoding + + test Tcl_ExternalToUtf-bug-5be203d6ca { + truncated prefix in table encoding + } -body { + set src \x82\x4F\x82\x50\x82 + set result [list [testencoding Tcl_ExternalToUtf shiftjis $src {start tcl8} 0 16 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten] + lappend result {*}[list [testencoding Tcl_ExternalToUtf shiftjis [string range $src $srcRead end] {end tcl8} 0 10 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten] + } -result [list [list multibyte 0 \xEF\xBC\x90\xEF\xBC\x91\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 4 6 2 [list ok 0 \xC2\x82\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 1 2 1] -constraints testencoding +} +namespace delete utftest ::tcltest::cleanupTests return |