summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorapnadkarni <apnmbx-wits@yahoo.com>2024-08-19 08:49:26 (GMT)
committerapnadkarni <apnmbx-wits@yahoo.com>2024-08-19 08:49:26 (GMT)
commit22a3d17d0df80cc380166516f4f6dadddcfb8bd5 (patch)
treeb1b485316248d05c479266d9b9d777b3f639c20f
parent33fcf01a622ce460dd628a489b1930e90db72a46 (diff)
parent9d36cc55cd2b92193b96f1c100c68dcf3f21a826 (diff)
downloadtcl-22a3d17d0df80cc380166516f4f6dadddcfb8bd5.zip
tcl-22a3d17d0df80cc380166516f4f6dadddcfb8bd5.tar.gz
tcl-22a3d17d0df80cc380166516f4f6dadddcfb8bd5.tar.bz2
Beef up encoding tests for fragmented and split encodings
-rw-r--r--generic/tclTest.c22
-rw-r--r--tests/icuUcmTests.tcl4
-rw-r--r--tests/utfext.test252
3 files changed, 211 insertions, 67 deletions
diff --git a/generic/tclTest.c b/generic/tclTest.c
index 9a7fa39..6c25770 100644
--- a/generic/tclTest.c
+++ b/generic/tclTest.c
@@ -2073,7 +2073,7 @@ static void SpecialFree(
* encodingname srcbytes flags state dstlen ?srcreadvar? ?dstwrotevar? ?dstcharsvar?
*
* Results:
- * TCL_OK or TCL_ERROR. This any errors running the test, NOT the
+ * TCL_OK or TCL_ERROR. This indicates any errors running the test, NOT the
* result of Tcl_UtfToExternal or Tcl_ExternalToUtf.
*
* Side effects:
@@ -2084,10 +2084,9 @@ static void SpecialFree(
* entire output buffer, not just the part containing the decoded
* portion. This allows for additional checks at test script level.
*
- * If any of the srcreadvar, dstwrotevar and
- * dstcharsvar are specified and not empty, they are treated as names
- * of variables where the *srcRead, *dstWrote and *dstChars output
- * from the functions are stored.
+ * If any of the srcreadvar, dstwrotevar and dstcharsvar are specified and
+ * not empty, they are treated as names of variables where the *srcRead,
+ * *dstWrote and *dstChars output from the functions are stored.
*
* The function also checks internally whether nuls are correctly
* appended as requested but the TCL_ENCODING_NO_TERMINATE flag
@@ -2121,9 +2120,9 @@ static int UtfExtWrapper(
{"end", TCL_ENCODING_END},
{"noterminate", TCL_ENCODING_NO_TERMINATE},
{"charlimit", TCL_ENCODING_CHAR_LIMIT},
- {"profiletcl8", TCL_ENCODING_PROFILE_TCL8},
- {"profilestrict", TCL_ENCODING_PROFILE_STRICT},
- {"profilereplace", TCL_ENCODING_PROFILE_REPLACE},
+ {"tcl8", TCL_ENCODING_PROFILE_TCL8},
+ {"strict", TCL_ENCODING_PROFILE_STRICT},
+ {"replace", TCL_ENCODING_PROFILE_REPLACE},
{NULL, 0}
};
Tcl_Size i;
@@ -2220,9 +2219,10 @@ static int UtfExtWrapper(
&dstWrote,
dstCharsVar ? &dstChars : NULL);
if (memcmp(bufPtr + bufLen - 4, "\xAB\xCD\xEF\xAB", 4)) {
- Tcl_SetResult(interp,
- "Tcl_ExternalToUtf wrote past output buffer",
- TCL_STATIC);
+ Tcl_SetObjResult(interp,
+ Tcl_ObjPrintf("%s wrote past output buffer",
+ transformer == Tcl_ExternalToUtf ?
+ "Tcl_ExternalToUtf" : "Tcl_UtfToExternal"));
result = TCL_ERROR;
} else if (result != TCL_ERROR) {
Tcl_Obj *resultObjs[3];
diff --git a/tests/icuUcmTests.tcl b/tests/icuUcmTests.tcl
index 3b70748..65082e5 100644
--- a/tests/icuUcmTests.tcl
+++ b/tests/icuUcmTests.tcl
@@ -2,8 +2,8 @@
# This file is automatically generated by ucm2tests.tcl.
# Edits will be overwritten on next generation.
#
-# Generates tests comparing Tcl encodings to ICU.
-# The generated file is NOT standalone. It should be sourced into a test script.
+# Tests comparing Tcl encodings to ICU.
+# This file is NOT standalone. It should be sourced into a test script.
proc ucmConvertfromMismatches {enc map} {
set mismatches {}
diff --git a/tests/utfext.test b/tests/utfext.test
index fd82b16..ecff331 100644
--- a/tests/utfext.test
+++ b/tests/utfext.test
@@ -1,6 +1,8 @@
# This file contains a collection of tests for Tcl_UtfToExternal and
-# Tcl_UtfToExternal. Sourcing this file into Tcl runs the tests and generates
-# errors. No output means no errors found.
+# Tcl_UtfToExternal that exercise various combinations of flags,
+# buffer lengths and fragmentation that cannot be tested by
+# normal script level commands. There tests are NOT intended to check
+# correct encodings; those are elsewhere.
#
# Copyright (c) 2023 Ashok P. Nadkarni
#
@@ -18,69 +20,211 @@ catch [list package require -exact tcl::test [info patchlevel]]
testConstraint testbytestring [llength [info commands testbytestring]]
testConstraint testencoding [llength [info commands testencoding]]
-# Maps encoded bytes string to utf-8 equivalents, both in hex
-# encoding utf-8 encdata
-lappend utfExtMap {*}{
- ascii 414243 414243
-}
+namespace eval utftest {
+ # Format of table, indexed by encoding. The encodings are not exhaustive
+ # but one of each kind of encoding transform (algorithmic, table-driven,
+ # stateful, DBCS, MBCS).
+ # Each element is list of lists. Nested lists have following fields
+ # 0 comment (no spaces, might be used to generate id's as well)
+ # The combination of comment and internal hex (2) should be unique.
+ # 1 hex representation of internal *modified* utf-8 encoding. This is the
+ # source string for Tcl_UtfToExternal and expected result for
+ # Tcl_ExternalToUtf.
+ # 2 hex representation in specified encoding. This is the source string for
+ # Tcl_ExternalToUtf and expected result for Tcl_UtfToExternal.
+ # 3 internal fragmentation index - where to split field 1 for fragmentation
+ # tests. -1 to skip
+ # 4 external fragmentation index - where to split field 2 for fragmentation
+ # tests. -1 to skip
+ #
+ # cesu-8 tests disabled because of bug [304d30677a] - TODO
+ # cesu-8 {
+ # {bmp {41 c3a9 42} {41 c3a9 42} 2 2}
+ # {nonbmp {41 f09f9880 42} {41 eda0bd edb080 42} 3 3}
+ # {null {41 c080 42} {41 00 42} 2 -1}
+ # }
+
+ lappend utfExtMap {*}{
+ ascii {
+ {basic 414243 414243 -1 -1}
+ }
+ utf-8 {
+ {bmp {41 c3a9 42} {41 c3a9 42} 2 2}
+ {nonbmp-frag-1 {41 f09f9880 42} {41 f09f9880 42} 2 2}
+ {nonbmp-frag-2 {41 f09f9880 42} {41 f09f9880 42} 3 3}
+ {nonbmp-frag-3 {41 f09f9880 42} {41 f09f9880 42} 4 4}
+ {null {41 c080 42} {41 00 42} 2 -1}
+ }
+ utf-16le {
+ {bmp {41 c3a9 42} {4100 e900 4200} 2 3}
+ {nonbmp {41 f09f9880 42} {4100 3dd8 00de 4200} 4 3}
+ {split-surrogate {41 f09f9080 42} {4100 3dd8 00dc 4200} 3 4}
+ {null {41 c080 42} {4100 0000 4200} 2 3}
+ }
+ utf-16be {
+ {bmp {41 c3a9 42} {0041 00e9 0042} 2 3}
+ {nonbmp {41 f09f9880 42} {0041 d83d de00 0042} 4 3}
+ {split-surrogate {41 f09f9080 42} {0041 d83d dc00 0042} 3 4}
+ {null {41 c080 42} {0041 0000 0042} 2 3}
+ }
+ utf-32le {
+ {bmp {41 c3a9 42} {41000000 e9000000 42000000} 2 3}
+ {nonbmp {41 f09f9880 42} {41000000 00f60100 42000000} 4 6}
+ {null {41 c080 42} {41000000 00000000 42000000} 2 3}
+ }
+ utf-32be {
+ {bmp {41 c3a9 42} {00000041 000000e9 00000042} 2 3}
+ {nonbmp {41 f09f9880 42} {00000041 0001f600 00000042} 4 3}
+ {null {41 c080 42} {00000041 00000000 00000042} 2 3}
+ }
+ iso8859-1 {
+ {basic {41 c3a9 42} 41e942 2 -1}
+ {null {41 c080 42} 410042 2 -1}
+ }
+ iso8859-3 {
+ {basic {41 c4a0 42} 41d542 2 -1}
+ {null {41 c080 42} 410042 2 -1}
+ }
+ shiftjis {
+ {basic {41 e4b98e 42} {41 8cc1 42} 3 2}
+ }
+ jis0208 {
+ {basic {e4b98e e590be} {3843 3863} 1 1}
+ }
+ iso2022-jp {
+ {frag-in-leadescape {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 2}
+ {frag-in-char {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 5}
+ {frag-in-trailescape {58 e4b98e 5a} {58 1b2442 3843 1b2842 5a} 2 8}
+ }
+ }
-# Simple test with basic flags
-proc testbasic {direction enc hexin hexout {flags {start end}}} {
- if {$direction eq "toutf"} {
- set cmd Tcl_ExternalToUtf
- } else {
- set cmd Tcl_UtfToExternal
+ # Return a binary string containing nul terminator for encoding
+ proc hexnuls {enc} {
+ return [binary encode hex [encoding convertto $enc \x00]]
}
- set in [binary decode hex $hexin]
- set out [binary decode hex $hexout]
- set dstlen 40 ;# Should be enough for all encoding tests
# The C wrapper fills entire destination buffer with FF.
# Anything beyond expected output should have FF's
- set filler [string repeat \xFF $dstlen]
- set result [string range "$out$filler" 0 $dstlen-1]
- test $cmd-$enc-$hexin-[join $flags -] "$cmd - $enc - $hexin - $flags" -body \
- [list testencoding $cmd $enc $in $flags {} $dstlen] \
- -result [list ok {} $result] -constraints testencoding
- foreach profile [encoding profiles] {
- set flags2 [linsert $flags end profile$profile]
- test $cmd-$enc-$hexin-[join $flags2 -] "$cmd - $enc - $hexin - $flags" -body \
- [list testencoding $cmd $enc $in $flags2 {} $dstlen] \
- -result [list ok {} $result] -constraints testencoding
+ proc fill {bin buflen} {
+ return [string range "$bin[string repeat \xFF $buflen]" 0 $buflen-1]
}
-}
-#
-# Basic tests
-foreach {enc utfhex hex} $utfExtMap {
- # Basic test - TCL_ENCODING_START|TCL_ENCODING_END
- # Note by default output should be terminated with \0
- testbasic toutf $enc $hex ${utfhex}00 {start end}
- testbasic fromutf $enc $utfhex ${hex}00 {start end}
-
- # Test TCL_ENCODING_NO_TERMINATE
- testbasic toutf $enc $hex $utfhex {start end noterminate}
- # knownBug - noterminate not obeyed by fromutf
- # testbasic fromutf $enc $utfhex $hex {start end noterminate}
-}
+ proc testutf {direction enc comment hexin hexout args} {
+ set id $comment-[join $hexin ""]
+ if {$direction eq "toutf"} {
+ set cmd Tcl_ExternalToUtf
+ } else {
+ set cmd Tcl_UtfToExternal
+ }
+ set in [binary decode hex $hexin]
+ set out [binary decode hex $hexout]
+ set dstlen 40 ;# Should be enough for all encoding tests
+
+ set status ok
+ set flags [list start end]
+ set constraints [list testencoding]
+ set profiles [encoding profiles]
+ while {[llength $args] > 1} {
+ set opt [lpop args 0]
+ switch $opt {
+ -flags { set flags [lpop args 0] }
+ -constraints { lappend constraints {*}[lpop args 0] }
+ -profiles { set profiles [lpop args 0] }
+ -status { set status [lpop args 0]}
+ default {
+ error "Unknown option \"$opt\""
+ }
+ }
+ }
+ if {[llength $args]} {
+ error "No value supplied for option [lindex $args 0]."
+ }
+
+ set result [list $status {} [fill $out $dstlen]]
+
+ test $cmd-$enc-$id-[join $flags -] "$cmd - $enc - $hexin - $flags" -body \
+ [list testencoding $cmd $enc $in $flags {} $dstlen] \
+ -result $result -constraints $constraints
+ foreach profile $profiles {
+ set flags2 [linsert $flags end $profile]
+ test $cmd-$enc-$id-[join $flags2 -] "$cmd - $enc - $hexin - $flags2" -body \
+ [list testencoding $cmd $enc $in $flags2 {} $dstlen] \
+ -result $result -constraints $constraints
+ }
+ }
+
+ proc testfragment {direction enc comment hexin hexout fragindex} {
-# Test for insufficient space
-test xx-bufferoverflow {buffer overflow Tcl_ExternalToUtf} -body {
- testencoding Tcl_UtfToExternal utf-16 A {start end} {} 1
-} -result [list nospace {} \xFF] -constraints testencoding
+ if {$fragindex < 0} {
+ # Single byte encodings so no question of fragmentation
+ return
+ }
+ set id $comment-[join $hexin ""]-fragment
-# Another bug - char limit not obeyed
-# % set cv 2
-# % testencoding Tcl_ExternalToUtf utf-8 abcdefgh {start end noterminate charlimit} {} 20 rv wv cv
-# nospace {} abcÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ
+ if {$direction eq "toutf"} {
+ set cmd Tcl_ExternalToUtf
+ } else {
+ set cmd Tcl_UtfToExternal
+ }
-test TableToUtf-bug-5be203d6ca {Bug 5be203d6ca - truncated prefix in table encoding} -body {
- set src \x82\x4F\x82\x50\x82
- lassign [testencoding Tcl_ExternalToUtf shiftjis $src {start profiletcl8} 0 16 srcRead dstWritten charsWritten] buf
- set result [list [testencoding Tcl_ExternalToUtf shiftjis $src {start profiletcl8} 0 16 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten]
- lappend result {*}[list [testencoding Tcl_ExternalToUtf shiftjis [string range $src $srcRead end] {end profiletcl8} 0 10 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten]
-} -result [list [list multibyte 0 \xEF\xBC\x90\xEF\xBC\x91\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 4 6 2 [list ok 0 \xC2\x82\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 1 2 1] -constraints testencoding
+ set in [binary decode hex $hexin]
+ set infrag [string range $in 0 $fragindex-1]
+ set out [binary decode hex $hexout]
+ set dstlen 40 ;# Should be enough for all encoding tests
+
+ set expected_result {}
+ append expected_result multibyte $fragindex
+
+ test $cmd-$enc-$id "$cmd - $enc - $hexin - frag" -constraints testencoding -body {
+ set frag1Result [testencoding $cmd $enc [string range $in 0 $fragindex-1] {start} 0 $dstlen frag1Read frag1Written]
+ lassign $frag1Result frag1Status frag1State frag1Decoded
+ set frag2Result [testencoding $cmd $enc [string range $in $frag1Read end] {end} $frag1State $dstlen frag2Read frag2Written]
+ lassign $frag2Result frag2Status frag2State frag2Decoded
+ set decoded [string cat [string range $frag1Decoded 0 $frag1Written-1] [string range $frag2Decoded 0 $frag2Written-1]]
+ list $frag1Status [expr {$frag1Read < $fragindex}] \
+ $frag2Status [expr {$frag1Read+$frag2Read}] \
+ [expr {$frag1Written+$frag2Written}] $decoded
+ } -result [list multibyte 1 ok [string length $in] [string length $out] $out]
+ }
+
+ #
+ # Basic tests
+ foreach {enc testcases} $utfExtMap {
+ foreach testcase $testcases {
+ lassign $testcase {*}{comment utfhex hex internalfragindex externalfragindex}
+
+ # Basic test - TCL_ENCODING_START|TCL_ENCODING_END
+ # Note by default output should be terminated with \0
+ set encnuls [hexnuls $enc]
+ testutf toutf $enc $comment $hex ${utfhex}00
+ testutf fromutf $enc $comment $utfhex $hex$encnuls
+
+ # Test TCL_ENCODING_NO_TERMINATE
+ testutf toutf $enc $comment $hex $utfhex -flags {start end noterminate}
+ # noterminate is specific to ExternalToUtf,
+ # should have no effect in other direction
+ testutf fromutf $enc $comment $utfhex $hex$encnuls -flags {start end noterminate}
+
+ testfragment toutf $enc $comment $hex $utfhex $externalfragindex
+ testfragment fromutf $enc $comment $utfhex $hex $internalfragindex
+ }
+ }
+
+ # Bug regression tests
+ test Tcl_UtfToExternal-bug-183a1adcc0 {buffer overflow} -body {
+ testencoding Tcl_UtfToExternal utf-16 A {start end} {} 1
+ } -result [list nospace {} \xFF] -constraints testencoding
+
+ test Tcl_ExternalToUtf-bug-5be203d6ca {
+ truncated prefix in table encoding
+ } -body {
+ set src \x82\x4F\x82\x50\x82
+ set result [list [testencoding Tcl_ExternalToUtf shiftjis $src {start tcl8} 0 16 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten]
+ lappend result {*}[list [testencoding Tcl_ExternalToUtf shiftjis [string range $src $srcRead end] {end tcl8} 0 10 srcRead dstWritten charsWritten] $srcRead $dstWritten $charsWritten]
+ } -result [list [list multibyte 0 \xEF\xBC\x90\xEF\xBC\x91\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 4 6 2 [list ok 0 \xC2\x82\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF] 1 2 1] -constraints testencoding
+}
+namespace delete utftest
::tcltest::cleanupTests
return