diff options
author | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-21 17:27:16 (GMT) |
---|---|---|
committer | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-21 17:27:16 (GMT) |
commit | 9ff3f290cab36f5ede51d0b712fa040fb6cb5f9d (patch) | |
tree | 85165a2b11973970b2b3874b5bc1ec5e5ab71768 | |
parent | ad9b0963ca16870a312fde597a1f478ae73d42c0 (diff) | |
download | tcl-9ff3f290cab36f5ede51d0b712fa040fb6cb5f9d.zip tcl-9ff3f290cab36f5ede51d0b712fa040fb6cb5f9d.tar.gz tcl-9ff3f290cab36f5ede51d0b712fa040fb6cb5f9d.tar.bz2 |
Rework ICU tests to check validity of whole charmap in one test, else too many tests.
-rw-r--r-- | tests/cmdAH.test | 87 | ||||
-rw-r--r-- | tools/ucm2tests.tcl | 101 |
2 files changed, 144 insertions, 44 deletions
diff --git a/tests/cmdAH.test b/tests/cmdAH.test index 1fbe6d2..3be2f14 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -186,9 +186,11 @@ set encDefaultProfile tcl8; # Should reflect the default from implementation # TODO - valid sequences for different encodings - shiftjis etc. # Note utf-16, utf-32 missing because they are automatically # generated based on le/be versions. -set encValidStrings { +lappend encValidStrings {*}{ ascii \u0000 00 {} {Lowest ASCII} ascii \u007F 7F knownBug {Highest ASCII} + ascii \u007D 7D {} {Brace - just to verify test scripts are escaped correctly} + ascii \u007B 7B {} {Terminating brace - just to verify test scripts are escaped correctly} utf-8 \u0000 00 {} {Unicode Table 3.7 Row 1} utf-8 \u007F 7F {} {Unicode Table 3.7 Row 1} @@ -361,9 +363,28 @@ lappend encInvalidBytes {*}{ utf-8 C080 tcl8 \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8} utf-8 C080 strict {} 0 {} {C080 -> invalid} utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char} + utf-8 C0A2 tcl8 \u00C0\u00A2 -1 {} {websec.github.io - A} + utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A} + utf-8 C0A2 strict {} 0 {} {websec.github.io - A} + utf-8 C0A7 tcl8 \u00C0\u00A7 -1 {} {websec.github.io - double quote} + utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote} + utf-8 C0A7 strict {} 0 {} {websec.github.io - double quote} + utf-8 C0AE tcl8 \u00C0\u00AE -1 {} {websec.github.io - full stop} + utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop} + utf-8 C0AE strict {} 0 {} {websec.github.io - full stop} + utf-8 C0AF tcl8 \u00C0\u00AF -1 {} {websec.github.io - solidus} + utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus} + utf-8 C0AF strict {} 0 {} {websec.github.io - solidus} + utf-8 C1 tcl8 \u00C1 -1 {} {C1 is invalid everywhere} utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere} utf-8 C1 strict {} 0 {} {C1 is invalid everywhere} + utf-8 C181 tcl8 \u00C1\u0081 -1 {} {websec.github.io - base test (A)} + utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)} + utf-8 C181 strict {} 0 {} {websec.github.io - base test (A)} + utf-8 C19C tcl8 \u00C1\u0153 -1 {} {websec.github.io - reverse solidus} + utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus} + utf-8 C19C strict {} 0 {} {websec.github.io - reverse solidus} utf-8 C2 tcl8 \u00C2 -1 {} {Missing trail byte} utf-8 C2 replace \uFFFD -1 {} {Missing trail byte} @@ -387,6 +408,9 @@ lappend encInvalidBytes {*}{ utf-8 E080 tcl8 \u00E0\u20AC -1 {} {First trail byte must be A0:BF} utf-8 E080 replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} utf-8 E080 strict {} 0 {} {First trail byte must be A0:BF} + utf-8 E0819C tcl8 \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus} + utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus} + utf-8 E0819C strict {} 0 {} {websec.github.io - reverse solidus} utf-8 E09F tcl8 \u00E0\u0178 -1 {} {First trail byte must be A0:BF} utf-8 E09F replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF} utf-8 E09F strict {} 0 {} {First trail byte must be A0:BF} @@ -526,6 +550,9 @@ lappend encInvalidBytes {*}{ utf-8 F0 tcl8 \u00F0 -1 {} {Missing trail byte} utf-8 F0 replace \uFFFD -1 {} {Missing trail byte} utf-8 F0 strict {} 0 {} {Missing trail byte} + utf-8 F080 tcl8 \u00F0\u20AC -1 {} {First trail byte must be 90:BF} + utf-8 F080 replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} + utf-8 F080 strict {} 0 {} {First trail byte must be 90:BF} utf-8 F08F tcl8 \u00F0\u8F -1 {} {First trail byte must be 90:BF} utf-8 F08F replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF} utf-8 F08F strict {} 0 {} {First trail byte must be 90:BF} @@ -755,7 +782,7 @@ lappend encInvalidBytes {*}{ # be skipped. This is intentional to skip known bugs. # TODO - other encodings # TODO - out of range code point (note cannot be generated by \U notation) -set encUnencodableStrings { +lappend encUnencodableStrings {*}{ ascii \u00e0 tcl8 3f -1 {} {unencodable} ascii \u00e0 strict {} 0 {} {unencodable} @@ -768,12 +795,6 @@ set encUnencodableStrings { utf-8 \uDC00 strict {} 0 {} High-surrogate } -if {$::tcl_platform(byteOrder) eq "littleEndian"} { - set endian le -} else { - set endian be -} - # Maps utf-{16,32}{le,be} to utf-16, utf-32 and # others to "". Used to test utf-16, utf-32 based # on system endianness @@ -881,19 +902,19 @@ proc testprofile {id converter enc profile data result args} { # Generates tests for compiled and uncompiled implementation. # Also generates utf-{16,32} tests if passed encoding is utf-{16,32}{le,be} # The enc and profile are appended to id to generate the test id -proc testfailindex {id converter enc data result {profile default}} { - testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result +proc testfailindex {id converter enc data result failidx {profile default}} { + testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc [list $data]\] \[set idx\]" [list $result $failidx] if {[set enc2 [endianUtf $enc]] ne ""} { # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc2.$profile "list \[encoding $converter -profile $profile -failindex idx $enc2 $data] \[set idx]" $result + testconvert $id.$enc2.$profile "list \[encoding $converter -profile $profile -failindex idx $enc2 [list $data]\] \[set idx]" [list $result $failidx] } # If this is the default profile, generate a test without specifying profile if {$profile eq $::encDefaultProfile} { - testconvert $id.$enc.default "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result + testconvert $id.$enc.default "list \[encoding $converter -failindex idx $enc [list $data]\] \[set idx]" [list $result $failidx] if {[set enc2 [endianUtf $enc]] ne ""} { # If utf{16,32}-{le,be}, also do utf{16,32} - testconvert $id.$enc2.default "list \[encoding $converter -failindex idx $enc2 $data] \[set idx]" $result + testconvert $id.$enc2.default "list \[encoding $converter -failindex idx $enc2 [list $data]\] \[set idx]" [list $result $failidx] } } } @@ -962,10 +983,10 @@ foreach {enc str hex ctrl comment} $encValidStrings { set prefix_bytes [encoding convertto $enc A] set suffix_bytes [encoding convertto $enc B] foreach profile $encProfiles { - testfailindex cmdAH-4.3.13.$hex.solo convertfrom $enc $bytes [list $str -1] $profile - testfailindex cmdAH-4.3.13.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile - testfailindex cmdAH-4.3.13.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile - testfailindex cmdAH-4.3.13.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile + testprofile cmdAH-4.3.13.$hex.solo convertfrom $enc $profile $bytes $str + testprofile cmdAH-4.3.13.$hex.lead convertfrom $enc $profile $bytes$suffix_bytes $str$suffix + testprofile cmdAH-4.3.13.$hex.tail convertfrom $enc $profile $prefix_bytes$bytes $prefix$str + testprofile cmdAH-4.3.13.$hex.middle convertfrom $enc $profile $prefix_bytes$bytes$suffix_bytes $prefix$str$suffix } } @@ -1026,10 +1047,10 @@ foreach {enc str hex ctrl comment} $encValidStrings { set prefix_bytes [encoding convertto $enc $prefix] set suffix_bytes [encoding convertto $enc $suffix] foreach profile $encProfiles { - testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str -1] $profile - testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile - testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile - testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile + testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes $str -1 $profile + testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes $str$suffix -1 $profile + testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes $prefix$str -1 $profile + testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes $prefix$str$suffix -1 $profile } } @@ -1044,7 +1065,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set suffix_bytes [encoding convertto $enc $suffix] set prefixLen [string length $prefix_bytes] if {$ctrl eq {} || "solo" in $ctrl} { - testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes $str $failidx $profile } if {$ctrl eq {} || "lead" in $ctrl} { if {$failidx == -1} { @@ -1054,7 +1075,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { # Failure expected set result "" } - testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $result $failidx] $profile + testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes $result $failidx $profile } if {$ctrl eq {} || "tail" in $ctrl} { set expected_failidx $failidx @@ -1066,7 +1087,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $result $expected_failidx] $profile + testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes $result $expected_failidx $profile } if {$ctrl eq {} || "middle" in $ctrl} { set expected_failidx $failidx @@ -1078,7 +1099,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $result $expected_failidx] $profile + testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes $result $expected_failidx $profile } } @@ -1193,10 +1214,10 @@ foreach {enc str hex ctrl comment} $encValidStrings { set prefix_bytes [encoding convertto $enc A] set suffix_bytes [encoding convertto $enc B] foreach profile $encProfiles { - testfailindex cmdAH-4.4.14.$enc.$printable.solo convertto $enc $str [list $bytes -1] $profile - testfailindex cmdAH-4.4.14.$enc.$printable.lead convertto $enc $str$suffix [list $bytes$suffix_bytes -1] $profile - testfailindex cmdAH-4.4.14.$enc.$printable.tail convertto $enc $prefix$str [list $prefix_bytes$bytes -1] $profile - testfailindex cmdAH-4.4.14.$enc.$printable.middle convertto $enc $prefix$str$suffix [list $prefix_bytes$bytes$suffix_bytes -1] $profile + testfailindex cmdAH-4.4.14.$enc.$printable.solo convertto $enc $str $bytes -1 $profile + testfailindex cmdAH-4.4.14.$enc.$printable.lead convertto $enc $str$suffix $bytes$suffix_bytes -1 $profile + testfailindex cmdAH-4.4.14.$enc.$printable.tail convertto $enc $prefix$str $prefix_bytes$bytes -1 $profile + testfailindex cmdAH-4.4.14.$enc.$printable.middle convertto $enc $prefix$str$suffix $prefix_bytes$bytes$suffix_bytes -1 $profile } } @@ -1209,7 +1230,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { set suffix B set prefixLen [string length [encoding convertto $enc $prefix]] if {$ctrl eq {} || "solo" in $ctrl} { - testfailindex cmdAH-4.4.14.$printable.solo convertto $enc $str [list $bytes $failidx] $profile + testfailindex cmdAH-4.4.14.$printable.solo convertto $enc $str $bytes $failidx $profile } if {$ctrl eq {} || "lead" in $ctrl} { if {$failidx == -1} { @@ -1219,7 +1240,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { # Failure expected set result "" } - testfailindex cmdAH-4.4.14.$printable.lead convertto $enc $str$suffix [list $result $failidx] $profile + testfailindex cmdAH-4.4.14.$printable.lead convertto $enc $str$suffix $result $failidx $profile } if {$ctrl eq {} || "tail" in $ctrl} { set expected_failidx $failidx @@ -1231,7 +1252,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.4.14.$printable.tail convertto $enc $prefix$str [list $result $expected_failidx] $profile + testfailindex cmdAH-4.4.14.$printable.tail convertto $enc $prefix$str $result $expected_failidx $profile } if {$ctrl eq {} || "middle" in $ctrl} { set expected_failidx $failidx @@ -1243,7 +1264,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings { set result $prefix incr expected_failidx $prefixLen } - testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix [list $result $expected_failidx] $profile + testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix $result $expected_failidx $profile } } diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl index 22ae529..e971631 100644 --- a/tools/ucm2tests.tcl +++ b/tools/ucm2tests.tcl @@ -1,14 +1,15 @@ # ucm2tests.tcl # # Parses given ucm files (from ICU) to generate test data -# for encodings. The generated scripts are written to stdout. +# for encodings. # -# tclsh ucmtotests.tcl PATH_TO_ICU_UCM_DIRECTORY +# tclsh ucm2tests.tcl PATH_TO_ICU_UCM_DIRECTORY ?OUTPUTPATH? # namespace eval ucm { # No means to change these currently but ... - variable outputChan stdout + variable outputPath + variable outputChan variable errorChan stderr variable verbose 0 @@ -24,6 +25,7 @@ namespace eval ucm { cp1256 glibc-CP1256-2.1.2 cp1257 glibc-CP1257-2.1.2 cp1258 glibc-CP1258-2.1.2 + gb1988 glibc-GB_1988_80-2.3.3 iso8859-1 glibc-ISO_8859_1-2.1.2 iso8859-2 glibc-ISO_8859_2-2.1.2 iso8859-3 glibc-ISO_8859_3-2.1.2 @@ -91,27 +93,99 @@ proc ucm::parse_SBCS {fd} { proc ucm::generate_tests {} { variable encNameMap variable charMap + variable outputPath + variable outputChan + + if {[info exists outputPath]} { + set outputChan [open $outputPath w] + } else { + set outputChan stdout + } array set tclNames {} foreach encName [encoding names] { set tclNames($encName) "" } - foreach encName [lsort [array names encNameMap]] { + + # Common procedures + print { +# This file is automatically generated by ucm2tests.tcl. +# Edits will be overwritten on next generation. +# +# Generates tests comparing Tcl encodings to ICU. +# The generated file is NOT standalone. It should be sourced into a test script. + +proc ucmConvertfromMismatches {enc map} { + set mismatches {} + foreach {unihex hex} $map { + set unich [subst "\\U$unihex"] + if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} { + lappend mismatches "<[printable $unich],$hex>" + } + } + return $mismatches +} +proc ucmConverttoMismatches {enc map} { + set mismatches {} + foreach {unihex hex} $map { + set unich [subst "\\U$unihex"] + if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} { + lappend mismatches "<[printable $unich],$hex>" + } + } + return $mismatches +} +if {[info commands printable] eq ""} { + proc printable {s} { + set print "" + foreach c [split $s ""] { + set i [scan $c %c] + if {[string is print $c] && ($i <= 127)} { + append print $c + } elseif {$i <= 0xff} { + append print \\x[format %02X $i] + } elseif {$i <= 0xffff} { + append print \\u[format %04X $i] + } else { + append print \\U[format %08X $i] + } + } + return $print + } +} + } + foreach encName [lsort -dictionary [array names encNameMap]] { if {![info exists charMap($encName)]} { warn "No character map read for $encName" continue } unset tclNames($encName) - print "\n# $encName (generated from $encNameMap($encName))" - print "lappend encValidStrings {*}{" - foreach {unich hex} $charMap($encName) { - print " $encName \\u$unich $hex {} {}" + + print "\n#\n# $encName (generated from $encNameMap($encName))" + print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{" + print " ucmConvertfromMismatches $encName {$charMap($encName)}" + print "\} -result {}" + print "\ntest encoding-convertto-ucmCompare-$encName {Compare against ICU UCM} -body \{" + print " ucmConverttoMismatches $encName {$charMap($encName)}" + print "\} -result {}" + if {0} { + # This will generate individual tests for every char + # and test in lead, tail, middle, solo configurations + # but takes considerable time + print "lappend encValidStrings {*}{" + foreach {unich hex} $charMap($encName) { + print " $encName \\u$unich $hex {} {}" + } + print "}; # $encName" } - print "}; # $encName" } if {[array size tclNames]} { warn "Missing encoding: [lsort [array names tclNames]]" } + if {[info exists outputPath]} { + close $outputChan + unset outputChan + } } proc ucm::parse_file {encName ucmPath} { @@ -173,8 +247,13 @@ proc ucm::expand_paths {patterns} { proc ucm::run {} { variable encNameMap - if {[llength $::argv] != 1} { - abort "Usage: [info nameofexecutable] $::argv0 PATHTOUCMFILES" + variable outputPath + switch [llength $::argv] { + 2 {set outputPath [lindex $::argv 1]} + 1 {} + default { + abort "Usage: [info nameofexecutable] $::argv0 path/to/icu/ucm/data ?outputfile?" + } } foreach {encName fname} [array get encNameMap] { ucm::parse_file $encName [file join [lindex $::argv 0] ${fname}.ucm] |