summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorapnadkarni <apnmbx-wits@yahoo.com>2023-02-21 17:27:16 (GMT)
committerapnadkarni <apnmbx-wits@yahoo.com>2023-02-21 17:27:16 (GMT)
commit9ff3f290cab36f5ede51d0b712fa040fb6cb5f9d (patch)
tree85165a2b11973970b2b3874b5bc1ec5e5ab71768
parentad9b0963ca16870a312fde597a1f478ae73d42c0 (diff)
downloadtcl-9ff3f290cab36f5ede51d0b712fa040fb6cb5f9d.zip
tcl-9ff3f290cab36f5ede51d0b712fa040fb6cb5f9d.tar.gz
tcl-9ff3f290cab36f5ede51d0b712fa040fb6cb5f9d.tar.bz2
Rework ICU tests to check validity of whole charmap in one test, else too many tests.
-rw-r--r--tests/cmdAH.test87
-rw-r--r--tools/ucm2tests.tcl101
2 files changed, 144 insertions, 44 deletions
diff --git a/tests/cmdAH.test b/tests/cmdAH.test
index 1fbe6d2..3be2f14 100644
--- a/tests/cmdAH.test
+++ b/tests/cmdAH.test
@@ -186,9 +186,11 @@ set encDefaultProfile tcl8; # Should reflect the default from implementation
# TODO - valid sequences for different encodings - shiftjis etc.
# Note utf-16, utf-32 missing because they are automatically
# generated based on le/be versions.
-set encValidStrings {
+lappend encValidStrings {*}{
ascii \u0000 00 {} {Lowest ASCII}
ascii \u007F 7F knownBug {Highest ASCII}
+ ascii \u007D 7D {} {Brace - just to verify test scripts are escaped correctly}
+ ascii \u007B 7B {} {Terminating brace - just to verify test scripts are escaped correctly}
utf-8 \u0000 00 {} {Unicode Table 3.7 Row 1}
utf-8 \u007F 7F {} {Unicode Table 3.7 Row 1}
@@ -361,9 +363,28 @@ lappend encInvalidBytes {*}{
utf-8 C080 tcl8 \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8}
utf-8 C080 strict {} 0 {} {C080 -> invalid}
utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char}
+ utf-8 C0A2 tcl8 \u00C0\u00A2 -1 {} {websec.github.io - A}
+ utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A}
+ utf-8 C0A2 strict {} 0 {} {websec.github.io - A}
+ utf-8 C0A7 tcl8 \u00C0\u00A7 -1 {} {websec.github.io - double quote}
+ utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote}
+ utf-8 C0A7 strict {} 0 {} {websec.github.io - double quote}
+ utf-8 C0AE tcl8 \u00C0\u00AE -1 {} {websec.github.io - full stop}
+ utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop}
+ utf-8 C0AE strict {} 0 {} {websec.github.io - full stop}
+ utf-8 C0AF tcl8 \u00C0\u00AF -1 {} {websec.github.io - solidus}
+ utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus}
+ utf-8 C0AF strict {} 0 {} {websec.github.io - solidus}
+
utf-8 C1 tcl8 \u00C1 -1 {} {C1 is invalid everywhere}
utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere}
utf-8 C1 strict {} 0 {} {C1 is invalid everywhere}
+ utf-8 C181 tcl8 \u00C1\u0081 -1 {} {websec.github.io - base test (A)}
+ utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)}
+ utf-8 C181 strict {} 0 {} {websec.github.io - base test (A)}
+ utf-8 C19C tcl8 \u00C1\u0153 -1 {} {websec.github.io - reverse solidus}
+ utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}
+ utf-8 C19C strict {} 0 {} {websec.github.io - reverse solidus}
utf-8 C2 tcl8 \u00C2 -1 {} {Missing trail byte}
utf-8 C2 replace \uFFFD -1 {} {Missing trail byte}
@@ -387,6 +408,9 @@ lappend encInvalidBytes {*}{
utf-8 E080 tcl8 \u00E0\u20AC -1 {} {First trail byte must be A0:BF}
utf-8 E080 replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF}
utf-8 E080 strict {} 0 {} {First trail byte must be A0:BF}
+ utf-8 E0819C tcl8 \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus}
+ utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}
+ utf-8 E0819C strict {} 0 {} {websec.github.io - reverse solidus}
utf-8 E09F tcl8 \u00E0\u0178 -1 {} {First trail byte must be A0:BF}
utf-8 E09F replace \uFFFD\uFFFD -1 {} {First trail byte must be A0:BF}
utf-8 E09F strict {} 0 {} {First trail byte must be A0:BF}
@@ -526,6 +550,9 @@ lappend encInvalidBytes {*}{
utf-8 F0 tcl8 \u00F0 -1 {} {Missing trail byte}
utf-8 F0 replace \uFFFD -1 {} {Missing trail byte}
utf-8 F0 strict {} 0 {} {Missing trail byte}
+ utf-8 F080 tcl8 \u00F0\u20AC -1 {} {First trail byte must be 90:BF}
+ utf-8 F080 replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF}
+ utf-8 F080 strict {} 0 {} {First trail byte must be 90:BF}
utf-8 F08F tcl8 \u00F0\u8F -1 {} {First trail byte must be 90:BF}
utf-8 F08F replace \uFFFD -1 {knownW3C} {First trail byte must be 90:BF}
utf-8 F08F strict {} 0 {} {First trail byte must be 90:BF}
@@ -755,7 +782,7 @@ lappend encInvalidBytes {*}{
# be skipped. This is intentional to skip known bugs.
# TODO - other encodings
# TODO - out of range code point (note cannot be generated by \U notation)
-set encUnencodableStrings {
+lappend encUnencodableStrings {*}{
ascii \u00e0 tcl8 3f -1 {} {unencodable}
ascii \u00e0 strict {} 0 {} {unencodable}
@@ -768,12 +795,6 @@ set encUnencodableStrings {
utf-8 \uDC00 strict {} 0 {} High-surrogate
}
-if {$::tcl_platform(byteOrder) eq "littleEndian"} {
- set endian le
-} else {
- set endian be
-}
-
# Maps utf-{16,32}{le,be} to utf-16, utf-32 and
# others to "". Used to test utf-16, utf-32 based
# on system endianness
@@ -881,19 +902,19 @@ proc testprofile {id converter enc profile data result args} {
# Generates tests for compiled and uncompiled implementation.
# Also generates utf-{16,32} tests if passed encoding is utf-{16,32}{le,be}
# The enc and profile are appended to id to generate the test id
-proc testfailindex {id converter enc data result {profile default}} {
- testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc $data] \[set idx]" $result
+proc testfailindex {id converter enc data result failidx {profile default}} {
+ testconvert $id.$enc.$profile "list \[encoding $converter -profile $profile -failindex idx $enc [list $data]\] \[set idx\]" [list $result $failidx]
if {[set enc2 [endianUtf $enc]] ne ""} {
# If utf{16,32}-{le,be}, also do utf{16,32}
- testconvert $id.$enc2.$profile "list \[encoding $converter -profile $profile -failindex idx $enc2 $data] \[set idx]" $result
+ testconvert $id.$enc2.$profile "list \[encoding $converter -profile $profile -failindex idx $enc2 [list $data]\] \[set idx]" [list $result $failidx]
}
# If this is the default profile, generate a test without specifying profile
if {$profile eq $::encDefaultProfile} {
- testconvert $id.$enc.default "list \[encoding $converter -failindex idx $enc $data] \[set idx]" $result
+ testconvert $id.$enc.default "list \[encoding $converter -failindex idx $enc [list $data]\] \[set idx]" [list $result $failidx]
if {[set enc2 [endianUtf $enc]] ne ""} {
# If utf{16,32}-{le,be}, also do utf{16,32}
- testconvert $id.$enc2.default "list \[encoding $converter -failindex idx $enc2 $data] \[set idx]" $result
+ testconvert $id.$enc2.default "list \[encoding $converter -failindex idx $enc2 [list $data]\] \[set idx]" [list $result $failidx]
}
}
}
@@ -962,10 +983,10 @@ foreach {enc str hex ctrl comment} $encValidStrings {
set prefix_bytes [encoding convertto $enc A]
set suffix_bytes [encoding convertto $enc B]
foreach profile $encProfiles {
- testfailindex cmdAH-4.3.13.$hex.solo convertfrom $enc $bytes [list $str -1] $profile
- testfailindex cmdAH-4.3.13.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile
- testfailindex cmdAH-4.3.13.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile
- testfailindex cmdAH-4.3.13.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile
+ testprofile cmdAH-4.3.13.$hex.solo convertfrom $enc $profile $bytes $str
+ testprofile cmdAH-4.3.13.$hex.lead convertfrom $enc $profile $bytes$suffix_bytes $str$suffix
+ testprofile cmdAH-4.3.13.$hex.tail convertfrom $enc $profile $prefix_bytes$bytes $prefix$str
+ testprofile cmdAH-4.3.13.$hex.middle convertfrom $enc $profile $prefix_bytes$bytes$suffix_bytes $prefix$str$suffix
}
}
@@ -1026,10 +1047,10 @@ foreach {enc str hex ctrl comment} $encValidStrings {
set prefix_bytes [encoding convertto $enc $prefix]
set suffix_bytes [encoding convertto $enc $suffix]
foreach profile $encProfiles {
- testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str -1] $profile
- testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $str$suffix -1] $profile
- testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $prefix$str -1] $profile
- testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $prefix$str$suffix -1] $profile
+ testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes $str -1 $profile
+ testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes $str$suffix -1 $profile
+ testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes $prefix$str -1 $profile
+ testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes $prefix$str$suffix -1 $profile
}
}
@@ -1044,7 +1065,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
set suffix_bytes [encoding convertto $enc $suffix]
set prefixLen [string length $prefix_bytes]
if {$ctrl eq {} || "solo" in $ctrl} {
- testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes [list $str $failidx] $profile
+ testfailindex cmdAH-4.3.14.$hex.solo convertfrom $enc $bytes $str $failidx $profile
}
if {$ctrl eq {} || "lead" in $ctrl} {
if {$failidx == -1} {
@@ -1054,7 +1075,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
# Failure expected
set result ""
}
- testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes [list $result $failidx] $profile
+ testfailindex cmdAH-4.3.14.$hex.lead convertfrom $enc $bytes$suffix_bytes $result $failidx $profile
}
if {$ctrl eq {} || "tail" in $ctrl} {
set expected_failidx $failidx
@@ -1066,7 +1087,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
set result $prefix
incr expected_failidx $prefixLen
}
- testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes [list $result $expected_failidx] $profile
+ testfailindex cmdAH-4.3.14.$hex.tail convertfrom $enc $prefix_bytes$bytes $result $expected_failidx $profile
}
if {$ctrl eq {} || "middle" in $ctrl} {
set expected_failidx $failidx
@@ -1078,7 +1099,7 @@ foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
set result $prefix
incr expected_failidx $prefixLen
}
- testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes [list $result $expected_failidx] $profile
+ testfailindex cmdAH-4.3.14.$hex.middle convertfrom $enc $prefix_bytes$bytes$suffix_bytes $result $expected_failidx $profile
}
}
@@ -1193,10 +1214,10 @@ foreach {enc str hex ctrl comment} $encValidStrings {
set prefix_bytes [encoding convertto $enc A]
set suffix_bytes [encoding convertto $enc B]
foreach profile $encProfiles {
- testfailindex cmdAH-4.4.14.$enc.$printable.solo convertto $enc $str [list $bytes -1] $profile
- testfailindex cmdAH-4.4.14.$enc.$printable.lead convertto $enc $str$suffix [list $bytes$suffix_bytes -1] $profile
- testfailindex cmdAH-4.4.14.$enc.$printable.tail convertto $enc $prefix$str [list $prefix_bytes$bytes -1] $profile
- testfailindex cmdAH-4.4.14.$enc.$printable.middle convertto $enc $prefix$str$suffix [list $prefix_bytes$bytes$suffix_bytes -1] $profile
+ testfailindex cmdAH-4.4.14.$enc.$printable.solo convertto $enc $str $bytes -1 $profile
+ testfailindex cmdAH-4.4.14.$enc.$printable.lead convertto $enc $str$suffix $bytes$suffix_bytes -1 $profile
+ testfailindex cmdAH-4.4.14.$enc.$printable.tail convertto $enc $prefix$str $prefix_bytes$bytes -1 $profile
+ testfailindex cmdAH-4.4.14.$enc.$printable.middle convertto $enc $prefix$str$suffix $prefix_bytes$bytes$suffix_bytes -1 $profile
}
}
@@ -1209,7 +1230,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings {
set suffix B
set prefixLen [string length [encoding convertto $enc $prefix]]
if {$ctrl eq {} || "solo" in $ctrl} {
- testfailindex cmdAH-4.4.14.$printable.solo convertto $enc $str [list $bytes $failidx] $profile
+ testfailindex cmdAH-4.4.14.$printable.solo convertto $enc $str $bytes $failidx $profile
}
if {$ctrl eq {} || "lead" in $ctrl} {
if {$failidx == -1} {
@@ -1219,7 +1240,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings {
# Failure expected
set result ""
}
- testfailindex cmdAH-4.4.14.$printable.lead convertto $enc $str$suffix [list $result $failidx] $profile
+ testfailindex cmdAH-4.4.14.$printable.lead convertto $enc $str$suffix $result $failidx $profile
}
if {$ctrl eq {} || "tail" in $ctrl} {
set expected_failidx $failidx
@@ -1231,7 +1252,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings {
set result $prefix
incr expected_failidx $prefixLen
}
- testfailindex cmdAH-4.4.14.$printable.tail convertto $enc $prefix$str [list $result $expected_failidx] $profile
+ testfailindex cmdAH-4.4.14.$printable.tail convertto $enc $prefix$str $result $expected_failidx $profile
}
if {$ctrl eq {} || "middle" in $ctrl} {
set expected_failidx $failidx
@@ -1243,7 +1264,7 @@ foreach {enc str profile hex failidx ctrl comment} $encUnencodableStrings {
set result $prefix
incr expected_failidx $prefixLen
}
- testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix [list $result $expected_failidx] $profile
+ testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix $result $expected_failidx $profile
}
}
diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl
index 22ae529..e971631 100644
--- a/tools/ucm2tests.tcl
+++ b/tools/ucm2tests.tcl
@@ -1,14 +1,15 @@
# ucm2tests.tcl
#
# Parses given ucm files (from ICU) to generate test data
-# for encodings. The generated scripts are written to stdout.
+# for encodings.
#
-# tclsh ucmtotests.tcl PATH_TO_ICU_UCM_DIRECTORY
+# tclsh ucm2tests.tcl PATH_TO_ICU_UCM_DIRECTORY ?OUTPUTPATH?
#
namespace eval ucm {
# No means to change these currently but ...
- variable outputChan stdout
+ variable outputPath
+ variable outputChan
variable errorChan stderr
variable verbose 0
@@ -24,6 +25,7 @@ namespace eval ucm {
cp1256 glibc-CP1256-2.1.2
cp1257 glibc-CP1257-2.1.2
cp1258 glibc-CP1258-2.1.2
+ gb1988 glibc-GB_1988_80-2.3.3
iso8859-1 glibc-ISO_8859_1-2.1.2
iso8859-2 glibc-ISO_8859_2-2.1.2
iso8859-3 glibc-ISO_8859_3-2.1.2
@@ -91,27 +93,99 @@ proc ucm::parse_SBCS {fd} {
proc ucm::generate_tests {} {
variable encNameMap
variable charMap
+ variable outputPath
+ variable outputChan
+
+ if {[info exists outputPath]} {
+ set outputChan [open $outputPath w]
+ } else {
+ set outputChan stdout
+ }
array set tclNames {}
foreach encName [encoding names] {
set tclNames($encName) ""
}
- foreach encName [lsort [array names encNameMap]] {
+
+ # Common procedures
+ print {
+# This file is automatically generated by ucm2tests.tcl.
+# Edits will be overwritten on next generation.
+#
+# Generates tests comparing Tcl encodings to ICU.
+# The generated file is NOT standalone. It should be sourced into a test script.
+
+proc ucmConvertfromMismatches {enc map} {
+ set mismatches {}
+ foreach {unihex hex} $map {
+ set unich [subst "\\U$unihex"]
+ if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} {
+ lappend mismatches "<[printable $unich],$hex>"
+ }
+ }
+ return $mismatches
+}
+proc ucmConverttoMismatches {enc map} {
+ set mismatches {}
+ foreach {unihex hex} $map {
+ set unich [subst "\\U$unihex"]
+ if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} {
+ lappend mismatches "<[printable $unich],$hex>"
+ }
+ }
+ return $mismatches
+}
+if {[info commands printable] eq ""} {
+ proc printable {s} {
+ set print ""
+ foreach c [split $s ""] {
+ set i [scan $c %c]
+ if {[string is print $c] && ($i <= 127)} {
+ append print $c
+ } elseif {$i <= 0xff} {
+ append print \\x[format %02X $i]
+ } elseif {$i <= 0xffff} {
+ append print \\u[format %04X $i]
+ } else {
+ append print \\U[format %08X $i]
+ }
+ }
+ return $print
+ }
+}
+ }
+ foreach encName [lsort -dictionary [array names encNameMap]] {
if {![info exists charMap($encName)]} {
warn "No character map read for $encName"
continue
}
unset tclNames($encName)
- print "\n# $encName (generated from $encNameMap($encName))"
- print "lappend encValidStrings {*}{"
- foreach {unich hex} $charMap($encName) {
- print " $encName \\u$unich $hex {} {}"
+
+ print "\n#\n# $encName (generated from $encNameMap($encName))"
+ print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{"
+ print " ucmConvertfromMismatches $encName {$charMap($encName)}"
+ print "\} -result {}"
+ print "\ntest encoding-convertto-ucmCompare-$encName {Compare against ICU UCM} -body \{"
+ print " ucmConverttoMismatches $encName {$charMap($encName)}"
+ print "\} -result {}"
+ if {0} {
+ # This will generate individual tests for every char
+ # and test in lead, tail, middle, solo configurations
+ # but takes considerable time
+ print "lappend encValidStrings {*}{"
+ foreach {unich hex} $charMap($encName) {
+ print " $encName \\u$unich $hex {} {}"
+ }
+ print "}; # $encName"
}
- print "}; # $encName"
}
if {[array size tclNames]} {
warn "Missing encoding: [lsort [array names tclNames]]"
}
+ if {[info exists outputPath]} {
+ close $outputChan
+ unset outputChan
+ }
}
proc ucm::parse_file {encName ucmPath} {
@@ -173,8 +247,13 @@ proc ucm::expand_paths {patterns} {
proc ucm::run {} {
variable encNameMap
- if {[llength $::argv] != 1} {
- abort "Usage: [info nameofexecutable] $::argv0 PATHTOUCMFILES"
+ variable outputPath
+ switch [llength $::argv] {
+ 2 {set outputPath [lindex $::argv 1]}
+ 1 {}
+ default {
+ abort "Usage: [info nameofexecutable] $::argv0 path/to/icu/ucm/data ?outputfile?"
+ }
}
foreach {encName fname} [array get encNameMap] {
ucm::parse_file $encName [file join [lindex $::argv 0] ${fname}.ucm]