diff options
author | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-22 13:42:55 (GMT) |
---|---|---|
committer | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-22 13:42:55 (GMT) |
commit | 293504812606130380d7240fddbbdc573b9dae8c (patch) | |
tree | b1fed63c6b5cb520900605f51d9182374fff3149 /tools | |
parent | 9b8fa27457c97577817b8f86b0b658a04867d7c7 (diff) | |
download | tcl-293504812606130380d7240fddbbdc573b9dae8c.zip tcl-293504812606130380d7240fddbbdc573b9dae8c.tar.gz tcl-293504812606130380d7240fddbbdc573b9dae8c.tar.bz2 |
Add ICU tests for unmapped characters.
Diffstat (limited to 'tools')
-rw-r--r-- | tools/ucm2tests.tcl | 156 |
1 files changed, 122 insertions, 34 deletions
diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl index e971631..dc878ef 100644 --- a/tools/ucm2tests.tcl +++ b/tools/ucm2tests.tcl @@ -37,14 +37,27 @@ namespace eval ucm { iso8859-9 glibc-ISO_8859_9-2.1.2 iso8859-10 glibc-ISO_8859_10-2.1.2 iso8859-11 glibc-ISO_8859_11-2.1.2 - iso8859-13 glibc-ISO_8859_13-2.1.2 + iso8859-13 glibc-ISO_8859_13-2.3.3 iso8859-14 glibc-ISO_8859_14-2.1.2 iso8859-15 glibc-ISO_8859_15-2.1.2 iso8859-16 glibc-ISO_8859_16-2.3.3 } - # Dictionary Character map for Tcl encoding + # Array keyed by Tcl encoding name. Each element contains mapping of + # Unicode code point -> byte sequence for that encoding as a flat list + # (or dictionary). Both are stored as hex strings variable charMap + + # Array keyed by Tcl encoding name. List of invalid code sequences + # each being a hex string. + variable invalidCodeSequences + + # Array keyed by Tcl encoding name. List of unicode code points that are + # not mapped, each being a hex string. + variable unmappedCodePoints + + # The fallback character per encoding + variable encSubchar } proc ucm::abort {msg} { @@ -68,7 +81,11 @@ proc ucm::print {s} { puts $outputChan $s } -proc ucm::parse_SBCS {fd} { +proc ucm::parse_SBCS {encName fd} { + variable charMap + variable invalidCodeSequences + variable unmappedCodePoints + set result {} while {[gets $fd line] >= 0} { if {[string match #* $line]} { @@ -87,26 +104,44 @@ proc ucm::parse_SBCS {fd} { # It is a fallback mapping - ignore } } - return $result -} + set charMap($encName) $result -proc ucm::generate_tests {} { - variable encNameMap - variable charMap - variable outputPath - variable outputChan - - if {[info exists outputPath]} { - set outputChan [open $outputPath w] - } else { - set outputChan stdout + # Find out invalid code sequences and unicode code points that are not mapped + set valid {} + set mapped {} + foreach {unich bytes} $result { + lappend mapped $unich + lappend valid $bytes + } + set invalidCodeSequences($encName) {} + for {set i 0} {$i <= 255} {incr i} { + set hex [format %.2X $i] + if {[lsearch -exact $valid $hex] < 0} { + lappend invalidCodeSequences($encName) $hex + } } - array set tclNames {} - foreach encName [encoding names] { - set tclNames($encName) "" + set unmappedCodePoints($encName) {} + for {set i 0} {$i <= 65535} {incr i} { + set hex [format %.4X $i] + if {[lsearch -exact $mapped $hex] < 0} { + lappend unmappedCodePoints($encName) $hex + # Only look for (at most) one below 256 and one above 1024 + if {$i < 255} { + # Found one so jump past 8 bits + set i 255 + } else { + break + } + } + if {$i == 255} { + set i 1023 + } } + lappend unmappedCodePoints($encName) D800 DC00 10000 10FFFF +} +proc ucm::generate_boilerplate {} { # Common procedures print { # This file is automatically generated by ucm2tests.tcl. @@ -118,6 +153,7 @@ proc ucm::generate_tests {} { proc ucmConvertfromMismatches {enc map} { set mismatches {} foreach {unihex hex} $map { + set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits set unich [subst "\\U$unihex"] if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} { lappend mismatches "<[printable $unich],$hex>" @@ -128,6 +164,7 @@ proc ucmConvertfromMismatches {enc map} { proc ucmConverttoMismatches {enc map} { set mismatches {} foreach {unihex hex} $map { + set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits set unich [subst "\\U$unihex"] if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} { lappend mismatches "<[printable $unich],$hex>" @@ -154,6 +191,30 @@ if {[info commands printable] eq ""} { } } } +} ; # generate_boilerplate + +proc ucm::generate_tests {} { + variable encNameMap + variable charMap + variable invalidCodeSequences + variable unmappedCodePoints + variable outputPath + variable outputChan + variable encSubchar + + if {[info exists outputPath]} { + set outputChan [open $outputPath w] + fconfigure $outputChan -translation lf + } else { + set outputChan stdout + } + + array set tclNames {} + foreach encName [encoding names] { + set tclNames($encName) "" + } + + generate_boilerplate foreach encName [lsort -dictionary [array names encNameMap]] { if {![info exists charMap($encName)]} { warn "No character map read for $encName" @@ -161,6 +222,7 @@ if {[info commands printable] eq ""} { } unset tclNames($encName) + # Print the valid tests print "\n#\n# $encName (generated from $encNameMap($encName))" print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{" print " ucmConvertfromMismatches $encName {$charMap($encName)}" @@ -172,13 +234,42 @@ if {[info commands printable] eq ""} { # This will generate individual tests for every char # and test in lead, tail, middle, solo configurations # but takes considerable time - print "lappend encValidStrings {*}{" + print "lappend encValidStrings \{*\}\{" foreach {unich hex} $charMap($encName) { print " $encName \\u$unich $hex {} {}" } - print "}; # $encName" + print "\}; # $encName" + } + + # Generate the invalidity checks + print "\n# $encName - invalid byte sequences" + print "lappend encInvalidBytes \{*\}\{" + foreach hex $invalidCodeSequences($encName) { + # Map XXXX... to \xXX\xXX... + set uhex [regsub -all .. $hex {\\x\0}] + set uhex \\U[string range 00000000$hex end-7 end] + print " $encName $hex tcl8 $uhex -1 {} {}" + print " $encName $hex replace \\uFFFD -1 {} {}" + print " $encName $hex strict {} 0 {} {}" + } + print "\}; # $encName" + + print "\n# $encName - invalid byte sequences" + print "lappend encUnencodableStrings \{*\}\{" + if {[info exists encSubchar($encName)]} { + set subchar $encSubchar($encName) + } else { + set subchar "3F"; # Tcl uses ? by default } + foreach hex $unmappedCodePoints($encName) { + set uhex \\U[string range 00000000$hex end-7 end] + print " $encName $uhex tcl8 $subchar -1 {} {}" + print " $encName $uhex replace $subchar -1 {} {}" + print " $encName $uhex strict {} 0 {} {}" + } + print "\}; # $encName" } + if {[array size tclNames]} { warn "Missing encoding: [lsort [array names tclNames]]" } @@ -190,6 +281,8 @@ if {[info commands printable] eq ""} { proc ucm::parse_file {encName ucmPath} { variable charMap + variable encSubchar + set fd [open $ucmPath] try { # Parse the metadata @@ -205,7 +298,7 @@ proc ucm::parse_file {encName ucmPath} { } } if {![info exists state(charmap)]} { - abort "Error: $path has No CHARMAP line." + abort "Error: $ucmPath has No CHARMAP line." } foreach key {code_set_name uconv_class} { if {[info exists state($key)]} { @@ -216,18 +309,22 @@ proc ucm::parse_file {encName ucmPath} { abort "Duplicate file for $encName ($path)" } if {![info exists state(uconv_class)]} { - abort "Error: $path has no uconv_class definition." + abort "Error: $ucmPath has no uconv_class definition." + } + if {[info exists state(subchar)]} { + # \xNN\xNN.. -> NNNN.. + set encSubchar($encName) [string map {\\x {}} $state(subchar)] } switch -exact -- $state(uconv_class) { SBCS { if {[catch { - set charMap($encName) [parse_SBCS $fd] + parse_SBCS $encName $fd } result]} { - abort "Could not process $path. $result" + abort "Could not process $ucmPath. $result" } } default { - log "Skipping $path -- not SBCS encoding." + log "Skipping $ucmPath -- not SBCS encoding." return } } @@ -236,15 +333,6 @@ proc ucm::parse_file {encName ucmPath} { } } -proc ucm::expand_paths {patterns} { - set expanded {} - foreach pat $patterns { - # The file join is for \ -> / - lappend expanded {*}[glob -nocomplain [file join $pat]] - } - return $expanded -} - proc ucm::run {} { variable encNameMap variable outputPath |