diff options
Diffstat (limited to 'tools/ucm2tests.tcl')
-rw-r--r-- | tools/ucm2tests.tcl | 352 |
1 files changed, 0 insertions, 352 deletions
diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl deleted file mode 100644 index dc878ef..0000000 --- a/tools/ucm2tests.tcl +++ /dev/null @@ -1,352 +0,0 @@ -# ucm2tests.tcl -# -# Parses given ucm files (from ICU) to generate test data -# for encodings. -# -# tclsh ucm2tests.tcl PATH_TO_ICU_UCM_DIRECTORY ?OUTPUTPATH? -# - -namespace eval ucm { - # No means to change these currently but ... - variable outputPath - variable outputChan - variable errorChan stderr - variable verbose 0 - - # Map Tcl encoding name to ICU UCM file name - variable encNameMap - array set encNameMap { - cp1250 glibc-CP1250-2.1.2 - cp1251 glibc-CP1251-2.1.2 - cp1252 glibc-CP1252-2.1.2 - cp1253 glibc-CP1253-2.1.2 - cp1254 glibc-CP1254-2.1.2 - cp1255 glibc-CP1255-2.1.2 - cp1256 glibc-CP1256-2.1.2 - cp1257 glibc-CP1257-2.1.2 - cp1258 glibc-CP1258-2.1.2 - gb1988 glibc-GB_1988_80-2.3.3 - iso8859-1 glibc-ISO_8859_1-2.1.2 - iso8859-2 glibc-ISO_8859_2-2.1.2 - iso8859-3 glibc-ISO_8859_3-2.1.2 - iso8859-4 glibc-ISO_8859_4-2.1.2 - iso8859-5 glibc-ISO_8859_5-2.1.2 - iso8859-6 glibc-ISO_8859_6-2.1.2 - iso8859-7 glibc-ISO_8859_7-2.3.3 - iso8859-8 glibc-ISO_8859_8-2.3.3 - iso8859-9 glibc-ISO_8859_9-2.1.2 - iso8859-10 glibc-ISO_8859_10-2.1.2 - iso8859-11 glibc-ISO_8859_11-2.1.2 - iso8859-13 glibc-ISO_8859_13-2.3.3 - iso8859-14 glibc-ISO_8859_14-2.1.2 - iso8859-15 glibc-ISO_8859_15-2.1.2 - iso8859-16 glibc-ISO_8859_16-2.3.3 - } - - # Array keyed by Tcl encoding name. Each element contains mapping of - # Unicode code point -> byte sequence for that encoding as a flat list - # (or dictionary). Both are stored as hex strings - variable charMap - - # Array keyed by Tcl encoding name. List of invalid code sequences - # each being a hex string. - variable invalidCodeSequences - - # Array keyed by Tcl encoding name. List of unicode code points that are - # not mapped, each being a hex string. - variable unmappedCodePoints - - # The fallback character per encoding - variable encSubchar -} - -proc ucm::abort {msg} { - variable errorChan - puts $errorChan $msg - exit 1 -} -proc ucm::warn {msg} { - variable errorChan - puts $errorChan $msg -} -proc ucm::log {msg} { - variable verbose - if {$verbose} { - variable errorChan - puts $errorChan $msg - } -} -proc ucm::print {s} { - variable outputChan - puts $outputChan $s -} - -proc ucm::parse_SBCS {encName fd} { - variable charMap - variable invalidCodeSequences - variable unmappedCodePoints - - set result {} - while {[gets $fd line] >= 0} { - if {[string match #* $line]} { - continue - } - if {[string equal "END CHARMAP" [string trim $line]]} { - break - } - if {![regexp {^\s*<U([[:xdigit:]]{4})>\s*((\\x[[:xdigit:]]{2})+)\s*(\|(0|1|2|3|4))} $line -> unichar bytes - - precision]} { - error "Unexpected line parsing SBCS: $line" - } - set bytes [string map {\\x {}} $bytes]; # \xNN -> NN - if {$precision eq "" || $precision eq "0"} { - lappend result $unichar $bytes - } else { - # It is a fallback mapping - ignore - } - } - set charMap($encName) $result - - # Find out invalid code sequences and unicode code points that are not mapped - set valid {} - set mapped {} - foreach {unich bytes} $result { - lappend mapped $unich - lappend valid $bytes - } - set invalidCodeSequences($encName) {} - for {set i 0} {$i <= 255} {incr i} { - set hex [format %.2X $i] - if {[lsearch -exact $valid $hex] < 0} { - lappend invalidCodeSequences($encName) $hex - } - } - - set unmappedCodePoints($encName) {} - for {set i 0} {$i <= 65535} {incr i} { - set hex [format %.4X $i] - if {[lsearch -exact $mapped $hex] < 0} { - lappend unmappedCodePoints($encName) $hex - # Only look for (at most) one below 256 and one above 1024 - if {$i < 255} { - # Found one so jump past 8 bits - set i 255 - } else { - break - } - } - if {$i == 255} { - set i 1023 - } - } - lappend unmappedCodePoints($encName) D800 DC00 10000 10FFFF -} - -proc ucm::generate_boilerplate {} { - # Common procedures - print { -# This file is automatically generated by ucm2tests.tcl. -# Edits will be overwritten on next generation. -# -# Generates tests comparing Tcl encodings to ICU. -# The generated file is NOT standalone. It should be sourced into a test script. - -proc ucmConvertfromMismatches {enc map} { - set mismatches {} - foreach {unihex hex} $map { - set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits - set unich [subst "\\U$unihex"] - if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} { - lappend mismatches "<[printable $unich],$hex>" - } - } - return $mismatches -} -proc ucmConverttoMismatches {enc map} { - set mismatches {} - foreach {unihex hex} $map { - set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits - set unich [subst "\\U$unihex"] - if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} { - lappend mismatches "<[printable $unich],$hex>" - } - } - return $mismatches -} -if {[info commands printable] eq ""} { - proc printable {s} { - set print "" - foreach c [split $s ""] { - set i [scan $c %c] - if {[string is print $c] && ($i <= 127)} { - append print $c - } elseif {$i <= 0xff} { - append print \\x[format %02X $i] - } elseif {$i <= 0xffff} { - append print \\u[format %04X $i] - } else { - append print \\U[format %08X $i] - } - } - return $print - } -} - } -} ; # generate_boilerplate - -proc ucm::generate_tests {} { - variable encNameMap - variable charMap - variable invalidCodeSequences - variable unmappedCodePoints - variable outputPath - variable outputChan - variable encSubchar - - if {[info exists outputPath]} { - set outputChan [open $outputPath w] - fconfigure $outputChan -translation lf - } else { - set outputChan stdout - } - - array set tclNames {} - foreach encName [encoding names] { - set tclNames($encName) "" - } - - generate_boilerplate - foreach encName [lsort -dictionary [array names encNameMap]] { - if {![info exists charMap($encName)]} { - warn "No character map read for $encName" - continue - } - unset tclNames($encName) - - # Print the valid tests - print "\n#\n# $encName (generated from $encNameMap($encName))" - print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{" - print " ucmConvertfromMismatches $encName {$charMap($encName)}" - print "\} -result {}" - print "\ntest encoding-convertto-ucmCompare-$encName {Compare against ICU UCM} -body \{" - print " ucmConverttoMismatches $encName {$charMap($encName)}" - print "\} -result {}" - if {0} { - # This will generate individual tests for every char - # and test in lead, tail, middle, solo configurations - # but takes considerable time - print "lappend encValidStrings \{*\}\{" - foreach {unich hex} $charMap($encName) { - print " $encName \\u$unich $hex {} {}" - } - print "\}; # $encName" - } - - # Generate the invalidity checks - print "\n# $encName - invalid byte sequences" - print "lappend encInvalidBytes \{*\}\{" - foreach hex $invalidCodeSequences($encName) { - # Map XXXX... to \xXX\xXX... - set uhex [regsub -all .. $hex {\\x\0}] - set uhex \\U[string range 00000000$hex end-7 end] - print " $encName $hex tcl8 $uhex -1 {} {}" - print " $encName $hex replace \\uFFFD -1 {} {}" - print " $encName $hex strict {} 0 {} {}" - } - print "\}; # $encName" - - print "\n# $encName - invalid byte sequences" - print "lappend encUnencodableStrings \{*\}\{" - if {[info exists encSubchar($encName)]} { - set subchar $encSubchar($encName) - } else { - set subchar "3F"; # Tcl uses ? by default - } - foreach hex $unmappedCodePoints($encName) { - set uhex \\U[string range 00000000$hex end-7 end] - print " $encName $uhex tcl8 $subchar -1 {} {}" - print " $encName $uhex replace $subchar -1 {} {}" - print " $encName $uhex strict {} 0 {} {}" - } - print "\}; # $encName" - } - - if {[array size tclNames]} { - warn "Missing encoding: [lsort [array names tclNames]]" - } - if {[info exists outputPath]} { - close $outputChan - unset outputChan - } -} - -proc ucm::parse_file {encName ucmPath} { - variable charMap - variable encSubchar - - set fd [open $ucmPath] - try { - # Parse the metadata - unset -nocomplain state - while {[gets $fd line] >= 0} { - if {[regexp {<(code_set_name|mb_cur_max|mb_cur_min|uconv_class|subchar)>\s+(\S+)} $line -> key val]} { - set state($key) $val - } elseif {[regexp {^\s*CHARMAP\s*$} $line]} { - set state(charmap) "" - break - } else { - # Skip all else - } - } - if {![info exists state(charmap)]} { - abort "Error: $ucmPath has No CHARMAP line." - } - foreach key {code_set_name uconv_class} { - if {[info exists state($key)]} { - set state($key) [string trim $state($key) {"}] - } - } - if {[info exists charMap($encName)]} { - abort "Duplicate file for $encName ($path)" - } - if {![info exists state(uconv_class)]} { - abort "Error: $ucmPath has no uconv_class definition." - } - if {[info exists state(subchar)]} { - # \xNN\xNN.. -> NNNN.. - set encSubchar($encName) [string map {\\x {}} $state(subchar)] - } - switch -exact -- $state(uconv_class) { - SBCS { - if {[catch { - parse_SBCS $encName $fd - } result]} { - abort "Could not process $ucmPath. $result" - } - } - default { - log "Skipping $ucmPath -- not SBCS encoding." - return - } - } - } finally { - close $fd - } -} - -proc ucm::run {} { - variable encNameMap - variable outputPath - switch [llength $::argv] { - 2 {set outputPath [lindex $::argv 1]} - 1 {} - default { - abort "Usage: [info nameofexecutable] $::argv0 path/to/icu/ucm/data ?outputfile?" - } - } - foreach {encName fname} [array get encNameMap] { - ucm::parse_file $encName [file join [lindex $::argv 0] ${fname}.ucm] - } - generate_tests -} - -ucm::run |