diff options
author | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-21 16:03:18 (GMT) |
---|---|---|
committer | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-21 16:03:18 (GMT) |
commit | 4d644dfb73457eb3615b30550dd31d1b48bfa7d4 (patch) | |
tree | d61b9af0fac4280337ac4178d61b5599466e5021 /tools | |
parent | fa9ac8a850701b20b6c178fdbf30b705148ffd6b (diff) | |
download | tcl-4d644dfb73457eb3615b30550dd31d1b48bfa7d4.zip tcl-4d644dfb73457eb3615b30550dd31d1b48bfa7d4.tar.gz tcl-4d644dfb73457eb3615b30550dd31d1b48bfa7d4.tar.bz2 |
Generate test data from ICU UCM data files. SBCS only for now
Diffstat (limited to 'tools')
-rw-r--r-- | tools/ucm2tests.tcl | 185 |
1 files changed, 185 insertions, 0 deletions
diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl new file mode 100644 index 0000000..22ae529 --- /dev/null +++ b/tools/ucm2tests.tcl @@ -0,0 +1,185 @@ +# ucm2tests.tcl +# +# Parses given ucm files (from ICU) to generate test data +# for encodings. The generated scripts are written to stdout. +# +# tclsh ucmtotests.tcl PATH_TO_ICU_UCM_DIRECTORY +# + +namespace eval ucm { + # No means to change these currently but ... + variable outputChan stdout + variable errorChan stderr + variable verbose 0 + + # Map Tcl encoding name to ICU UCM file name + variable encNameMap + array set encNameMap { + cp1250 glibc-CP1250-2.1.2 + cp1251 glibc-CP1251-2.1.2 + cp1252 glibc-CP1252-2.1.2 + cp1253 glibc-CP1253-2.1.2 + cp1254 glibc-CP1254-2.1.2 + cp1255 glibc-CP1255-2.1.2 + cp1256 glibc-CP1256-2.1.2 + cp1257 glibc-CP1257-2.1.2 + cp1258 glibc-CP1258-2.1.2 + iso8859-1 glibc-ISO_8859_1-2.1.2 + iso8859-2 glibc-ISO_8859_2-2.1.2 + iso8859-3 glibc-ISO_8859_3-2.1.2 + iso8859-4 glibc-ISO_8859_4-2.1.2 + iso8859-5 glibc-ISO_8859_5-2.1.2 + iso8859-6 glibc-ISO_8859_6-2.1.2 + iso8859-7 glibc-ISO_8859_7-2.3.3 + iso8859-8 glibc-ISO_8859_8-2.3.3 + iso8859-9 glibc-ISO_8859_9-2.1.2 + iso8859-10 glibc-ISO_8859_10-2.1.2 + iso8859-11 glibc-ISO_8859_11-2.1.2 + iso8859-13 glibc-ISO_8859_13-2.1.2 + iso8859-14 glibc-ISO_8859_14-2.1.2 + iso8859-15 glibc-ISO_8859_15-2.1.2 + iso8859-16 glibc-ISO_8859_16-2.3.3 + } + + # Dictionary Character map for Tcl encoding + variable charMap +} + +proc ucm::abort {msg} { + variable errorChan + puts $errorChan $msg + exit 1 +} +proc ucm::warn {msg} { + variable errorChan + puts $errorChan $msg +} +proc ucm::log {msg} { + variable verbose + if {$verbose} { + variable errorChan + puts $errorChan $msg + } +} +proc ucm::print {s} { + variable outputChan + puts $outputChan $s +} + +proc ucm::parse_SBCS {fd} { + set result {} + while {[gets $fd line] >= 0} { + if {[string match #* $line]} { + continue + } + if {[string equal "END CHARMAP" [string trim $line]]} { + break + } + if {![regexp {^\s*<U([[:xdigit:]]{4})>\s*((\\x[[:xdigit:]]{2})+)\s*(\|(0|1|2|3|4))} $line -> unichar bytes - - precision]} { + error "Unexpected line parsing SBCS: $line" + } + set bytes [string map {\\x {}} $bytes]; # \xNN -> NN + if {$precision eq "" || $precision eq "0"} { + lappend result $unichar $bytes + } else { + # It is a fallback mapping - ignore + } + } + return $result +} + +proc ucm::generate_tests {} { + variable encNameMap + variable charMap + + array set tclNames {} + foreach encName [encoding names] { + set tclNames($encName) "" + } + foreach encName [lsort [array names encNameMap]] { + if {![info exists charMap($encName)]} { + warn "No character map read for $encName" + continue + } + unset tclNames($encName) + print "\n# $encName (generated from $encNameMap($encName))" + print "lappend encValidStrings {*}{" + foreach {unich hex} $charMap($encName) { + print " $encName \\u$unich $hex {} {}" + } + print "}; # $encName" + } + if {[array size tclNames]} { + warn "Missing encoding: [lsort [array names tclNames]]" + } +} + +proc ucm::parse_file {encName ucmPath} { + variable charMap + set fd [open $ucmPath] + try { + # Parse the metadata + unset -nocomplain state + while {[gets $fd line] >= 0} { + if {[regexp {<(code_set_name|mb_cur_max|mb_cur_min|uconv_class|subchar)>\s+(\S+)} $line -> key val]} { + set state($key) $val + } elseif {[regexp {^\s*CHARMAP\s*$} $line]} { + set state(charmap) "" + break + } else { + # Skip all else + } + } + if {![info exists state(charmap)]} { + abort "Error: $path has No CHARMAP line." + } + foreach key {code_set_name uconv_class} { + if {[info exists state($key)]} { + set state($key) [string trim $state($key) {"}] + } + } + if {[info exists charMap($encName)]} { + abort "Duplicate file for $encName ($path)" + } + if {![info exists state(uconv_class)]} { + abort "Error: $path has no uconv_class definition." + } + switch -exact -- $state(uconv_class) { + SBCS { + if {[catch { + set charMap($encName) [parse_SBCS $fd] + } result]} { + abort "Could not process $path. $result" + } + } + default { + log "Skipping $path -- not SBCS encoding." + return + } + } + } finally { + close $fd + } +} + +proc ucm::expand_paths {patterns} { + set expanded {} + foreach pat $patterns { + # The file join is for \ -> / + lappend expanded {*}[glob -nocomplain [file join $pat]] + } + return $expanded +} + +proc ucm::run {} { + variable encNameMap + if {[llength $::argv] != 1} { + abort "Usage: [info nameofexecutable] $::argv0 PATHTOUCMFILES" + } + foreach {encName fname} [array get encNameMap] { + ucm::parse_file $encName [file join [lindex $::argv 0] ${fname}.ucm] + } + generate_tests +} + +ucm::run |