1 files changed, 0 insertions, 352 deletions
diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl
deleted file mode 100644
index dc878ef..0000000
--- a/tools/ucm2tests.tcl
+++ /dev/null
@@ -1,352 +0,0 @@
-# ucm2tests.tcl
-#
-# Parses given ucm files (from ICU) to generate test data
-# for encodings.
-#
-#  tclsh ucm2tests.tcl PATH_TO_ICU_UCM_DIRECTORY ?OUTPUTPATH?
-#
-
-namespace eval ucm {
-    # No means to change these currently but ...
-    variable outputPath
-    variable outputChan
-    variable errorChan stderr
-    variable verbose 0
-
-    # Map Tcl encoding name to ICU UCM file name
-    variable encNameMap
-    array set encNameMap {
-        cp1250    glibc-CP1250-2.1.2
-        cp1251    glibc-CP1251-2.1.2
-        cp1252    glibc-CP1252-2.1.2
-        cp1253    glibc-CP1253-2.1.2
-        cp1254    glibc-CP1254-2.1.2
-        cp1255    glibc-CP1255-2.1.2
-        cp1256    glibc-CP1256-2.1.2
-        cp1257    glibc-CP1257-2.1.2
-        cp1258    glibc-CP1258-2.1.2
-        gb1988    glibc-GB_1988_80-2.3.3
-        iso8859-1 glibc-ISO_8859_1-2.1.2
-        iso8859-2 glibc-ISO_8859_2-2.1.2
-        iso8859-3 glibc-ISO_8859_3-2.1.2
-        iso8859-4 glibc-ISO_8859_4-2.1.2
-        iso8859-5 glibc-ISO_8859_5-2.1.2
-        iso8859-6 glibc-ISO_8859_6-2.1.2
-        iso8859-7 glibc-ISO_8859_7-2.3.3
-        iso8859-8 glibc-ISO_8859_8-2.3.3
-        iso8859-9 glibc-ISO_8859_9-2.1.2
-        iso8859-10 glibc-ISO_8859_10-2.1.2
-        iso8859-11 glibc-ISO_8859_11-2.1.2
-        iso8859-13 glibc-ISO_8859_13-2.3.3
-        iso8859-14 glibc-ISO_8859_14-2.1.2
-        iso8859-15 glibc-ISO_8859_15-2.1.2
-        iso8859-16 glibc-ISO_8859_16-2.3.3
-    }
-
-    # Array keyed by Tcl encoding name. Each element contains mapping of
-    # Unicode code point -> byte sequence for that encoding as a flat list
-    # (or dictionary). Both are stored as hex strings
-    variable charMap
-
-    # Array keyed by Tcl encoding name. List of invalid code sequences
-    # each being a hex string.
-    variable invalidCodeSequences
-
-    # Array keyed by Tcl encoding name. List of unicode code points that are
-    # not mapped, each being a hex string.
-    variable unmappedCodePoints
-
-    # The fallback character per encoding
-    variable encSubchar
-}
-
-proc ucm::abort {msg} {
-    variable errorChan
-    puts $errorChan $msg
-    exit 1
-}
-proc ucm::warn {msg} {
-    variable errorChan
-    puts $errorChan $msg
-}
-proc ucm::log {msg} {
-    variable verbose
-    if {$verbose} {
-        variable errorChan
-        puts $errorChan $msg
-    }
-}
-proc ucm::print {s} {
-    variable outputChan
-    puts $outputChan $s
-}
-
-proc ucm::parse_SBCS {encName fd} {
-    variable charMap
-    variable invalidCodeSequences
-    variable unmappedCodePoints
-
-    set result {}
-    while {[gets $fd line] >= 0} {
-        if {[string match #* $line]} {
-            continue
-        }
-        if {[string equal "END CHARMAP" [string trim $line]]} {
-            break
-        }
-        if {![regexp {^\s*<U([[:xdigit:]]{4})>\s*((\\x[[:xdigit:]]{2})+)\s*(\|(0|1|2|3|4))} $line -> unichar bytes - - precision]} {
-            error "Unexpected line parsing SBCS: $line"
-        }
-        set bytes [string map {\\x {}} $bytes]; # \xNN -> NN
-        if {$precision eq "" || $precision eq "0"} {
-            lappend result $unichar $bytes
-        } else {
-            # It is a fallback mapping - ignore
-        }
-    }
-    set charMap($encName) $result
-
-    # Find out invalid code sequences and unicode code points that are not mapped
-    set valid {}
-    set mapped {}
-    foreach {unich bytes} $result {
-        lappend mapped $unich
-        lappend valid $bytes
-    }
-    set invalidCodeSequences($encName) {}
-    for {set i 0} {$i <= 255} {incr i} {
-        set hex [format %.2X $i]
-        if {[lsearch -exact $valid $hex] < 0} {
-            lappend invalidCodeSequences($encName) $hex
-        }
-    }
-
-    set unmappedCodePoints($encName) {}
-    for {set i 0} {$i <= 65535} {incr i} {
-        set hex [format %.4X $i]
-        if {[lsearch -exact $mapped $hex] < 0} {
-            lappend unmappedCodePoints($encName) $hex
-            # Only look for (at most) one below 256 and one above 1024
-            if {$i < 255} {
-                # Found one so jump past 8 bits
-                set i 255
-            } else {
-                break
-            }
-        }
-        if {$i == 255} {
-            set i 1023
-        }
-    }
-    lappend unmappedCodePoints($encName) D800 DC00 10000 10FFFF
-}
-
-proc ucm::generate_boilerplate {} {
-    # Common procedures
-    print {
-# This file is automatically generated by ucm2tests.tcl.
-# Edits will be overwritten on next generation.
-#
-# Generates tests comparing Tcl encodings to ICU.
-# The generated file is NOT standalone. It should be sourced into a test script.
-
-proc ucmConvertfromMismatches {enc map} {
-    set mismatches {}
-    foreach {unihex hex} $map {
-        set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits
-        set unich [subst "\\U$unihex"]
-        if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} {
-            lappend mismatches "<[printable $unich],$hex>"
-        }
-    }
-    return $mismatches
-}
-proc ucmConverttoMismatches {enc map} {
-    set mismatches {}
-    foreach {unihex hex} $map {
-        set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits
-        set unich [subst "\\U$unihex"]
-        if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} {
-            lappend mismatches "<[printable $unich],$hex>"
-        }
-    }
-    return $mismatches
-}
-if {[info commands printable] eq ""} {
-    proc printable {s} {
-        set print ""
-        foreach c [split $s ""] {
-            set i [scan $c %c]
-            if {[string is print $c] && ($i <= 127)} {
-                append print $c
-            } elseif {$i <= 0xff} {
-                append print \\x[format %02X $i]
-            } elseif {$i <= 0xffff} {
-                append print \\u[format %04X $i]
-            } else {
-                append print \\U[format %08X $i]
-            }
-        }
-        return $print
-    }
-}
-    }
-} ; # generate_boilerplate
-
-proc ucm::generate_tests {} {
-    variable encNameMap
-    variable charMap
-    variable invalidCodeSequences
-    variable unmappedCodePoints
-    variable outputPath
-    variable outputChan
-    variable encSubchar
-
-    if {[info exists outputPath]} {
-        set outputChan [open $outputPath w]
-        fconfigure $outputChan -translation lf
-    } else {
-        set outputChan stdout
-    }
-
-    array set tclNames {}
-    foreach encName [encoding names] {
-        set tclNames($encName) ""
-    }
-
-    generate_boilerplate
-    foreach encName [lsort -dictionary [array names encNameMap]] {
-        if {![info exists charMap($encName)]} {
-            warn "No character map read for $encName"
-            continue
-        }
-        unset tclNames($encName)
-
-        # Print the valid tests
-        print "\n#\n# $encName (generated from $encNameMap($encName))"
-        print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{"
-        print "    ucmConvertfromMismatches $encName {$charMap($encName)}"
-        print "\} -result {}"
-        print "\ntest encoding-convertto-ucmCompare-$encName {Compare against ICU UCM} -body \{"
-        print "    ucmConverttoMismatches $encName {$charMap($encName)}"
-        print "\} -result {}"
-        if {0} {
-            # This will generate individual tests for every char
-            # and test in lead, tail, middle, solo configurations
-            # but takes considerable time
-            print "lappend encValidStrings \{*\}\{"
-            foreach {unich hex} $charMap($encName) {
-                print "    $encName \\u$unich $hex {} {}"
-            }
-            print "\}; # $encName"
-        }
-
-        # Generate the invalidity checks
-        print "\n# $encName - invalid byte sequences"
-        print "lappend encInvalidBytes \{*\}\{"
-        foreach hex $invalidCodeSequences($encName) {
-            # Map XXXX... to \xXX\xXX...
-            set uhex [regsub -all .. $hex {\\x\0}]
-            set uhex \\U[string range 00000000$hex end-7 end]
-            print "    $encName $hex tcl8    $uhex -1 {} {}"
-            print "    $encName $hex replace \\uFFFD -1 {} {}"
-            print "    $encName $hex strict  {}       0 {} {}"
-        }
-        print "\}; # $encName"
-
-        print "\n# $encName - invalid byte sequences"
-        print "lappend encUnencodableStrings \{*\}\{"
-        if {[info exists encSubchar($encName)]} {
-            set subchar $encSubchar($encName)
-        } else {
-            set subchar "3F"; # Tcl uses ? by default
-        }
-        foreach hex $unmappedCodePoints($encName) {
-            set uhex \\U[string range 00000000$hex end-7 end]
-            print "    $encName $uhex tcl8    $subchar -1 {} {}"
-            print "    $encName $uhex replace $subchar -1 {} {}"
-            print "    $encName $uhex strict  {}                      0 {} {}"
-        }
-        print "\}; # $encName"
-    }
-
-    if {[array size tclNames]} {
-        warn "Missing encoding: [lsort [array names tclNames]]"
-    }
-    if {[info exists outputPath]} {
-        close $outputChan
-        unset outputChan
-    }
-}
-
-proc ucm::parse_file {encName ucmPath} {
-    variable charMap
-    variable encSubchar
-
-    set fd [open $ucmPath]
-    try {
-        # Parse the metadata
-        unset -nocomplain state
-        while {[gets $fd line] >= 0} {
-            if {[regexp {<(code_set_name|mb_cur_max|mb_cur_min|uconv_class|subchar)>\s+(\S+)} $line -> key val]} {
-                set state($key) $val
-            } elseif {[regexp {^\s*CHARMAP\s*$} $line]} {
-                set state(charmap) ""
-                break
-            } else {
-                # Skip all else
-            }
-        }
-        if {![info exists state(charmap)]} {
-            abort "Error: $ucmPath has No CHARMAP line."
-        }
-        foreach key {code_set_name uconv_class} {
-            if {[info exists state($key)]} {
-                set state($key) [string trim $state($key) {"}]
-            }
-        }
-        if {[info exists charMap($encName)]} {
-            abort "Duplicate file for $encName ($path)"
-        }
-        if {![info exists state(uconv_class)]} {
-            abort "Error: $ucmPath has no uconv_class definition."
-        }
-        if {[info exists state(subchar)]} {
-            # \xNN\xNN.. -> NNNN..
-            set encSubchar($encName) [string map {\\x {}} $state(subchar)]
-        }
-        switch -exact -- $state(uconv_class) {
-            SBCS {
-                if {[catch {
-                    parse_SBCS $encName $fd
-                } result]} {
-                    abort "Could not process $ucmPath. $result"
-                }
-            }
-            default {
-                log "Skipping $ucmPath -- not SBCS encoding."
-                return
-            }
-        }
-    } finally {
-        close $fd
-    }
-}
-
-proc ucm::run {} {
-    variable encNameMap
-    variable outputPath
-    switch [llength $::argv] {
-        2 {set outputPath [lindex $::argv 1]}
-        1 {}
-        default {
-            abort "Usage: [info nameofexecutable] $::argv0 path/to/icu/ucm/data ?outputfile?"
-        }
-    }
-    foreach {encName fname} [array get encNameMap] {
-        ucm::parse_file $encName [file join [lindex $::argv 0] ${fname}.ucm]
-    }
-    generate_tests
-}
-
-ucm::run