summaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
authorapnadkarni <apnmbx-wits@yahoo.com>2023-02-22 13:42:55 (GMT)
committerapnadkarni <apnmbx-wits@yahoo.com>2023-02-22 13:42:55 (GMT)
commit293504812606130380d7240fddbbdc573b9dae8c (patch)
treeb1fed63c6b5cb520900605f51d9182374fff3149 /tools
parent9b8fa27457c97577817b8f86b0b658a04867d7c7 (diff)
downloadtcl-293504812606130380d7240fddbbdc573b9dae8c.zip
tcl-293504812606130380d7240fddbbdc573b9dae8c.tar.gz
tcl-293504812606130380d7240fddbbdc573b9dae8c.tar.bz2
Add ICU tests for unmapped characters.
Diffstat (limited to 'tools')
-rw-r--r--tools/ucm2tests.tcl156
1 files changed, 122 insertions, 34 deletions
diff --git a/tools/ucm2tests.tcl b/tools/ucm2tests.tcl
index e971631..dc878ef 100644
--- a/tools/ucm2tests.tcl
+++ b/tools/ucm2tests.tcl
@@ -37,14 +37,27 @@ namespace eval ucm {
iso8859-9 glibc-ISO_8859_9-2.1.2
iso8859-10 glibc-ISO_8859_10-2.1.2
iso8859-11 glibc-ISO_8859_11-2.1.2
- iso8859-13 glibc-ISO_8859_13-2.1.2
+ iso8859-13 glibc-ISO_8859_13-2.3.3
iso8859-14 glibc-ISO_8859_14-2.1.2
iso8859-15 glibc-ISO_8859_15-2.1.2
iso8859-16 glibc-ISO_8859_16-2.3.3
}
- # Dictionary Character map for Tcl encoding
+ # Array keyed by Tcl encoding name. Each element contains mapping of
+ # Unicode code point -> byte sequence for that encoding as a flat list
+ # (or dictionary). Both are stored as hex strings
variable charMap
+
+ # Array keyed by Tcl encoding name. List of invalid code sequences
+ # each being a hex string.
+ variable invalidCodeSequences
+
+ # Array keyed by Tcl encoding name. List of unicode code points that are
+ # not mapped, each being a hex string.
+ variable unmappedCodePoints
+
+ # The fallback character per encoding
+ variable encSubchar
}
proc ucm::abort {msg} {
@@ -68,7 +81,11 @@ proc ucm::print {s} {
puts $outputChan $s
}
-proc ucm::parse_SBCS {fd} {
+proc ucm::parse_SBCS {encName fd} {
+ variable charMap
+ variable invalidCodeSequences
+ variable unmappedCodePoints
+
set result {}
while {[gets $fd line] >= 0} {
if {[string match #* $line]} {
@@ -87,26 +104,44 @@ proc ucm::parse_SBCS {fd} {
# It is a fallback mapping - ignore
}
}
- return $result
-}
+ set charMap($encName) $result
-proc ucm::generate_tests {} {
- variable encNameMap
- variable charMap
- variable outputPath
- variable outputChan
-
- if {[info exists outputPath]} {
- set outputChan [open $outputPath w]
- } else {
- set outputChan stdout
+ # Find out invalid code sequences and unicode code points that are not mapped
+ set valid {}
+ set mapped {}
+ foreach {unich bytes} $result {
+ lappend mapped $unich
+ lappend valid $bytes
+ }
+ set invalidCodeSequences($encName) {}
+ for {set i 0} {$i <= 255} {incr i} {
+ set hex [format %.2X $i]
+ if {[lsearch -exact $valid $hex] < 0} {
+ lappend invalidCodeSequences($encName) $hex
+ }
}
- array set tclNames {}
- foreach encName [encoding names] {
- set tclNames($encName) ""
+ set unmappedCodePoints($encName) {}
+ for {set i 0} {$i <= 65535} {incr i} {
+ set hex [format %.4X $i]
+ if {[lsearch -exact $mapped $hex] < 0} {
+ lappend unmappedCodePoints($encName) $hex
+ # Only look for (at most) one below 256 and one above 1024
+ if {$i < 255} {
+ # Found one so jump past 8 bits
+ set i 255
+ } else {
+ break
+ }
+ }
+ if {$i == 255} {
+ set i 1023
+ }
}
+ lappend unmappedCodePoints($encName) D800 DC00 10000 10FFFF
+}
+proc ucm::generate_boilerplate {} {
# Common procedures
print {
# This file is automatically generated by ucm2tests.tcl.
@@ -118,6 +153,7 @@ proc ucm::generate_tests {} {
proc ucmConvertfromMismatches {enc map} {
set mismatches {}
foreach {unihex hex} $map {
+ set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits
set unich [subst "\\U$unihex"]
if {[encoding convertfrom -profile strict $enc [binary decode hex $hex]] ne $unich} {
lappend mismatches "<[printable $unich],$hex>"
@@ -128,6 +164,7 @@ proc ucmConvertfromMismatches {enc map} {
proc ucmConverttoMismatches {enc map} {
set mismatches {}
foreach {unihex hex} $map {
+ set unihex [string range 00000000$unihex end-7 end]; # Make 8 digits
set unich [subst "\\U$unihex"]
if {[encoding convertto -profile strict $enc $unich] ne [binary decode hex $hex]} {
lappend mismatches "<[printable $unich],$hex>"
@@ -154,6 +191,30 @@ if {[info commands printable] eq ""} {
}
}
}
+} ; # generate_boilerplate
+
+proc ucm::generate_tests {} {
+ variable encNameMap
+ variable charMap
+ variable invalidCodeSequences
+ variable unmappedCodePoints
+ variable outputPath
+ variable outputChan
+ variable encSubchar
+
+ if {[info exists outputPath]} {
+ set outputChan [open $outputPath w]
+ fconfigure $outputChan -translation lf
+ } else {
+ set outputChan stdout
+ }
+
+ array set tclNames {}
+ foreach encName [encoding names] {
+ set tclNames($encName) ""
+ }
+
+ generate_boilerplate
foreach encName [lsort -dictionary [array names encNameMap]] {
if {![info exists charMap($encName)]} {
warn "No character map read for $encName"
@@ -161,6 +222,7 @@ if {[info commands printable] eq ""} {
}
unset tclNames($encName)
+ # Print the valid tests
print "\n#\n# $encName (generated from $encNameMap($encName))"
print "\ntest encoding-convertfrom-ucmCompare-$encName {Compare against ICU UCM} -body \{"
print " ucmConvertfromMismatches $encName {$charMap($encName)}"
@@ -172,13 +234,42 @@ if {[info commands printable] eq ""} {
# This will generate individual tests for every char
# and test in lead, tail, middle, solo configurations
# but takes considerable time
- print "lappend encValidStrings {*}{"
+ print "lappend encValidStrings \{*\}\{"
foreach {unich hex} $charMap($encName) {
print " $encName \\u$unich $hex {} {}"
}
- print "}; # $encName"
+ print "\}; # $encName"
+ }
+
+ # Generate the invalidity checks
+ print "\n# $encName - invalid byte sequences"
+ print "lappend encInvalidBytes \{*\}\{"
+ foreach hex $invalidCodeSequences($encName) {
+ # Map XXXX... to \xXX\xXX...
+ set uhex [regsub -all .. $hex {\\x\0}]
+ set uhex \\U[string range 00000000$hex end-7 end]
+ print " $encName $hex tcl8 $uhex -1 {} {}"
+ print " $encName $hex replace \\uFFFD -1 {} {}"
+ print " $encName $hex strict {} 0 {} {}"
+ }
+ print "\}; # $encName"
+
+ print "\n# $encName - invalid byte sequences"
+ print "lappend encUnencodableStrings \{*\}\{"
+ if {[info exists encSubchar($encName)]} {
+ set subchar $encSubchar($encName)
+ } else {
+ set subchar "3F"; # Tcl uses ? by default
}
+ foreach hex $unmappedCodePoints($encName) {
+ set uhex \\U[string range 00000000$hex end-7 end]
+ print " $encName $uhex tcl8 $subchar -1 {} {}"
+ print " $encName $uhex replace $subchar -1 {} {}"
+ print " $encName $uhex strict {} 0 {} {}"
+ }
+ print "\}; # $encName"
}
+
if {[array size tclNames]} {
warn "Missing encoding: [lsort [array names tclNames]]"
}
@@ -190,6 +281,8 @@ if {[info commands printable] eq ""} {
proc ucm::parse_file {encName ucmPath} {
variable charMap
+ variable encSubchar
+
set fd [open $ucmPath]
try {
# Parse the metadata
@@ -205,7 +298,7 @@ proc ucm::parse_file {encName ucmPath} {
}
}
if {![info exists state(charmap)]} {
- abort "Error: $path has No CHARMAP line."
+ abort "Error: $ucmPath has No CHARMAP line."
}
foreach key {code_set_name uconv_class} {
if {[info exists state($key)]} {
@@ -216,18 +309,22 @@ proc ucm::parse_file {encName ucmPath} {
abort "Duplicate file for $encName ($path)"
}
if {![info exists state(uconv_class)]} {
- abort "Error: $path has no uconv_class definition."
+ abort "Error: $ucmPath has no uconv_class definition."
+ }
+ if {[info exists state(subchar)]} {
+ # \xNN\xNN.. -> NNNN..
+ set encSubchar($encName) [string map {\\x {}} $state(subchar)]
}
switch -exact -- $state(uconv_class) {
SBCS {
if {[catch {
- set charMap($encName) [parse_SBCS $fd]
+ parse_SBCS $encName $fd
} result]} {
- abort "Could not process $path. $result"
+ abort "Could not process $ucmPath. $result"
}
}
default {
- log "Skipping $path -- not SBCS encoding."
+ log "Skipping $ucmPath -- not SBCS encoding."
return
}
}
@@ -236,15 +333,6 @@ proc ucm::parse_file {encName ucmPath} {
}
}
-proc ucm::expand_paths {patterns} {
- set expanded {}
- foreach pat $patterns {
- # The file join is for \ -> /
- lappend expanded {*}[glob -nocomplain [file join $pat]]
- }
- return $expanded
-}
-
proc ucm::run {} {
variable encNameMap
variable outputPath