diff options
Diffstat (limited to 'tcl8.6/tools/uniParse.tcl')
-rw-r--r-- | tcl8.6/tools/uniParse.tcl | 411 |
1 files changed, 411 insertions, 0 deletions
diff --git a/tcl8.6/tools/uniParse.tcl b/tcl8.6/tools/uniParse.tcl new file mode 100644 index 0000000..8125790 --- /dev/null +++ b/tcl8.6/tools/uniParse.tcl @@ -0,0 +1,411 @@ +# uniParse.tcl -- +# +# This program parses the UnicodeData file and generates the +# corresponding tclUniData.c file with compressed character +# data tables. The input to this program should be the latest +# UnicodeData file from: +# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt +# +# Copyright (c) 1998-1999 by Scriptics Corporation. +# All rights reserved. + + +namespace eval uni { + set shift 5; # number of bits of data within a page + # This value can be adjusted to find the + # best split to minimize table size + + variable pMap; # map from page to page index, each entry is + # an index into the pages table, indexed by + # page number + variable pages; # map from page index to page info, each + # entry is a list of indices into the groups + # table, the list is indexed by the offset + variable groups; # list of character info values, indexed by + # group number, initialized with the + # unassigned character group + + variable categories { + Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp + Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So + }; # Ordered list of character categories, must + # match the enumeration in the header file. +} + +proc uni::getValue {items index} { + variable categories + + # Extract character info + + set category [lindex $items 2] + if {[scan [lindex $items 12] %x toupper] == 1} { + set toupper [expr {$index - $toupper}] + } else { + set toupper 0 + } + if {[scan [lindex $items 13] %x tolower] == 1} { + set tolower [expr {$tolower - $index}] + } else { + set tolower 0 + } + if {[scan [lindex $items 14] %x totitle] == 1} { + set totitle [expr {$index - $totitle}] + } elseif {$tolower} { + set totitle 0 + } else { + set totitle $toupper + } + + set categoryIndex [lsearch -exact $categories $category] + if {$categoryIndex < 0} { + error "Unexpected character category: $index($category)" + } + + return [list $categoryIndex $toupper $tolower $totitle] +} + +proc uni::getGroup {value} { + variable groups + + set gIndex [lsearch -exact $groups $value] + if {$gIndex == -1} { + set gIndex [llength $groups] + lappend groups $value + } + return $gIndex +} + +proc uni::addPage {info} { + variable pMap + variable pages + variable shift + + set pIndex [lsearch -exact $pages $info] + if {$pIndex == -1} { + set pIndex [llength $pages] + lappend pages $info + } + lappend pMap [expr {$pIndex << $shift}] + return +} + +proc uni::buildTables {data} { + variable shift + + variable pMap {} + variable pages {} + variable groups {{0 0 0 0}} + variable next 0 + set info {} ;# temporary page info + + set mask [expr {(1 << $shift) - 1}] + + foreach line [split $data \n] { + if {$line eq ""} { + if {!($next & $mask)} { + # next character is already on page boundary + continue + } + # fill remaining page + set line [format %X [expr {($next-1)|$mask}]] + append line ";;Cn;0;ON;;;;;N;;;;;\n" + } + + set items [split $line \;] + + scan [lindex $items 0] %x index + if {$index > 0x2ffff} then { + # Ignore non-BMP characters, as long as Tcl doesn't support them + continue + } + set index [format %d $index] + + set gIndex [getGroup [getValue $items $index]] + + # Since the input table omits unassigned characters, these will + # show up as gaps in the index sequence. There are a few special cases + # where the gaps correspond to a uniform block of assigned characters. + # These are indicated as such in the character name. + + # Enter all unassigned characters up to the current character. + if {($index > $next) \ + && ![regexp "Last>$" [lindex $items 1]]} { + for {} {$next < $index} {incr next} { + lappend info 0 + if {($next & $mask) == $mask} { + addPage $info + set info {} + } + } + } + + # Enter all assigned characters up to the current character + for {set i $next} {$i <= $index} {incr i} { + # Add the group index to the info for the current page + lappend info $gIndex + + # If this is the last entry in the page, add the page + if {($i & $mask) == $mask} { + addPage $info + set info {} + } + } + set next [expr {$index + 1}] + } + return +} + +proc uni::main {} { + global argc argv0 argv + variable pMap + variable pages + variable groups + variable shift + variable next + + if {$argc != 2} { + puts stderr "\nusage: $argv0 <datafile> <outdir>\n" + exit 1 + } + set f [open [lindex $argv 0] r] + set data [read $f] + close $f + + buildTables $data + puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" + set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}] + puts "shift = $shift, space = $size" + + set f [open [file join [lindex $argv 1] tclUniData.c] w] + fconfigure $f -translation lf + puts $f "/* + * tclUniData.c -- + * + * Declarations of Unicode character information tables. This file is + * automatically generated by the tools/uniParse.tcl script. Do not + * modify this file by hand. + * + * Copyright (c) 1998 by Scriptics Corporation. + * All rights reserved. + */ + +/* + * A 16-bit Unicode character is split into two parts in order to index + * into the following tables. The lower OFFSET_BITS comprise an offset + * into a page of characters. The upper bits comprise the page number. + */ + +#define OFFSET_BITS $shift + +/* + * The pageMap is indexed by page number and returns an alternate page number + * that identifies a unique page of characters. Many Unicode characters map + * to the same alternate page number. + */ + +static const unsigned short pageMap\[\] = {" + set line " " + set last [expr {[llength $pMap] - 1}] + for {set i 0} {$i <= $last} {incr i} { + if {$i == [expr {0x10000 >> $shift}]} { + set line [string trimright $line " \t,"] + puts $f $line + set lastpage [expr {[lindex $line end] >> $shift}] + puts stdout "lastpage: $lastpage" + puts $f "#if TCL_UTF_MAX > 3" + set line " ," + } + append line [lindex $pMap $i] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 70} { + puts $f [string trimright $line] + set line " " + } + } + puts $f $line + puts $f "#endif /* TCL_UTF_MAX > 3 */" + puts $f "}; + +/* + * The groupMap is indexed by combining the alternate page number with + * the page offset and returns a group number that identifies a unique + * set of character attributes. + */ + +static const unsigned char groupMap\[\] = {" + set line " " + set lasti [expr {[llength $pages] - 1}] + for {set i 0} {$i <= $lasti} {incr i} { + set page [lindex $pages $i] + set lastj [expr {[llength $page] - 1}] + if {$i == ($lastpage + 1)} { + puts $f [string trimright $line " \t,"] + puts $f "#if TCL_UTF_MAX > 3" + set line " ," + } + for {set j 0} {$j <= $lastj} {incr j} { + append line [lindex $page $j] + if {$j != $lastj || $i != $lasti} { + append line ", " + } + if {[string length $line] > 70} { + puts $f [string trimright $line] + set line " " + } + } + } + puts $f $line + puts $f "#endif /* TCL_UTF_MAX > 3 */" + puts $f "}; + +/* + * Each group represents a unique set of character attributes. The attributes + * are encoded into a 32-bit value as follows: + * + * Bits 0-4 Character category: see the constants listed below. + * + * Bits 5-7 Case delta type: 000 = identity + * 010 = add delta for lower + * 011 = add delta for lower, add 1 for title + * 100 = subtract delta for title/upper + * 101 = sub delta for upper, sub 1 for title + * 110 = sub delta for upper, add delta for lower + * + * Bits 8-31 Case delta: delta for case conversions. This should be the + * highest field so we can easily sign extend. + */ + +static const int groups\[\] = {" + set line " " + set last [expr {[llength $groups] - 1}] + for {set i 0} {$i <= $last} {incr i} { + foreach {type toupper tolower totitle} [lindex $groups $i] {} + + # Compute the case conversion type and delta + + if {$totitle} { + if {$totitle == $toupper} { + # subtract delta for title or upper + set case 4 + set delta $toupper + if {$tolower} { + error "New case conversion type needed: $toupper $tolower $totitle" + } + } elseif {$toupper} { + # subtract delta for upper, subtract 1 for title + set case 5 + set delta $toupper + if {($totitle != 1) || $tolower} { + error "New case conversion type needed: $toupper $tolower $totitle" + } + } else { + # add delta for lower, add 1 for title + set case 3 + set delta $tolower + if {$totitle != -1} { + error "New case conversion type needed: $toupper $tolower $totitle" + } + } + } elseif {$toupper} { + # subtract delta for upper, add delta for lower + set case 6 + set delta $toupper + if {$tolower != $toupper} { + error "New case conversion type needed: $toupper $tolower $totitle" + } + } elseif {$tolower} { + # add delta for lower + set case 2 + set delta $tolower + } else { + # noop + set case 0 + set delta 0 + } + + append line [expr {($delta << 8) | ($case << 5) | $type}] + if {$i != $last} { + append line ", " + } + if {[string length $line] > 65} { + puts $f [string trimright $line] + set line " " + } + } + puts $f $line + puts -nonewline $f "}; + +#if TCL_UTF_MAX > 3 +# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next]) +#else +# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0) +#endif + +/* + * The following constants are used to determine the category of a + * Unicode character. + */ + +enum { + UNASSIGNED, + UPPERCASE_LETTER, + LOWERCASE_LETTER, + TITLECASE_LETTER, + MODIFIER_LETTER, + OTHER_LETTER, + NON_SPACING_MARK, + ENCLOSING_MARK, + COMBINING_SPACING_MARK, + DECIMAL_DIGIT_NUMBER, + LETTER_NUMBER, + OTHER_NUMBER, + SPACE_SEPARATOR, + LINE_SEPARATOR, + PARAGRAPH_SEPARATOR, + CONTROL, + FORMAT, + PRIVATE_USE, + SURROGATE, + CONNECTOR_PUNCTUATION, + DASH_PUNCTUATION, + OPEN_PUNCTUATION, + CLOSE_PUNCTUATION, + INITIAL_QUOTE_PUNCTUATION, + FINAL_QUOTE_PUNCTUATION, + OTHER_PUNCTUATION, + MATH_SYMBOL, + CURRENCY_SYMBOL, + MODIFIER_SYMBOL, + OTHER_SYMBOL +}; + +/* + * The following macros extract the fields of the character info. The + * GetDelta() macro is complicated because we can't rely on the C compiler + * to do sign extension on right shifts. + */ + +#define GetCaseType(info) (((info) & 0xe0) >> 5) +#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f) +#define GetDelta(info) ((info) >> 8) + +/* + * This macro extracts the information about a character from the + * Unicode character tables. + */ + +#if TCL_UTF_MAX > 3 +# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0x1fffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) +#else +# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) +#endif +" + + close $f +} + +uni::main + +return |