diff options
Diffstat (limited to 'tcl8.6/tools/uniParse.tcl')
-rw-r--r-- | tcl8.6/tools/uniParse.tcl | 411 |
1 files changed, 0 insertions, 411 deletions
diff --git a/tcl8.6/tools/uniParse.tcl b/tcl8.6/tools/uniParse.tcl deleted file mode 100644 index 8125790..0000000 --- a/tcl8.6/tools/uniParse.tcl +++ /dev/null @@ -1,411 +0,0 @@ -# uniParse.tcl -- -# -# This program parses the UnicodeData file and generates the -# corresponding tclUniData.c file with compressed character -# data tables. The input to this program should be the latest -# UnicodeData file from: -# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt -# -# Copyright (c) 1998-1999 by Scriptics Corporation. -# All rights reserved. - - -namespace eval uni { - set shift 5; # number of bits of data within a page - # This value can be adjusted to find the - # best split to minimize table size - - variable pMap; # map from page to page index, each entry is - # an index into the pages table, indexed by - # page number - variable pages; # map from page index to page info, each - # entry is a list of indices into the groups - # table, the list is indexed by the offset - variable groups; # list of character info values, indexed by - # group number, initialized with the - # unassigned character group - - variable categories { - Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp - Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So - }; # Ordered list of character categories, must - # match the enumeration in the header file. -} - -proc uni::getValue {items index} { - variable categories - - # Extract character info - - set category [lindex $items 2] - if {[scan [lindex $items 12] %x toupper] == 1} { - set toupper [expr {$index - $toupper}] - } else { - set toupper 0 - } - if {[scan [lindex $items 13] %x tolower] == 1} { - set tolower [expr {$tolower - $index}] - } else { - set tolower 0 - } - if {[scan [lindex $items 14] %x totitle] == 1} { - set totitle [expr {$index - $totitle}] - } elseif {$tolower} { - set totitle 0 - } else { - set totitle $toupper - } - - set categoryIndex [lsearch -exact $categories $category] - if {$categoryIndex < 0} { - error "Unexpected character category: $index($category)" - } - - return [list $categoryIndex $toupper $tolower $totitle] -} - -proc uni::getGroup {value} { - variable groups - - set gIndex [lsearch -exact $groups $value] - if {$gIndex == -1} { - set gIndex [llength $groups] - lappend groups $value - } - return $gIndex -} - -proc uni::addPage {info} { - variable pMap - variable pages - variable shift - - set pIndex [lsearch -exact $pages $info] - if {$pIndex == -1} { - set pIndex [llength $pages] - lappend pages $info - } - lappend pMap [expr {$pIndex << $shift}] - return -} - -proc uni::buildTables {data} { - variable shift - - variable pMap {} - variable pages {} - variable groups {{0 0 0 0}} - variable next 0 - set info {} ;# temporary page info - - set mask [expr {(1 << $shift) - 1}] - - foreach line [split $data \n] { - if {$line eq ""} { - if {!($next & $mask)} { - # next character is already on page boundary - continue - } - # fill remaining page - set line [format %X [expr {($next-1)|$mask}]] - append line ";;Cn;0;ON;;;;;N;;;;;\n" - } - - set items [split $line \;] - - scan [lindex $items 0] %x index - if {$index > 0x2ffff} then { - # Ignore non-BMP characters, as long as Tcl doesn't support them - continue - } - set index [format %d $index] - - set gIndex [getGroup [getValue $items $index]] - - # Since the input table omits unassigned characters, these will - # show up as gaps in the index sequence. There are a few special cases - # where the gaps correspond to a uniform block of assigned characters. - # These are indicated as such in the character name. - - # Enter all unassigned characters up to the current character. - if {($index > $next) \ - && ![regexp "Last>$" [lindex $items 1]]} { - for {} {$next < $index} {incr next} { - lappend info 0 - if {($next & $mask) == $mask} { - addPage $info - set info {} - } - } - } - - # Enter all assigned characters up to the current character - for {set i $next} {$i <= $index} {incr i} { - # Add the group index to the info for the current page - lappend info $gIndex - - # If this is the last entry in the page, add the page - if {($i & $mask) == $mask} { - addPage $info - set info {} - } - } - set next [expr {$index + 1}] - } - return -} - -proc uni::main {} { - global argc argv0 argv - variable pMap - variable pages - variable groups - variable shift - variable next - - if {$argc != 2} { - puts stderr "\nusage: $argv0 <datafile> <outdir>\n" - exit 1 - } - set f [open [lindex $argv 0] r] - set data [read $f] - close $f - - buildTables $data - puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" - set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}] - puts "shift = $shift, space = $size" - - set f [open [file join [lindex $argv 1] tclUniData.c] w] - fconfigure $f -translation lf - puts $f "/* - * tclUniData.c -- - * - * Declarations of Unicode character information tables. This file is - * automatically generated by the tools/uniParse.tcl script. Do not - * modify this file by hand. - * - * Copyright (c) 1998 by Scriptics Corporation. - * All rights reserved. - */ - -/* - * A 16-bit Unicode character is split into two parts in order to index - * into the following tables. The lower OFFSET_BITS comprise an offset - * into a page of characters. The upper bits comprise the page number. - */ - -#define OFFSET_BITS $shift - -/* - * The pageMap is indexed by page number and returns an alternate page number - * that identifies a unique page of characters. Many Unicode characters map - * to the same alternate page number. - */ - -static const unsigned short pageMap\[\] = {" - set line " " - set last [expr {[llength $pMap] - 1}] - for {set i 0} {$i <= $last} {incr i} { - if {$i == [expr {0x10000 >> $shift}]} { - set line [string trimright $line " \t,"] - puts $f $line - set lastpage [expr {[lindex $line end] >> $shift}] - puts stdout "lastpage: $lastpage" - puts $f "#if TCL_UTF_MAX > 3" - set line " ," - } - append line [lindex $pMap $i] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 70} { - puts $f [string trimright $line] - set line " " - } - } - puts $f $line - puts $f "#endif /* TCL_UTF_MAX > 3 */" - puts $f "}; - -/* - * The groupMap is indexed by combining the alternate page number with - * the page offset and returns a group number that identifies a unique - * set of character attributes. - */ - -static const unsigned char groupMap\[\] = {" - set line " " - set lasti [expr {[llength $pages] - 1}] - for {set i 0} {$i <= $lasti} {incr i} { - set page [lindex $pages $i] - set lastj [expr {[llength $page] - 1}] - if {$i == ($lastpage + 1)} { - puts $f [string trimright $line " \t,"] - puts $f "#if TCL_UTF_MAX > 3" - set line " ," - } - for {set j 0} {$j <= $lastj} {incr j} { - append line [lindex $page $j] - if {$j != $lastj || $i != $lasti} { - append line ", " - } - if {[string length $line] > 70} { - puts $f [string trimright $line] - set line " " - } - } - } - puts $f $line - puts $f "#endif /* TCL_UTF_MAX > 3 */" - puts $f "}; - -/* - * Each group represents a unique set of character attributes. The attributes - * are encoded into a 32-bit value as follows: - * - * Bits 0-4 Character category: see the constants listed below. - * - * Bits 5-7 Case delta type: 000 = identity - * 010 = add delta for lower - * 011 = add delta for lower, add 1 for title - * 100 = subtract delta for title/upper - * 101 = sub delta for upper, sub 1 for title - * 110 = sub delta for upper, add delta for lower - * - * Bits 8-31 Case delta: delta for case conversions. This should be the - * highest field so we can easily sign extend. - */ - -static const int groups\[\] = {" - set line " " - set last [expr {[llength $groups] - 1}] - for {set i 0} {$i <= $last} {incr i} { - foreach {type toupper tolower totitle} [lindex $groups $i] {} - - # Compute the case conversion type and delta - - if {$totitle} { - if {$totitle == $toupper} { - # subtract delta for title or upper - set case 4 - set delta $toupper - if {$tolower} { - error "New case conversion type needed: $toupper $tolower $totitle" - } - } elseif {$toupper} { - # subtract delta for upper, subtract 1 for title - set case 5 - set delta $toupper - if {($totitle != 1) || $tolower} { - error "New case conversion type needed: $toupper $tolower $totitle" - } - } else { - # add delta for lower, add 1 for title - set case 3 - set delta $tolower - if {$totitle != -1} { - error "New case conversion type needed: $toupper $tolower $totitle" - } - } - } elseif {$toupper} { - # subtract delta for upper, add delta for lower - set case 6 - set delta $toupper - if {$tolower != $toupper} { - error "New case conversion type needed: $toupper $tolower $totitle" - } - } elseif {$tolower} { - # add delta for lower - set case 2 - set delta $tolower - } else { - # noop - set case 0 - set delta 0 - } - - append line [expr {($delta << 8) | ($case << 5) | $type}] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 65} { - puts $f [string trimright $line] - set line " " - } - } - puts $f $line - puts -nonewline $f "}; - -#if TCL_UTF_MAX > 3 -# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next]) -#else -# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0) -#endif - -/* - * The following constants are used to determine the category of a - * Unicode character. - */ - -enum { - UNASSIGNED, - UPPERCASE_LETTER, - LOWERCASE_LETTER, - TITLECASE_LETTER, - MODIFIER_LETTER, - OTHER_LETTER, - NON_SPACING_MARK, - ENCLOSING_MARK, - COMBINING_SPACING_MARK, - DECIMAL_DIGIT_NUMBER, - LETTER_NUMBER, - OTHER_NUMBER, - SPACE_SEPARATOR, - LINE_SEPARATOR, - PARAGRAPH_SEPARATOR, - CONTROL, - FORMAT, - PRIVATE_USE, - SURROGATE, - CONNECTOR_PUNCTUATION, - DASH_PUNCTUATION, - OPEN_PUNCTUATION, - CLOSE_PUNCTUATION, - INITIAL_QUOTE_PUNCTUATION, - FINAL_QUOTE_PUNCTUATION, - OTHER_PUNCTUATION, - MATH_SYMBOL, - CURRENCY_SYMBOL, - MODIFIER_SYMBOL, - OTHER_SYMBOL -}; - -/* - * The following macros extract the fields of the character info. The - * GetDelta() macro is complicated because we can't rely on the C compiler - * to do sign extension on right shifts. - */ - -#define GetCaseType(info) (((info) & 0xe0) >> 5) -#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f) -#define GetDelta(info) ((info) >> 8) - -/* - * This macro extracts the information about a character from the - * Unicode character tables. - */ - -#if TCL_UTF_MAX > 3 -# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0x1fffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) -#else -# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) -#endif -" - - close $f -} - -uni::main - -return |