# uniParse.tcl -- # # This program parses the UnicodeData file and generates the # corresponding tclUniData.c file with compressed character # data tables. The input to this program should be the latest # UnicodeData file from: # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt # # Copyright (c) 1998-1999 by Scriptics Corporation. # All rights reserved. namespace eval uni { set shift 5; # number of bits of data within a page # This value can be adjusted to find the # best split to minimize table size variable pMap; # map from page to page index, each entry is # an index into the pages table, indexed by # page number variable pages; # map from page index to page info, each # entry is a list of indices into the groups # table, the list is indexed by the offset variable groups; # list of character info values, indexed by # group number, initialized with the # unassigned character group variable categories { Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So }; # Ordered list of character categories, must # match the enumeration in the header file. } proc uni::getValue {items index} { variable categories # Extract character info set category [lindex $items 2] if {[scan [lindex $items 12] %x toupper] == 1} { set toupper [expr {$index - $toupper}] } else { set toupper 0 } if {[scan [lindex $items 13] %x tolower] == 1} { set tolower [expr {$tolower - $index}] } else { set tolower 0 } if {[scan [lindex $items 14] %x totitle] == 1} { set totitle [expr {$index - $totitle}] } elseif {$tolower} { set totitle 0 } else { set totitle $toupper } set categoryIndex [lsearch -exact $categories $category] if {$categoryIndex < 0} { error "Unexpected character category: $index($category)" } return [list $categoryIndex $toupper $tolower $totitle] } proc uni::getGroup {value} { variable groups set gIndex [lsearch -exact $groups $value] if {$gIndex == -1} { set gIndex [llength $groups] lappend groups $value } return $gIndex } proc uni::addPage {info} { variable pMap variable pages variable shift set pIndex [lsearch -exact $pages $info] if {$pIndex == -1} { set pIndex [llength $pages] lappend pages $info } lappend pMap [expr {$pIndex << $shift}] return } proc uni::buildTables {data} { variable shift variable pMap {} variable pages {} variable groups {{0 0 0 0}} variable next 0 set info {} ;# temporary page info set mask [expr {(1 << $shift) - 1}] foreach line [split $data \n] { if {$line eq ""} { if {!($next & $mask)} { # next character is already on page boundary continue } # fill remaining page set line [format %X [expr {($next-1)|$mask}]] append line ";;Cn;0;ON;;;;;N;;;;;\n" } set items [split $line \;] scan [lindex $items 0] %x index if {$index > 0x2ffff} then { # Ignore non-BMP characters, as long as Tcl doesn't support them continue } set index [format %d $index] set gIndex [getGroup [getValue $items $index]] # Since the input table omits unassigned characters, these will # show up as gaps in the index sequence. There are a few special cases # where the gaps correspond to a uniform block of assigned characters. # These are indicated as such in the character name. # Enter all unassigned characters up to the current character. if {($index > $next) \ && ![regexp "Last>$" [lindex $items 1]]} { for {} {$next < $index} {incr next} { lappend info 0 if {($next & $mask) == $mask} { addPage $info set info {} } } } # Enter all assigned characters up to the current character for {set i $next} {$i <= $index} {incr i} { # Add the group index to the info for the current page lappend info $gIndex # If this is the last entry in the page, add the page if {($i & $mask) == $mask} { addPage $info set info {} } } set next [expr {$index + 1}] } return } proc uni::main {} { global argc argv0 argv variable pMap variable pages variable groups variable shift variable next if {$argc != 2} { puts stderr "\nusage: $argv0 <datafile> <outdir>\n" exit 1 } set f [open [lindex $argv 0] r] set data [read $f] close $f buildTables $data puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}] puts "shift = $shift, space = $size" set f [open [file join [lindex $argv 1] tclUniData.c] w] fconfigure $f -translation lf puts $f "/* * tclUniData.c -- * * Declarations of Unicode character information tables. This file is * automatically generated by the tools/uniParse.tcl script. Do not * modify this file by hand. * * Copyright (c) 1998 by Scriptics Corporation. * All rights reserved. */ /* * A 16-bit Unicode character is split into two parts in order to index * into the following tables. The lower OFFSET_BITS comprise an offset * into a page of characters. The upper bits comprise the page number. */ #define OFFSET_BITS $shift /* * The pageMap is indexed by page number and returns an alternate page number * that identifies a unique page of characters. Many Unicode characters map * to the same alternate page number. */ static const unsigned short pageMap\[\] = {" set line " " set last [expr {[llength $pMap] - 1}] for {set i 0} {$i <= $last} {incr i} { if {$i == [expr {0x10000 >> $shift}]} { set line [string trimright $line " \t,"] puts $f $line set lastpage [expr {[lindex $line end] >> $shift}] puts stdout "lastpage: $lastpage" puts $f "#if TCL_UTF_MAX > 3" set line " ," } append line [lindex $pMap $i] if {$i != $last} { append line ", " } if {[string length $line] > 70} { puts $f [string trimright $line] set line " " } } puts $f $line puts $f "#endif /* TCL_UTF_MAX > 3 */" puts $f "}; /* * The groupMap is indexed by combining the alternate page number with * the page offset and returns a group number that identifies a unique * set of character attributes. */ static const unsigned char groupMap\[\] = {" set line " " set lasti [expr {[llength $pages] - 1}] for {set i 0} {$i <= $lasti} {incr i} { set page [lindex $pages $i] set lastj [expr {[llength $page] - 1}] if {$i == ($lastpage + 1)} { puts $f [string trimright $line " \t,"] puts $f "#if TCL_UTF_MAX > 3" set line " ," } for {set j 0} {$j <= $lastj} {incr j} { append line [lindex $page $j] if {$j != $lastj || $i != $lasti} { append line ", " } if {[string length $line] > 70} { puts $f [string trimright $line] set line " " } } } puts $f $line puts $f "#endif /* TCL_UTF_MAX > 3 */" puts $f "}; /* * Each group represents a unique set of character attributes. The attributes * are encoded into a 32-bit value as follows: * * Bits 0-4 Character category: see the constants listed below. * * Bits 5-7 Case delta type: 000 = identity * 010 = add delta for lower * 011 = add delta for lower, add 1 for title * 100 = subtract delta for title/upper * 101 = sub delta for upper, sub 1 for title * 110 = sub delta for upper, add delta for lower * * Bits 8-31 Case delta: delta for case conversions. This should be the * highest field so we can easily sign extend. */ static const int groups\[\] = {" set line " " set last [expr {[llength $groups] - 1}] for {set i 0} {$i <= $last} {incr i} { foreach {type toupper tolower totitle} [lindex $groups $i] {} # Compute the case conversion type and delta if {$totitle} { if {$totitle == $toupper} { # subtract delta for title or upper set case 4 set delta $toupper if {$tolower} { error "New case conversion type needed: $toupper $tolower $totitle" } } elseif {$toupper} { # subtract delta for upper, subtract 1 for title set case 5 set delta $toupper if {($totitle != 1) || $tolower} { error "New case conversion type needed: $toupper $tolower $totitle" } } else { # add delta for lower, add 1 for title set case 3 set delta $tolower if {$totitle != -1} { error "New case conversion type needed: $toupper $tolower $totitle" } } } elseif {$toupper} { # subtract delta for upper, add delta for lower set case 6 set delta $toupper if {$tolower != $toupper} { error "New case conversion type needed: $toupper $tolower $totitle" } } elseif {$tolower} { # add delta for lower set case 2 set delta $tolower } else { # noop set case 0 set delta 0 } append line [expr {($delta << 8) | ($case << 5) | $type}] if {$i != $last} { append line ", " } if {[string length $line] > 65} { puts $f [string trimright $line] set line " " } } puts $f $line puts -nonewline $f "}; #if TCL_UTF_MAX > 3 # define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next]) #else # define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0) #endif /* * The following constants are used to determine the category of a * Unicode character. */ enum { UNASSIGNED, UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, OTHER_LETTER, NON_SPACING_MARK, ENCLOSING_MARK, COMBINING_SPACING_MARK, DECIMAL_DIGIT_NUMBER, LETTER_NUMBER, OTHER_NUMBER, SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, PRIVATE_USE, SURROGATE, CONNECTOR_PUNCTUATION, DASH_PUNCTUATION, OPEN_PUNCTUATION, CLOSE_PUNCTUATION, INITIAL_QUOTE_PUNCTUATION, FINAL_QUOTE_PUNCTUATION, OTHER_PUNCTUATION, MATH_SYMBOL, CURRENCY_SYMBOL, MODIFIER_SYMBOL, OTHER_SYMBOL }; /* * The following macros extract the fields of the character info. The * GetDelta() macro is complicated because we can't rely on the C compiler * to do sign extension on right shifts. */ #define GetCaseType(info) (((info) & 0xe0) >> 5) #define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f) #define GetDelta(info) ((info) >> 8) /* * This macro extracts the information about a character from the * Unicode character tables. */ #if TCL_UTF_MAX > 3 # define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0x1fffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) #else # define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) #endif " close $f } uni::main return