diff options
Diffstat (limited to 'tools/uniParse.tcl')
-rw-r--r-- | tools/uniParse.tcl | 159 |
1 files changed, 90 insertions, 69 deletions
diff --git a/tools/uniParse.tcl b/tools/uniParse.tcl index 3fe38d2..e33b3c7 100644 --- a/tools/uniParse.tcl +++ b/tools/uniParse.tcl @@ -4,12 +4,10 @@ # corresponding tclUniData.c file with compressed character # data tables. The input to this program should be the latest # UnicodeData file from: -# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt +# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt # # Copyright (c) 1998-1999 by Scriptics Corporation. # All rights reserved. -# -# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $ namespace eval uni { @@ -32,45 +30,38 @@ namespace eval uni { Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So }; # Ordered list of character categories, must # match the enumeration in the header file. - - variable titleCount 0; # Count of the number of title case - # characters. This value is used in the - # regular expression code to allocate enough - # space for the title case variants. } proc uni::getValue {items index} { variable categories - variable titleCount # Extract character info set category [lindex $items 2] - if {[scan [lindex $items 12] %4x toupper] == 1} { + if {[scan [lindex $items 12] %x toupper] == 1} { set toupper [expr {$index - $toupper}] } else { - set toupper {} + set toupper 0 } - if {[scan [lindex $items 13] %4x tolower] == 1} { + if {[scan [lindex $items 13] %x tolower] == 1} { set tolower [expr {$tolower - $index}] } else { - set tolower {} + set tolower 0 } - if {[scan [lindex $items 14] %4x totitle] == 1} { + if {[scan [lindex $items 14] %x totitle] == 1} { set totitle [expr {$index - $totitle}] + } elseif {$tolower} { + set totitle 0 } else { - set totitle {} + set totitle $toupper } set categoryIndex [lsearch -exact $categories $category] if {$categoryIndex < 0} { - puts "Unexpected character category: $index($category)" - set categoryIndex 0 - } elseif {$category == "Lt"} { - incr titleCount + error "Unexpected character category: $index($category)" } - return "$categoryIndex,$toupper,$tolower,$totitle" + return [list $categoryIndex $toupper $tolower $totitle] } proc uni::getGroup {value} { @@ -87,38 +78,48 @@ proc uni::getGroup {value} { proc uni::addPage {info} { variable pMap variable pages - + variable shift + set pIndex [lsearch -exact $pages $info] if {$pIndex == -1} { set pIndex [llength $pages] lappend pages $info } - lappend pMap $pIndex + lappend pMap [expr {$pIndex << $shift}] return } - + proc uni::buildTables {data} { variable shift variable pMap {} variable pages {} - variable groups {{0,,,}} + variable groups {{0 0 0 0}} + variable next 0 set info {} ;# temporary page info - - set mask [expr {(1 << $shift) - 1}] - set next 0 + set mask [expr {(1 << $shift) - 1}] foreach line [split $data \n] { - if {$line == ""} { - set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n" + if {$line eq ""} { + if {!($next & $mask)} { + # next character is already on page boundary + continue + } + # fill remaining page + set line [format %X [expr {($next-1)|$mask}]] + append line ";;Cn;0;ON;;;;;N;;;;;\n" } set items [split $line \;] - scan [lindex $items 0] %4x index - set index [format 0x%0.4x $index] - + scan [lindex $items 0] %x index + if {$index > 0x2ffff} then { + # Ignore non-BMP characters, as long as Tcl doesn't support them + continue + } + set index [format %d $index] + set gIndex [getGroup [getValue $items $index]] # Since the input table omits unassigned characters, these will @@ -140,15 +141,11 @@ proc uni::buildTables {data} { # Enter all assigned characters up to the current character for {set i $next} {$i <= $index} {incr i} { - # Split character index into offset and page number - set offset [expr {$i & $mask}] - set page [expr {($i >> $shift)}] - # Add the group index to the info for the current page lappend info $gIndex # If this is the last entry in the page, add the page - if {$offset == $mask} { + if {($i & $mask) == $mask} { addPage $info set info {} } @@ -164,7 +161,7 @@ proc uni::main {} { variable pages variable groups variable shift - variable titleCount + variable next if {$argc != 2} { puts stderr "\nusage: $argv0 <datafile> <outdir>\n" @@ -176,9 +173,8 @@ proc uni::main {} { buildTables $data puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" - set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] - puts "shift = 6, space = $size" - puts "title case count = $titleCount" + set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}] + puts "shift = $shift, space = $size" set f [open [file join [lindex $argv 1] tclUniData.c] w] fconfigure $f -translation lf @@ -191,8 +187,6 @@ proc uni::main {} { * * Copyright (c) 1998 by Scriptics Corporation. * All rights reserved. - * - * RCS: @(#) \$Id\$ */ /* @@ -209,20 +203,29 @@ proc uni::main {} { * to the same alternate page number. */ -static unsigned char pageMap\[\] = {" +static const unsigned short pageMap\[\] = {" set line " " set last [expr {[llength $pMap] - 1}] for {set i 0} {$i <= $last} {incr i} { + if {$i == [expr {0x10000 >> $shift}]} { + set line [string trimright $line " \t,"] + puts $f $line + set lastpage [expr {[lindex $line end] >> $shift}] + puts stdout "lastpage: $lastpage" + puts $f "#if TCL_UTF_MAX > 3" + set line " ," + } append line [lindex $pMap $i] if {$i != $last} { append line ", " } if {[string length $line] > 70} { - puts $f $line + puts $f [string trimright $line] set line " " } } puts $f $line + puts $f "#endif /* TCL_UTF_MAX > 3 */" puts $f "}; /* @@ -231,24 +234,30 @@ static unsigned char pageMap\[\] = {" * set of character attributes. */ -static unsigned char groupMap\[\] = {" +static const unsigned char groupMap\[\] = {" set line " " set lasti [expr {[llength $pages] - 1}] for {set i 0} {$i <= $lasti} {incr i} { set page [lindex $pages $i] set lastj [expr {[llength $page] - 1}] + if {$i == ($lastpage + 1)} { + puts $f [string trimright $line " \t,"] + puts $f "#if TCL_UTF_MAX > 3" + set line " ," + } for {set j 0} {$j <= $lastj} {incr j} { append line [lindex $page $j] if {$j != $lastj || $i != $lasti} { append line ", " } if {[string length $line] > 70} { - puts $f $line + puts $f [string trimright $line] set line " " } } } puts $f $line + puts $f "#endif /* TCL_UTF_MAX > 3 */" puts $f "}; /* @@ -260,43 +269,53 @@ static unsigned char groupMap\[\] = {" * Bits 5-7 Case delta type: 000 = identity * 010 = add delta for lower * 011 = add delta for lower, add 1 for title - * 100 = sutract delta for title/upper + * 100 = subtract delta for title/upper * 101 = sub delta for upper, sub 1 for title * 110 = sub delta for upper, add delta for lower * - * Bits 8-21 Reserved for future use. - * - * Bits 22-31 Case delta: delta for case conversions. This should be the + * Bits 8-31 Case delta: delta for case conversions. This should be the * highest field so we can easily sign extend. */ -static int groups\[\] = {" +static const int groups\[\] = {" set line " " set last [expr {[llength $groups] - 1}] for {set i 0} {$i <= $last} {incr i} { - foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {} - + foreach {type toupper tolower totitle} [lindex $groups $i] {} + # Compute the case conversion type and delta - if {$totitle != ""} { + if {$totitle} { if {$totitle == $toupper} { # subtract delta for title or upper set case 4 set delta $toupper - } elseif {$toupper != ""} { + if {$tolower} { + error "New case conversion type needed: $toupper $tolower $totitle" + } + } elseif {$toupper} { # subtract delta for upper, subtract 1 for title set case 5 set delta $toupper + if {($totitle != 1) || $tolower} { + error "New case conversion type needed: $toupper $tolower $totitle" + } } else { # add delta for lower, add 1 for title set case 3 set delta $tolower + if {$totitle != -1} { + error "New case conversion type needed: $toupper $tolower $totitle" + } } - } elseif {$toupper != ""} { + } elseif {$toupper} { # subtract delta for upper, add delta for lower set case 6 set delta $toupper - } elseif {$tolower != ""} { + if {$tolower != $toupper} { + error "New case conversion type needed: $toupper $tolower $totitle" + } + } elseif {$tolower} { # add delta for lower set case 2 set delta $tolower @@ -306,27 +325,29 @@ static int groups\[\] = {" set delta 0 } - set val [expr {($delta << 22) | ($case << 5) | $type}] - - append line [format "%d" $val] + append line [expr {($delta << 8) | ($case << 5) | $type}] if {$i != $last} { append line ", " } if {[string length $line] > 65} { - puts $f $line + puts $f [string trimright $line] set line " " } } puts $f $line - puts $f "}; + puts -nonewline $f "}; + +#if TCL_UTF_MAX > 3 +# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next]) +#else +# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0) +#endif /* * The following constants are used to determine the category of a * Unicode character. */ -#define UNICODE_CATEGORY_MASK 0X1F - enum { UNASSIGNED, UPPERCASE_LETTER, @@ -366,16 +387,16 @@ enum { * to do sign extension on right shifts. */ -#define GetCaseType(info) (((info) & 0xE0) >> 5) -#define GetCategory(info) ((info) & 0x1F) -#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22))) +#define GetCaseType(info) (((info) & 0xe0) >> 5) +#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f) +#define GetDelta(info) ((info) >> 8) /* * This macro extracts the information about a character from the * Unicode character tables. */ -#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) +#define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) " close $f |