diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2012-01-22 21:41:11 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2012-01-22 21:41:11 (GMT) |
commit | ec04b423d5d61edfa6bb015735698e1c81fcd154 (patch) | |
tree | 0d0942519126dd5cb530385a7118dfabf5363134 /tools | |
parent | ebf98dc88745b06ea047608ddb438cf2f7ce74d4 (diff) | |
parent | 6c36acaa44a79338e62cd865dcc7397aea9df3f0 (diff) | |
download | tcl-ec04b423d5d61edfa6bb015735698e1c81fcd154.zip tcl-ec04b423d5d61edfa6bb015735698e1c81fcd154.tar.gz tcl-ec04b423d5d61edfa6bb015735698e1c81fcd154.tar.bz2 |
[Frq 3473670]: Various Unicode-related speedups/robustness
Diffstat (limited to 'tools')
-rw-r--r-- | tools/uniClass.tcl | 38 | ||||
-rw-r--r-- | tools/uniParse.tcl | 63 |
2 files changed, 75 insertions, 26 deletions
diff --git a/tools/uniClass.tcl b/tools/uniClass.tcl index 55aa44c..f741434 100644 --- a/tools/uniClass.tcl +++ b/tools/uniClass.tcl @@ -13,22 +13,36 @@ exec tclsh "$0" ${1+"$@"} # proc emitRange {first last} { - global ranges numranges chars numchars + global ranges numranges chars numchars extchars extranges if {$first < ($last-1)} { - append ranges [format "{0x%04x, 0x%04x}, " \ + if {!$extranges && ($first) > 0xffff} { + set extranges 1 + set numranges 0 + set ranges [string trimright $ranges " \n\r\t,"] + append ranges "\n#if MAX_UTF_CHAR > 4\n ," + } + append ranges [format "{0x%x, 0x%x}, " \ $first $last] if {[incr numranges] % 4 == 0} { + set ranges [string trimright $ranges] append ranges "\n " } } else { - append chars [format "0x%04x, " $first] + if {!$extchars && ($first) > 0xffff} { + set extchars 1 + set numchars 0 + set chars [string trimright $chars " \n\r\t,"] + append chars "\n#if MAX_UTF_CHAR > 4\n ," + } + append chars [format "0x%x, " $first] incr numchars if {$numchars % 9 == 0} { + set chars [string trimright $chars] append chars "\n " } if {$first != $last} { - append chars [format "0x%04x, " $last] + append chars [format "0x%x, " $last] incr numchars if {$numchars % 9 == 0} { append chars "\n " @@ -38,7 +52,7 @@ proc emitRange {first last} { } proc genTable {type} { - global first last ranges numranges chars numchars + global first last ranges numranges chars numchars extchars extranges set first -2 set last -2 @@ -46,8 +60,14 @@ proc genTable {type} { set numranges 0 set chars " " set numchars 0 + set extchars 0 + set extranges 0 - for {set i 0} {$i <= 0xFFFF} {incr i} { + for {set i 0} {$i <= 0x10ffff} {incr i} { + if {$i == 0xd800} { + # Skip surrogates + set i 0xdc00 + } if {[string is $type [format %c $i]]} { if {$i == ($last + 1)} { set last $i @@ -63,7 +83,13 @@ proc genTable {type} { emitRange $first $last set ranges [string trimright $ranges "\t\n ,"] + if {$extranges} { + append ranges "\n#endif" + } set chars [string trimright $chars "\t\n ,"] + if {$extchars} { + append chars "\n#endif" + } if {$ranges ne ""} { puts "static CONST crange ${type}RangeTable\[\] = {\n$ranges\n};\n" puts "#define NUM_[string toupper $type]_RANGE (sizeof(${type}RangeTable)/sizeof(crange))\n" diff --git a/tools/uniParse.tcl b/tools/uniParse.tcl index af71eeb..86f6d04 100644 --- a/tools/uniParse.tcl +++ b/tools/uniParse.tcl @@ -58,8 +58,7 @@ proc uni::getValue {items index} { set categoryIndex [lsearch -exact $categories $category] if {$categoryIndex < 0} { - puts "Unexpected character category: $index($category)" - set categoryIndex 0 + error "Unexpected character category: $index($category)" } return [list $categoryIndex $toupper $tolower $totitle] @@ -79,13 +78,14 @@ proc uni::getGroup {value} { proc uni::addPage {info} { variable pMap variable pages + variable shift set pIndex [lsearch -exact $pages $info] if {$pIndex == -1} { set pIndex [llength $pages] lappend pages $info } - lappend pMap $pIndex + lappend pMap [expr {$pIndex << $shift}] return } @@ -114,7 +114,7 @@ proc uni::buildTables {data} { set items [split $line \;] scan [lindex $items 0] %x index - if {$index > 0xffff} then { + if {$index > 0x2ffff} then { # Ignore non-BMP characters, as long as Tcl doesn't support them continue } @@ -141,15 +141,11 @@ proc uni::buildTables {data} { # Enter all assigned characters up to the current character for {set i $next} {$i <= $index} {incr i} { - # Split character index into offset and page number - set offset [expr {$i & $mask}] - set page [expr {($i >> $shift)}] - # Add the group index to the info for the current page lappend info $gIndex # If this is the last entry in the page, add the page - if {$offset == $mask} { + if {($i & $mask) == $mask} { addPage $info set info {} } @@ -177,7 +173,7 @@ proc uni::main {} { buildTables $data puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" - set size [expr {[llength $pMap]*2 + [llength $pages]*(1<<$shift)}] + set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}] puts "shift = $shift, space = $size" set f [open [file join [lindex $argv 1] tclUniData.c] w] @@ -211,6 +207,14 @@ static CONST unsigned short pageMap\[\] = {" set line " " set last [expr {[llength $pMap] - 1}] for {set i 0} {$i <= $last} {incr i} { + if {$i == [expr {0x10000 >> $shift}]} { + set line [string trimright $line " \t,"] + puts $f $line + set lastpage [expr {[lindex $line end] >> $shift}] + puts stdout "lastpage: $lastpage" + puts $f "#if TCL_UTF_MAX > 3" + set line " ," + } append line [lindex $pMap $i] if {$i != $last} { append line ", " @@ -221,6 +225,7 @@ static CONST unsigned short pageMap\[\] = {" } } puts $f $line + puts $f "#endif /* TCL_UTF_MAX > 3 */" puts $f "}; /* @@ -235,6 +240,11 @@ static CONST unsigned char groupMap\[\] = {" for {set i 0} {$i <= $lasti} {incr i} { set page [lindex $pages $i] set lastj [expr {[llength $page] - 1}] + if {$i == ($lastpage + 1)} { + puts $f [string trimright $line " \t,"] + puts $f "#if TCL_UTF_MAX > 3" + set line " ," + } for {set j 0} {$j <= $lastj} {incr j} { append line [lindex $page $j] if {$j != $lastj || $i != $lasti} { @@ -247,6 +257,7 @@ static CONST unsigned char groupMap\[\] = {" } } puts $f $line + puts $f "#endif /* TCL_UTF_MAX > 3 */" puts $f "}; /* @@ -262,9 +273,7 @@ static CONST unsigned char groupMap\[\] = {" * 101 = sub delta for upper, sub 1 for title * 110 = sub delta for upper, add delta for lower * - * Bits 8-14 Reserved for future use. - * - * Bits 15-31 Case delta: delta for case conversions. This should be the + * Bits 8-31 Case delta: delta for case conversions. This should be the * highest field so we can easily sign extend. */ @@ -281,19 +290,31 @@ static CONST int groups\[\] = {" # subtract delta for title or upper set case 4 set delta $toupper + if {$tolower} { + error "New case conversion type needed: $toupper $tolower $totitle" + } } elseif {$toupper} { # subtract delta for upper, subtract 1 for title set case 5 set delta $toupper + if {($totitle != 1) || $tolower} { + error "New case conversion type needed: $toupper $tolower $totitle" + } } else { # add delta for lower, add 1 for title set case 3 set delta $tolower + if {$totitle != -1} { + error "New case conversion type needed: $toupper $tolower $totitle" + } } } elseif {$toupper} { # subtract delta for upper, add delta for lower set case 6 set delta $toupper + if {$tolower != $toupper} { + error "New case conversion type needed: $toupper $tolower $totitle" + } } elseif {$tolower} { # add delta for lower set case 2 @@ -304,7 +325,7 @@ static CONST int groups\[\] = {" set delta 0 } - append line [expr {($delta << 15) | ($case << 5) | $type}] + append line [expr {($delta << 8) | ($case << 5) | $type}] if {$i != $last} { append line ", " } @@ -316,15 +337,17 @@ static CONST int groups\[\] = {" puts $f $line puts -nonewline $f "}; +#if UTF_MAX_LEN > 3 +# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next]) +#else +# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0) +#endif + /* * The following constants are used to determine the category of a * Unicode character. */ -#define UNICODE_CATEGORY_MASK 0x1f -#define UNICODE_OUT_OF_RANGE " - puts $f [format 0x%xu $next] - puts $f " enum { UNASSIGNED, UPPERCASE_LETTER, @@ -366,14 +389,14 @@ enum { #define GetCaseType(info) (((info) & 0xe0) >> 5) #define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f) -#define GetDelta(info) (((info) > 0) ? ((info) >> 15) : (~(~((info)) >> 15))) +#define GetDelta(info) ((info) >> 8) /* * This macro extracts the information about a character from the * Unicode character tables. */ -#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) +#define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) " close $f |