From ffa41c2677af95d32514e51ef9bf3cf2cdfcf29b Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 7 Dec 2011 06:13:15 +0000 Subject: [Bug 3444754] string tolower \u01c5 is wrong --- ChangeLog | 6 ++++++ generic/tclUniData.c | 7 ++++--- tests/utf.test | 4 ++-- tools/uniParse.tcl | 48 ++++++++++++++++++++++++++++-------------------- 4 files changed, 40 insertions(+), 25 deletions(-) diff --git a/ChangeLog b/ChangeLog index 86cfbc0..d1d3718 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2011-12-07 Jan Nijtmans + + * tools/uniParse.tcl: [Bug 3444754] string tolower \u01c5 is wrong + * generic/tclUniData.c: + * tests/utf.test: + 2011-11-30 Jan Nijtmans * library/tcltest/tcltest.tcl: [Bug 967195]: Make tcltest work diff --git a/generic/tclUniData.c b/generic/tclUniData.c index 43edac1..8d8efd1 100644 --- a/generic/tclUniData.c +++ b/generic/tclUniData.c @@ -751,7 +751,7 @@ static CONST int groups[] = { 7602306, -3964863, 9830530, -6389630, 6881345, 6750273, 6717505, 2588737, 6619201, 6651969, 6783041, -3178366, 6914113, 6848577, -5341054, 6979649, -4259710, 7012417, 7143489, 7110721, 7176257, - 5, -1834878, 65633, 32931, 65698, 2588802, -3178431, -1834943, + 5, -1834878, 65633, 32963, 65698, 2588802, -3178431, -1834943, -4259775, 353730625, -5341119, 353632321, -354385790, -6389695, 2261057, 2326593, -353337214, -353238910, -353304446, 6881410, 6750338, 6717570, 6619266, 6652034, 6783106, -1385430910, 6848642, @@ -775,7 +775,8 @@ static CONST int groups[] = { * Unicode character. */ -#define UNICODE_CATEGORY_MASK 0X1F +#define UNICODE_CATEGORY_MASK 0x1F +#define UNICODE_OUT_OF_RANGE 0x10000u enum { UNASSIGNED, @@ -817,7 +818,7 @@ enum { */ #define GetCaseType(info) (((info) & 0xE0) >> 5) -#define GetCategory(info) ((info) & 0x1F) +#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1F) #define GetDelta(info) (((info) > 0) ? ((info) >> 15) : (~(~((info)) >> 15))) /* diff --git a/tests/utf.test b/tests/utf.test index 5e78666..af84b54 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -244,8 +244,8 @@ test utf-16.1 {Tcl_UniCharToLower, negative delta} { string tolower aA } aa test utf-16.2 {Tcl_UniCharToLower, positive delta} { - string tolower \u0178\u00ff\uA78D -} \u00ff\u00ff\u0265 + string tolower \u0178\u00ff\uA78D\u01c5 +} \u00ff\u00ff\u0265\u01c6 test utf-17.1 {Tcl_UniCharToLower, no delta} { string tolower ! diff --git a/tools/uniParse.tcl b/tools/uniParse.tcl index 6b71f2d..a7c857e 100644 --- a/tools/uniParse.tcl +++ b/tools/uniParse.tcl @@ -44,20 +44,20 @@ proc uni::getValue {items index} { # Extract character info set category [lindex $items 2] - if {[scan [lindex $items 12] %4x toupper] == 1} { + if {[scan [lindex $items 12] %6x toupper] == 1} { set toupper [expr {$index - $toupper}] } else { - set toupper {} + set toupper 0 } - if {[scan [lindex $items 13] %4x tolower] == 1} { + if {[scan [lindex $items 13] %6x tolower] == 1} { set tolower [expr {$tolower - $index}] } else { - set tolower {} + set tolower 0 } - if {[scan [lindex $items 14] %4x totitle] == 1} { + if {[scan [lindex $items 14] %6x totitle] == 1} { set totitle [expr {$index - $totitle}] } else { - set totitle {} + set totitle 0 } set categoryIndex [lsearch -exact $categories $category] @@ -100,16 +100,21 @@ proc uni::buildTables {data} { variable pMap {} variable pages {} - variable groups {{0,,,}} + variable groups {{0,0,0,0}} + variable next 0 set info {} ;# temporary page info set mask [expr {(1 << $shift) - 1}] - set next 0 - foreach line [split $data \n] { if {$line eq ""} { - set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n" + if {!($next & $mask)} { + # next character is already on page boundary + continue + } + # fill remaining page + set line [format %X [expr {($next-1)|$mask}]] + append line ";;Cn;0;ON;;;;;N;;;;;\n" } set items [split $line \;] @@ -119,7 +124,7 @@ proc uni::buildTables {data} { # Ignore non-BMP characters, as long as Tcl doesn't support them continue } - set index [format 0x%0.4x $index] + set index [format %d $index] set gIndex [getGroup [getValue $items $index]] @@ -167,6 +172,7 @@ proc uni::main {} { variable groups variable shift variable titleCount + variable next if {$argc != 2} { puts stderr "\nusage: $argv0 \n" @@ -178,7 +184,7 @@ proc uni::main {} { buildTables $data puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" - set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] + set size [expr {[llength $pMap]*2 + [llength $pages]*(1<<$shift)}] puts "shift = $shift, space = $size" puts "title case count = $titleCount" @@ -278,12 +284,12 @@ static CONST int groups\[\] = {" # Compute the case conversion type and delta - if {$totitle ne ""} { + if {$totitle} { if {$totitle == $toupper} { # subtract delta for title or upper set case 4 set delta $toupper - } elseif {$toupper ne ""} { + } elseif {$toupper} { # subtract delta for upper, subtract 1 for title set case 5 set delta $toupper @@ -292,11 +298,11 @@ static CONST int groups\[\] = {" set case 3 set delta $tolower } - } elseif {$toupper ne ""} { + } elseif {$toupper} { # subtract delta for upper, add delta for lower set case 6 set delta $toupper - } elseif {$tolower ne ""} { + } elseif {$tolower} { # add delta for lower set case 2 set delta $tolower @@ -316,15 +322,17 @@ static CONST int groups\[\] = {" } } puts $f $line - puts $f "}; + puts -nonewline $f "}; /* * The following constants are used to determine the category of a * Unicode character. */ -#define UNICODE_CATEGORY_MASK 0X1F - +#define UNICODE_CATEGORY_MASK 0x1F +#define UNICODE_OUT_OF_RANGE " + puts $f [format 0x%Xu $next] + puts $f " enum { UNASSIGNED, UPPERCASE_LETTER, @@ -365,7 +373,7 @@ enum { */ #define GetCaseType(info) (((info) & 0xE0) >> 5) -#define GetCategory(info) ((info) & 0x1F) +#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1F) #define GetDelta(info) (((info) > 0) ? ((info) >> 15) : (~(~((info)) >> 15))) /* -- cgit v0.12