summaryrefslogtreecommitdiffstats
path: root/tools/uniParse.tcl
diff options
context:
space:
mode:
Diffstat (limited to 'tools/uniParse.tcl')
-rw-r--r--tools/uniParse.tcl159
1 files changed, 90 insertions, 69 deletions
diff --git a/tools/uniParse.tcl b/tools/uniParse.tcl
index 3fe38d2..e33b3c7 100644
--- a/tools/uniParse.tcl
+++ b/tools/uniParse.tcl
@@ -4,12 +4,10 @@
# corresponding tclUniData.c file with compressed character
# data tables. The input to this program should be the latest
# UnicodeData file from:
-# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
+# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Copyright (c) 1998-1999 by Scriptics Corporation.
# All rights reserved.
-#
-# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
namespace eval uni {
@@ -32,45 +30,38 @@ namespace eval uni {
Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
}; # Ordered list of character categories, must
# match the enumeration in the header file.
-
- variable titleCount 0; # Count of the number of title case
- # characters. This value is used in the
- # regular expression code to allocate enough
- # space for the title case variants.
}
proc uni::getValue {items index} {
variable categories
- variable titleCount
# Extract character info
set category [lindex $items 2]
- if {[scan [lindex $items 12] %4x toupper] == 1} {
+ if {[scan [lindex $items 12] %x toupper] == 1} {
set toupper [expr {$index - $toupper}]
} else {
- set toupper {}
+ set toupper 0
}
- if {[scan [lindex $items 13] %4x tolower] == 1} {
+ if {[scan [lindex $items 13] %x tolower] == 1} {
set tolower [expr {$tolower - $index}]
} else {
- set tolower {}
+ set tolower 0
}
- if {[scan [lindex $items 14] %4x totitle] == 1} {
+ if {[scan [lindex $items 14] %x totitle] == 1} {
set totitle [expr {$index - $totitle}]
+ } elseif {$tolower} {
+ set totitle 0
} else {
- set totitle {}
+ set totitle $toupper
}
set categoryIndex [lsearch -exact $categories $category]
if {$categoryIndex < 0} {
- puts "Unexpected character category: $index($category)"
- set categoryIndex 0
- } elseif {$category == "Lt"} {
- incr titleCount
+ error "Unexpected character category: $index($category)"
}
- return "$categoryIndex,$toupper,$tolower,$totitle"
+ return [list $categoryIndex $toupper $tolower $totitle]
}
proc uni::getGroup {value} {
@@ -87,38 +78,48 @@ proc uni::getGroup {value} {
proc uni::addPage {info} {
variable pMap
variable pages
-
+ variable shift
+
set pIndex [lsearch -exact $pages $info]
if {$pIndex == -1} {
set pIndex [llength $pages]
lappend pages $info
}
- lappend pMap $pIndex
+ lappend pMap [expr {$pIndex << $shift}]
return
}
-
+
proc uni::buildTables {data} {
variable shift
variable pMap {}
variable pages {}
- variable groups {{0,,,}}
+ variable groups {{0 0 0 0}}
+ variable next 0
set info {} ;# temporary page info
-
- set mask [expr {(1 << $shift) - 1}]
- set next 0
+ set mask [expr {(1 << $shift) - 1}]
foreach line [split $data \n] {
- if {$line == ""} {
- set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
+ if {$line eq ""} {
+ if {!($next & $mask)} {
+ # next character is already on page boundary
+ continue
+ }
+ # fill remaining page
+ set line [format %X [expr {($next-1)|$mask}]]
+ append line ";;Cn;0;ON;;;;;N;;;;;\n"
}
set items [split $line \;]
- scan [lindex $items 0] %4x index
- set index [format 0x%0.4x $index]
-
+ scan [lindex $items 0] %x index
+ if {$index > 0x2ffff} then {
+ # Ignore non-BMP characters, as long as Tcl doesn't support them
+ continue
+ }
+ set index [format %d $index]
+
set gIndex [getGroup [getValue $items $index]]
# Since the input table omits unassigned characters, these will
@@ -140,15 +141,11 @@ proc uni::buildTables {data} {
# Enter all assigned characters up to the current character
for {set i $next} {$i <= $index} {incr i} {
- # Split character index into offset and page number
- set offset [expr {$i & $mask}]
- set page [expr {($i >> $shift)}]
-
# Add the group index to the info for the current page
lappend info $gIndex
# If this is the last entry in the page, add the page
- if {$offset == $mask} {
+ if {($i & $mask) == $mask} {
addPage $info
set info {}
}
@@ -164,7 +161,7 @@ proc uni::main {} {
variable pages
variable groups
variable shift
- variable titleCount
+ variable next
if {$argc != 2} {
puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
@@ -176,9 +173,8 @@ proc uni::main {} {
buildTables $data
puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
- set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
- puts "shift = 6, space = $size"
- puts "title case count = $titleCount"
+ set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}]
+ puts "shift = $shift, space = $size"
set f [open [file join [lindex $argv 1] tclUniData.c] w]
fconfigure $f -translation lf
@@ -191,8 +187,6 @@ proc uni::main {} {
*
* Copyright (c) 1998 by Scriptics Corporation.
* All rights reserved.
- *
- * RCS: @(#) \$Id\$
*/
/*
@@ -209,20 +203,29 @@ proc uni::main {} {
* to the same alternate page number.
*/
-static unsigned char pageMap\[\] = {"
+static const unsigned short pageMap\[\] = {"
set line " "
set last [expr {[llength $pMap] - 1}]
for {set i 0} {$i <= $last} {incr i} {
+ if {$i == [expr {0x10000 >> $shift}]} {
+ set line [string trimright $line " \t,"]
+ puts $f $line
+ set lastpage [expr {[lindex $line end] >> $shift}]
+ puts stdout "lastpage: $lastpage"
+ puts $f "#if TCL_UTF_MAX > 3"
+ set line " ,"
+ }
append line [lindex $pMap $i]
if {$i != $last} {
append line ", "
}
if {[string length $line] > 70} {
- puts $f $line
+ puts $f [string trimright $line]
set line " "
}
}
puts $f $line
+ puts $f "#endif /* TCL_UTF_MAX > 3 */"
puts $f "};
/*
@@ -231,24 +234,30 @@ static unsigned char pageMap\[\] = {"
* set of character attributes.
*/
-static unsigned char groupMap\[\] = {"
+static const unsigned char groupMap\[\] = {"
set line " "
set lasti [expr {[llength $pages] - 1}]
for {set i 0} {$i <= $lasti} {incr i} {
set page [lindex $pages $i]
set lastj [expr {[llength $page] - 1}]
+ if {$i == ($lastpage + 1)} {
+ puts $f [string trimright $line " \t,"]
+ puts $f "#if TCL_UTF_MAX > 3"
+ set line " ,"
+ }
for {set j 0} {$j <= $lastj} {incr j} {
append line [lindex $page $j]
if {$j != $lastj || $i != $lasti} {
append line ", "
}
if {[string length $line] > 70} {
- puts $f $line
+ puts $f [string trimright $line]
set line " "
}
}
}
puts $f $line
+ puts $f "#endif /* TCL_UTF_MAX > 3 */"
puts $f "};
/*
@@ -260,43 +269,53 @@ static unsigned char groupMap\[\] = {"
* Bits 5-7 Case delta type: 000 = identity
* 010 = add delta for lower
* 011 = add delta for lower, add 1 for title
- * 100 = sutract delta for title/upper
+ * 100 = subtract delta for title/upper
* 101 = sub delta for upper, sub 1 for title
* 110 = sub delta for upper, add delta for lower
*
- * Bits 8-21 Reserved for future use.
- *
- * Bits 22-31 Case delta: delta for case conversions. This should be the
+ * Bits 8-31 Case delta: delta for case conversions. This should be the
* highest field so we can easily sign extend.
*/
-static int groups\[\] = {"
+static const int groups\[\] = {"
set line " "
set last [expr {[llength $groups] - 1}]
for {set i 0} {$i <= $last} {incr i} {
- foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
-
+ foreach {type toupper tolower totitle} [lindex $groups $i] {}
+
# Compute the case conversion type and delta
- if {$totitle != ""} {
+ if {$totitle} {
if {$totitle == $toupper} {
# subtract delta for title or upper
set case 4
set delta $toupper
- } elseif {$toupper != ""} {
+ if {$tolower} {
+ error "New case conversion type needed: $toupper $tolower $totitle"
+ }
+ } elseif {$toupper} {
# subtract delta for upper, subtract 1 for title
set case 5
set delta $toupper
+ if {($totitle != 1) || $tolower} {
+ error "New case conversion type needed: $toupper $tolower $totitle"
+ }
} else {
# add delta for lower, add 1 for title
set case 3
set delta $tolower
+ if {$totitle != -1} {
+ error "New case conversion type needed: $toupper $tolower $totitle"
+ }
}
- } elseif {$toupper != ""} {
+ } elseif {$toupper} {
# subtract delta for upper, add delta for lower
set case 6
set delta $toupper
- } elseif {$tolower != ""} {
+ if {$tolower != $toupper} {
+ error "New case conversion type needed: $toupper $tolower $totitle"
+ }
+ } elseif {$tolower} {
# add delta for lower
set case 2
set delta $tolower
@@ -306,27 +325,29 @@ static int groups\[\] = {"
set delta 0
}
- set val [expr {($delta << 22) | ($case << 5) | $type}]
-
- append line [format "%d" $val]
+ append line [expr {($delta << 8) | ($case << 5) | $type}]
if {$i != $last} {
append line ", "
}
if {[string length $line] > 65} {
- puts $f $line
+ puts $f [string trimright $line]
set line " "
}
}
puts $f $line
- puts $f "};
+ puts -nonewline $f "};
+
+#if TCL_UTF_MAX > 3
+# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next])
+#else
+# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0)
+#endif
/*
* The following constants are used to determine the category of a
* Unicode character.
*/
-#define UNICODE_CATEGORY_MASK 0X1F
-
enum {
UNASSIGNED,
UPPERCASE_LETTER,
@@ -366,16 +387,16 @@ enum {
* to do sign extension on right shifts.
*/
-#define GetCaseType(info) (((info) & 0xE0) >> 5)
-#define GetCategory(info) ((info) & 0x1F)
-#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
+#define GetCaseType(info) (((info) & 0xe0) >> 5)
+#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f)
+#define GetDelta(info) ((info) >> 8)
/*
* This macro extracts the information about a character from the
* Unicode character tables.
*/
-#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
+#define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
"
close $f