1 files changed, 369 insertions, 0 deletions
diff --git a/tools/uniParse.tcl b/tools/uniParse.tcl
new file mode 100644
index 0000000..f92275f
--- /dev/null
+++ b/tools/uniParse.tcl
@@ -0,0 +1,369 @@
+namespace eval uni {
+    set shift 9			;# number of bits of data within a page
+    variable pMap		;# map from page to page index, each entry is
+				 # an index into the pages table, indexed by
+				 # page number
+    variable pages		;# map from page index to page info, each
+				 # entry is a list of indices into the groups
+				 # table, the list is indexed by the offset
+    variable groups		;# list of character info values, indexed by
+				 # group number, initialized with the
+				 # unassigned character group
+
+    variable categories {
+	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
+	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
+    }				;# Ordered list of character categories, must
+				 # match the enumeration in the header file.
+
+    variable titleCount 0	;# Count of the number of title case
+				 # characters.  This value is used in the
+				 # regular expression code to allocate enough
+				 # space for the title case variants.
+}
+
+proc uni::getValue {items index} {
+    variable categories
+    variable titleCount
+
+    # Extract character info
+
+    set category [lindex $items 2]
+    if {[scan [lindex $items 12] %4x toupper] == 1} {
+	set toupper [expr {$index - $toupper}]
+    } else {
+	set toupper {}
+    }
+    if {[scan [lindex $items 13] %4x tolower] == 1} {
+	set tolower [expr {$tolower - $index}]
+    } else {
+	set tolower {}
+    }
+    if {[scan [lindex $items 14] %4x totitle] == 1} {
+	set totitle [expr {$index - $totitle}]
+    } else {
+	set totitle {}
+    }
+
+    set categoryIndex [lsearch -exact $categories $category]
+    if {$categoryIndex < 0} {
+	puts "Unexpected character category: $index($category)"
+	set categoryIndex 0
+    } elseif {$category == "Lt"} {
+	incr titleCount
+    }
+
+    return "$categoryIndex,$toupper,$tolower,$totitle"
+}
+
+proc uni::getGroup {value} {
+    variable groups
+
+    set gIndex [lsearch -exact $groups $value]
+    if {$gIndex == -1} {
+	set gIndex [llength $groups]
+	lappend groups $value
+    }
+    return $gIndex
+}
+
+proc uni::addPage {info} {
+    variable pMap
+    variable pages
+    
+    set pIndex [lsearch -exact $pages $info]
+    if {$pIndex == -1} {
+	set pIndex [llength $pages]
+	lappend pages $info
+    }
+    lappend pMap $pIndex
+    return
+}
+    
+proc uni::buildTables {data} {
+    variable shift
+
+    variable pMap {}
+    variable pages {}
+    variable groups {{0,,,}}
+    set info {}			;# temporary page info
+    
+    set mask [expr {(1 << $shift) - 1}]
+
+    set next 0
+
+    foreach line [split $data \n] {
+	if {$line == ""} {
+	    set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
+	}
+
+	set items [split $line \;]
+
+	scan [lindex $items 0] %4x index
+	set index [format 0x%0.4x $index]
+	
+	set gIndex [getGroup [getValue $items $index]]
+
+	# Since the input table omits unassigned characters, these will
+	# show up as gaps in the index sequence.  There are a few special cases
+	# where the gaps correspond to a uniform block of assigned characters.
+	# These are indicated as such in the character name.
+
+	# Enter all unassigned characters up to the current character.
+	if {($index > $next) \
+		&& ![regexp "Last>$" [lindex $items 1]]} {
+	    for {} {$next < $index} {incr next} {
+		lappend info 0
+		if {($next & $mask) == $mask} {
+		    addPage $info
+		    set info {}
+		}
+	    }
+	}
+
+	# Enter all assigned characters up to the current character
+	for {set i $next} {$i <= $index} {incr i} {
+	    # Split character index into offset and page number
+	    set offset [expr {$i & $mask}]
+	    set page [expr {($i >> $shift)}]
+
+	    # Add the group index to the info for the current page
+	    lappend info $gIndex
+
+	    # If this is the last entry in the page, add the page
+	    if {$offset == $mask} {
+		addPage $info
+		set info {}
+	    }
+	}
+	set next [expr {$index + 1}]
+    }
+    return
+}
+
+proc uni::main {} {
+    global argc argv0 argv
+    variable pMap
+    variable pages
+    variable groups
+    variable shift
+    variable titleCount
+
+    if {$argc != 2 && $argc != 3} {
+	puts stderr "\nusage: $argv0 <datafile> <outdir> ?optimize?\n"
+	exit 1
+    }
+    set f [open [lindex $argv 0] r]
+    set data [read $f]
+    close $f
+
+    set shift 6
+    buildTables $data
+    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
+    set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
+    puts "shift = 6, space = $size"
+    puts "title case count = $titleCount"
+
+    set f [open [file join [lindex $argv 1] tclUniData.c] w]
+    puts $f "/*
+ * tclUtfData.c --
+ *
+ *	Declarations of Unicode character information tables.  This file is
+ *	automatically generated by the tools/uniParse.tcl script.  Do not
+ *	modify this file by hand.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ * All rights reserved.
+ *
+ * RCS: @(#) \$Id\$
+ */
+
+/*
+ * A 16-bit Unicode character is split into two parts in order to index
+ * into the following tables.  The lower OFFSET_BITS comprise an offset
+ * into a page of characters.  The upper bits comprise the page number.
+ */
+
+#define OFFSET_BITS $shift
+
+/*
+ * The pageMap is indexed by page number and returns an alternate page number
+ * that identifies a unique page of characters.  Many Unicode characters map
+ * to the same alternate page number.
+ */
+
+static char pageMap\[\] = {"
+    set line "    "
+    set last [expr {[llength $pMap] - 1}]
+    for {set i 0} {$i <= $last} {incr i} {
+	append line [lindex $pMap $i]
+	if {$i != $last} {
+	    append line ", "
+	}
+	if {[string length $line] > 70} {
+	    puts $f $line
+	    set line "    "
+	}
+    }
+    puts $f $line
+    puts $f "};
+
+/*
+ * The groupMap is indexed by combining the alternate page number with
+ * the page offset and returns a group number that identifies a unique
+ * set of character attributes.
+ */
+
+static char groupMap\[\] = {"
+    set line "    "
+    set lasti [expr {[llength $pages] - 1}]
+    for {set i 0} {$i <= $lasti} {incr i} {
+	set page [lindex $pages $i]
+	set lastj [expr {[llength $page] - 1}]
+	for {set j 0} {$j <= $lastj} {incr j} {
+	    append line [lindex $page $j]
+	    if {$j != $lastj || $i != $lasti} {
+		append line ", "
+	    }
+	    if {[string length $line] > 70} {
+		puts $f $line
+		set line "    "
+	    }
+	}
+    }
+    puts $f $line
+    puts $f "};
+
+/*
+ * Each group represents a unique set of character attributes.  The attributes
+ * are encoded into a 32-bit value as follows:
+ *
+ * Bits 0-4	Character category: see the constants listed below.
+ *
+ * Bits 5-7	Case delta type: 000 = identity
+ *				 010 = add delta for lower
+ *				 011 = add delta for lower, add 1 for title
+ *				 100 = sutract delta for title/upper
+ *				 101 = sub delta for upper, sub 1 for title
+ *				 110 = sub delta for upper, add delta for lower
+ *
+ * Bits 8-21	Reserved for future use.
+ *
+ * Bits 22-31	Case delta: delta for case conversions.  This should be the
+ *			    highest field so we can easily sign extend.
+ */
+
+static int groups\[\] = {"
+    set line "    "
+    set last [expr {[llength $groups] - 1}]
+    for {set i 0} {$i <= $last} {incr i} {
+	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
+	
+	# Compute the case conversion type and delta
+
+	if {$totitle != ""} {
+	    if {$totitle == $toupper} {
+		# subtract delta for title or upper
+		set case 4
+		set delta $toupper
+	    } elseif {$toupper != ""} {
+		# subtract delta for upper, subtract 1 for title
+		set case 5
+		set delta $toupper
+	    } else {
+		# add delta for lower, add 1 for title
+		set case 3
+		set delta $tolower
+	    }
+	} elseif {$toupper != ""} {
+	    # subtract delta for upper, add delta for lower
+	    set case 6
+	    set delta $toupper
+	} elseif {$tolower != ""} {
+	    # add delta for lower
+	    set case 2
+	    set delta $tolower
+	} else {
+	    # noop
+	    set case 0
+	    set delta 0
+	}
+
+	set val [expr {($delta << 22) | ($case << 5) | $type}]
+
+	append line [format "%d" $val]
+	if {$i != $last} {
+	    append line ", "
+	}
+	if {[string length $line] > 65} {
+	    puts $f $line
+	    set line "    "
+	}
+    }
+    puts $f $line
+    puts $f "};
+
+/*
+ * The following constants are used to determine the category of a
+ * Unicode character.
+ */
+
+#define UNICODE_CATEGORY_MASK 0X1F
+
+enum {
+    UNASSIGNED,
+    UPPERCASE_LETTER,
+    LOWERCASE_LETTER,
+    TITLECASE_LETTER,
+    MODIFIER_LETTER,
+    OTHER_LETTER,
+    NON_SPACING_MARK,
+    ENCLOSING_MARK,
+    COMBINING_SPACING_MARK,
+    DECIMAL_DIGIT_NUMBER,
+    LETTER_NUMBER,
+    OTHER_NUMBER,
+    SPACE_SEPARATOR,
+    LINE_SEPARATOR,
+    PARAGRAPH_SEPARATOR,
+    CONTROL,
+    FORMAT,
+    PRIVATE_USE,
+    SURROGATE,
+    CONNECTOR_PUNCTUATION,
+    DASH_PUNCTUATION,
+    OPEN_PUNCTUATION,
+    CLOSE_PUNCTUATION,
+    INITIAL_QUOTE_PUNCTUATION,
+    FINAL_QUOTE_PUNCTUATION,
+    OTHER_PUNCTUATION,
+    MATH_SYMBOL,
+    CURRENCY_SYMBOL,
+    MODIFIER_SYMBOL,
+    OTHER_SYMBOL
+};
+
+/*
+ * The following macros extract the fields of the character info.  The
+ * GetDelta() macro is complicated because we can't rely on the C compiler
+ * to do sign extension on right shifts.
+ */
+
+#define GetCaseType(info) (((info) & 0xE0) >> 5)
+#define GetCategory(info) ((info) & 0x1F)
+#define GetDelta(infO) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
+
+/*
+ * This macro extracts the information about a character from the
+ * Unicode character tables.
+ */
+
+#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
+"
+
+    close $f
+}
+
+uni::main
+
+return