summaryrefslogtreecommitdiffstats
path: root/tools/uniClass.tcl
blob: a4a58c2815e2cf08744f71ecd9ac1a1a2b7c2899 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/bin/sh
# The next line is executed by /bin/sh, but not tcl \
exec tclsh "$0" ${1+"$@"}

#
# uniClass.tcl --
#
#	Generates the character ranges and singletons that are used in
#	generic/regc_locale.c for translation of character classes.
#	This file must be generated using a tclsh that contains the
#	correct corresponding tclUniData.c file (generated by uniParse.tcl)
#	in order for the class ranges to match.
#

proc emitRange {first last} {
    global ranges numranges chars numchars extchars extranges

    if {$first < ($last-1)} {
	if {!$extranges && ($first) > 0xFFFF} {
	    set extranges 1
	    set numranges 0
	    set ranges [string trimright $ranges " \n\r\t,"]
	    append ranges "\n#if CHRBITS > 16\n    ,"
	}
	append ranges [format "{0x%X, 0x%X}, " \
		$first $last]
	if {[incr numranges] % 4 == 0} {
	    set ranges [string trimright $ranges]
	    append ranges "\n    "
	}
    } else {
	if {!$extchars && ($first) > 0xFFFF} {
	    set extchars 1
	    set numchars 0
	    set chars [string trimright $chars " \n\r\t,"]
	    append chars "\n#if CHRBITS > 16\n    ,"
	}
	append chars [format "0x%X, " $first]
	incr numchars
	if {$numchars % 9 == 0} {
	    set chars [string trimright $chars]
	    append chars "\n    "
	}
	if {$first != $last} {
	    append chars [format "0x%X, " $last]
	    incr numchars
	    if {$numchars % 9 == 0} {
		append chars "\n    "
	    }
	}
    }
}

proc genTable {type} {
    global first last ranges numranges chars numchars extchars extranges
    set first -2
    set last -2

    set ranges "    "
    set numranges 0
    set chars "    "
    set numchars 0
    set extchars 0
    set extranges 0

    for {set i 0} {$i <= 0xEFFFF} {incr i} {
	if {$i == 0xD800} {
	    # Skip surrogates
	    set i 0xE000
	}
	if {$i == 0xE000} {
	    # Skip private
	    set i 0xF900
	}
	if {[string is $type [format %c $i]]} {
	    if {$i == ($last + 1)} {
		set last $i
	    } else {
		if {$first >= 0} {
		    emitRange $first $last
		}
		set first $i
		set last $i
	    }
	}
    }
    emitRange $first $last

    set ranges [string trimright $ranges "\t\n ,"]
    if {$extranges} {
	append ranges "\n#endif"
    }
    set chars  [string trimright $chars "\t\n ,"]
    if {$extchars} {
	append chars "\n#endif"
    }
    if {$ranges ne ""} {
	puts "static const crange ${type}RangeTable\[\] = {\n$ranges\n};\n"
	puts "#define NUM_[string toupper $type]_RANGE ((int)(sizeof(${type}RangeTable)/sizeof(crange)))\n"
    } else {
	puts "/* no contiguous ranges of $type characters */\n"
    }
    if {$chars ne ""} {
	puts "static const chr ${type}CharTable\[\] = {\n$chars\n};\n"
	puts "#define NUM_[string toupper $type]_CHAR ((int)(sizeof(${type}CharTable)/sizeof(chr)))\n"
    } else {
	puts "/*\n * no singletons of $type characters.\n */\n"
    }
}

puts "/*
 *	Declarations of Unicode character ranges.  This code
 *	is automatically generated by the tools/uniClass.tcl script
 *	and used in generic/regc_locale.c.  Do not modify by hand.
 */
"

foreach {type desc} {
    alpha "alphabetic characters"
    control "control characters"
    digit "decimal digit characters"
    punct "punctuation characters"
    space "white space characters"
    lower "lowercase characters"
    upper "uppercase characters"
    graph "unicode print characters excluding space"
} {
    puts "/*\n * Unicode: $desc.\n */\n"
    genTable $type
}

puts "/*
 *	End of auto-generated Unicode character ranges declarations.
 */"