| author | Amaury Forgeot d'Arc <amauryfa@gmail.com> | 2009-10-06 19:56:32 (GMT) |
|---|---|---|
| committer | Amaury Forgeot d'Arc <amauryfa@gmail.com> | 2009-10-06 19:56:32 (GMT) |
| commit | d0052d17b1a067e4aa8a69f5564a4b94e0c00502 (patch) | |
| tree | c80b69d55175b2f9d9090262963fc1989358b963 /Tools | |
| parent | 85ea4bf781203d8b4fd2873791d0a7a26e103652 (diff) | |
#1571184: makeunicodedata.py now generates the functions _PyUnicode_ToNumeric,
_PyUnicode_IsLinebreak and _PyUnicode_IsWhitespace.
It now also parses Unihan.txt for numeric values.
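
For reference, the numeric data that the script now pulls in is stored in Unihan.txt as `U+XXXX<tab>tag<tab>value` records under the tags `kAccountingNumeric`, `kPrimaryNumeric` and `kOtherNumeric`. Below is a minimal standalone sketch of that parsing step; the `parse_unihan_numeric` helper and the sample records are illustrative only — the patch itself does the equivalent work inside `UnicodeData.__init__` (see the diff below).

```python
# Minimal sketch of the Unihan numeric parsing added by this patch.
# The helper name and sample records are illustrative, not part of the patch.
NUMERIC_TAGS = ('kAccountingNumeric', 'kPrimaryNumeric', 'kOtherNumeric')

def parse_unihan_numeric(lines):
    """Map code point -> numeric value string, thousands separators stripped."""
    numeric = {}
    for line in lines:
        if not line.startswith('U+'):
            continue                      # skip comments and blank lines
        code, tag, value = line.split(None, 3)[:3]
        if tag not in NUMERIC_TAGS:
            continue                      # only the three numeric tags matter
        numeric[int(code[2:], 16)] = value.strip().replace(',', '')
    return numeric

sample = [
    "U+4E07\tkPrimaryNumeric\t10000",     # 万
    "U+58F9\tkAccountingNumeric\t1",      # 壹
]
print(parse_unihan_numeric(sample))       # {19975: '10000', 22777: '1'}
```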
Diffstat (limited to 'Tools')
| -rw-r--r-- | Tools/unicode/makeunicodedata.py | 131 |

1 file changed, 123 insertions, 8 deletions
```diff
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index e3842e5..92268ad 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+UNIHAN = "Unihan%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 
 old_versions = ["3.2.0"]
@@ -59,6 +60,7 @@ SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
 NODELTA_MASK = 0x100
+NUMERIC_MASK = 0x200
 
 def maketables(trace=0):
 
@@ -68,6 +70,7 @@ def maketables(trace=0):
     unicode = UnicodeData(UNICODE_DATA % version,
                           COMPOSITION_EXCLUSIONS % version,
                           EASTASIAN_WIDTH % version,
+                          UNIHAN % version,
                           DERIVEDNORMALIZATION_PROPS % version)
 
     print len(filter(None, unicode.table)), "characters"
@@ -76,7 +79,8 @@
         print "--- Reading", UNICODE_DATA % ("-"+version), "..."
         old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                   COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version))
+                                  EASTASIAN_WIDTH % ("-"+version),
+                                  UNIHAN % ("-"+version))
         print len(filter(None, old_unicode.table)), "characters"
         merge_old_version(version, unicode, old_unicode)
 
@@ -352,6 +356,9 @@ def makeunicodetype(unicode, trace):
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
+    numeric = {}
+    spaces = []
+    linebreaks = []
 
     for char in unicode.chars:
         record = unicode.table[char]
@@ -367,8 +374,10 @@
                 flags |= LOWER_MASK
             if category == "Zl" or bidirectional == "B":
                 flags |= LINEBREAK_MASK
+                linebreaks.append(char)
             if category == "Zs" or bidirectional in ("WS", "B", "S"):
                 flags |= SPACE_MASK
+                spaces.append(char)
             if category == "Lt":
                 flags |= TITLE_MASK
             if category == "Lu":
@@ -411,6 +420,9 @@
             if record[7]:
                 flags |= DIGIT_MASK
                 digit = int(record[7])
+            if record[8]:
+                flags |= NUMERIC_MASK
+                numeric.setdefault(record[8], []).append(char)
             item = (
                 upper, lower, title, decimal, digit, flags
                 )
@@ -422,6 +434,9 @@
             index[char] = i
 
     print len(table), "unique character type entries"
+    print sum(map(len, numeric.values())), "numeric code points"
+    print len(spaces), "whitespace code points"
+    print len(linebreaks), "linebreak code points"
 
     print "--- Writing", FILE, "..."
 
@@ -443,6 +458,97 @@
     Array("index1", index1).dump(fp, trace)
     Array("index2", index2).dump(fp, trace)
 
+    # Generate code for _PyUnicode_ToNumeric()
+    numeric_items = numeric.items()
+    numeric_items.sort()
+    print >>fp, '/* Returns the numeric value as double for Unicode characters'
+    print >>fp, ' * having this property, -1.0 otherwise.'
+    print >>fp, ' */'
+    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '    switch (ch) {'
+    for value, codepoints in numeric_items:
+        haswide = False
+        hasnonewide = False
+        codepoints.sort()
+        for codepoint in codepoints:
+            if codepoint < 0x10000:
+                hasnonewide = True
+            if codepoint >= 0x10000 and not haswide:
+                print >>fp, '#ifdef Py_UNICODE_WIDE'
+                haswide = True
+            print >>fp, '    case 0x%04X:' % (codepoint,)
+        if haswide and hasnonewide:
+            print >>fp, '#endif'
+        print >>fp, '        return (double) %s;' % (value,)
+        if haswide and not hasnonewide:
+            print >>fp, '#endif'
+    print >>fp,'    }'
+    print >>fp,'    return -1.0;'
+    print >>fp,'}'
+    print >>fp
+
+    # Generate code for _PyUnicode_IsWhitespace()
+    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
+    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
+    print >>fp, " */"
+    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
+    print >>fp, '    return iswspace(ch);'
+    print >>fp, '#else'
+    print >>fp, '    switch (ch) {'
+
+    haswide = False
+    hasnonewide = False
+    spaces.sort()
+    for codepoint in spaces:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print >>fp, '#ifdef Py_UNICODE_WIDE'
+            haswide = True
+        print >>fp, '    case 0x%04X:' % (codepoint,)
+    if haswide and hasnonewide:
+        print >>fp, '#endif'
+    print >>fp, '        return 1;'
+    if haswide and not hasnonewide:
+        print >>fp, '#endif'
+
+    print >>fp,'    }'
+    print >>fp,'    return 0;'
+    print >>fp, '#endif'
+    print >>fp,'}'
+    print >>fp
+
+    # Generate code for _PyUnicode_IsLinebreak()
+    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
+    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
+    print >>fp, " */"
+    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '    switch (ch) {'
+    haswide = False
+    hasnonewide = False
+    linebreaks.sort()
+    for codepoint in linebreaks:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print >>fp, '#ifdef Py_UNICODE_WIDE'
+            haswide = True
+        print >>fp, '    case 0x%04X:' % (codepoint,)
+    if haswide and hasnonewide:
+        print >>fp, '#endif'
+    print >>fp, '        return 1;'
+    if haswide and not hasnonewide:
+        print >>fp, '#endif'
+
+    print >>fp,'    }'
+    print >>fp,'    return 0;'
+    print >>fp,'}'
+    print >>fp
+
     fp.close()
 
 # --------------------------------------------------------------------
@@ -660,12 +766,11 @@ def merge_old_version(version, new, old):
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
-                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
-                            assert re.match("^[0-9]+$", value)
-                            numeric_changes[i] = int(value)
+                            numeric_changes[i] = float(value)
+                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
@@ -698,11 +803,9 @@
 # --------------------------------------------------------------------
 # load a unicode-data file from disk
 
-import sys
-
 class UnicodeData:
 
-    def __init__(self, filename, exclusions, eastasianwidth,
+    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                  derivednormalizationprops=None, expand=1):
         self.changed = []
         file = open(filename)
@@ -789,6 +892,19 @@ class UnicodeData:
                if table[i] is not None:
                    table[i].append(quickchecks[i])
 
+        for line in open(unihan):
+            if not line.startswith('U+'):
+                continue
+            code, tag, value = line.split(None, 3)[:3]
+            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
+                           'kOtherNumeric'):
+                continue
+            value = value.strip().replace(',', '')
+            i = int(code[2:], 16)
+            # Patch the numeric field
+            if table[i] is not None:
+                table[i][8] = value
+
     def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
@@ -938,7 +1054,6 @@
     you'll get.
     """
 
-    import sys
     if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
```
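Taken together with the rest of the change set (the C consumers of these generated tables live outside this Tools-only view), the practical effect is that Han numerals gain numeric values and the numeric character class at the Python level. A hedged example of the intended behaviour on an interpreter built with the regenerated database:

```python
# Expected behaviour with the regenerated Unicode database compiled in;
# illustrative only, the exact values come from the Unihan numeric tags.
import unicodedata

print(unicodedata.numeric(u'\u4e07'))  # 10000.0 -- U+4E07, kPrimaryNumeric 10000
print(unicodedata.numeric(u'\u58f9'))  # 1.0     -- U+58F9, kAccountingNumeric 1
print(u'\u4e07'.isnumeric())           # True, via the new NUMERIC_MASK flag
```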