Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 231
1 file changed, 183 insertions, 48 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index c35170c..d503190 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -25,18 +25,20 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
 
-import sys
+import sys, os, zipfile
 
 SCRIPT = sys.argv[0]
-VERSION = "2.6"
+VERSION = "3.2"
 
 # The Unicode Database
-UNIDATA_VERSION = "5.1.0"
+UNIDATA_VERSION = "6.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
+LINE_BREAK = "LineBreak%s.txt"
 
 old_versions = ["3.2.0"]
@@ -51,6 +53,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
 
 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
 
+MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
+
 # note: should match definitions in Objects/unicodectype.c
 ALPHA_MASK = 0x01
 DECIMAL_MASK = 0x02
@@ -64,26 +68,29 @@ XID_START_MASK = 0x100
 XID_CONTINUE_MASK = 0x200
 PRINTABLE_MASK = 0x400
 NODELTA_MASK = 0x800
+NUMERIC_MASK = 0x1000
+
+# these ranges need to match unicodedata.c:is_unified_ideograph
+cjk_ranges = [
+    ('3400', '4DB5'),
+    ('4E00', '9FCB'),
+    ('20000', '2A6D6'),
+    ('2A700', '2B734'),
+    ('2B740', '2B81D')
+]
 
 def maketables(trace=0):
 
     print("--- Reading", UNICODE_DATA % "", "...")
 
     version = ""
-    unicode = UnicodeData(UNICODE_DATA % version,
-                          COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version,
-                          DERIVED_CORE_PROPERTIES % version,
-                          DERIVEDNORMALIZATION_PROPS % version)
+    unicode = UnicodeData(UNIDATA_VERSION)
 
     print(len(list(filter(None, unicode.table))), "characters")
 
     for version in old_versions:
         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
-                                  COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version),
-                                  DERIVED_CORE_PROPERTIES % ("-"+version))
+        old_unicode = UnicodeData(version, cjk_check=False)
         print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
 
@@ -357,6 +364,9 @@ def makeunicodetype(unicode, trace):
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
+    numeric = {}
+    spaces = []
+    linebreaks = []
 
     for char in unicode.chars:
         record = unicode.table[char]
@@ -371,10 +381,12 @@ def makeunicodetype(unicode, trace):
                 flags |= ALPHA_MASK
             if category == "Ll":
                 flags |= LOWER_MASK
-            if category == "Zl" or bidirectional == "B":
+            if 'Line_Break' in properties or bidirectional == "B":
                 flags |= LINEBREAK_MASK
+                linebreaks.append(char)
             if category == "Zs" or bidirectional in ("WS", "B", "S"):
                 flags |= SPACE_MASK
+                spaces.append(char)
             if category == "Lt":
                 flags |= TITLE_MASK
             if category == "Lu":
@@ -423,6 +435,9 @@ def makeunicodetype(unicode, trace):
             if record[7]:
                 flags |= DIGIT_MASK
                 digit = int(record[7])
+            if record[8]:
+                flags |= NUMERIC_MASK
+                numeric.setdefault(record[8], []).append(char)
             item = (
                 upper, lower, title, decimal, digit, flags
                 )
@@ -434,6 +449,9 @@ def makeunicodetype(unicode, trace):
         index[char] = i
 
     print(len(table), "unique character type entries")
+    print(sum(map(len, numeric.values())), "numeric code points")
+    print(len(spaces), "whitespace code points")
+    print(len(linebreaks), "linebreak code points")
 
     print("--- Writing", FILE, "...")
 
@@ -455,6 +473,63 @@ def makeunicodetype(unicode, trace):
     Array("index1", index1).dump(fp, trace)
     Array("index2", index2).dump(fp, trace)
 
+    # Generate code for _PyUnicode_ToNumeric()
+    numeric_items = sorted(numeric.items())
+    print('/* Returns the numeric value as double for Unicode characters', file=fp)
+    print(' * having this property, -1.0 otherwise.', file=fp)
+    print(' */', file=fp)
+    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
+    print('{', file=fp)
+    print('    switch (ch) {', file=fp)
+    for value, codepoints in numeric_items:
+        # Turn text into float literals
+        parts = value.split('/')
+        parts = [repr(float(part)) for part in parts]
+        value = '/'.join(parts)
+
+        codepoints.sort()
+        for codepoint in codepoints:
+            print('    case 0x%04X:' % (codepoint,), file=fp)
+        print('        return (double) %s;' % (value,), file=fp)
+    print('    }', file=fp)
+    print('    return -1.0;', file=fp)
+    print('}', file=fp)
+    print(file=fp)
+
+    # Generate code for _PyUnicode_IsWhitespace()
+    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
+    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
+    print(" */", file=fp)
+    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
+    print('{', file=fp)
+    print('    switch (ch) {', file=fp)
+
+    for codepoint in sorted(spaces):
+        print('    case 0x%04X:' % (codepoint,), file=fp)
+    print('        return 1;', file=fp)
+
+    print('    }', file=fp)
+    print('    return 0;', file=fp)
+    print('}', file=fp)
+    print(file=fp)
+
+    # Generate code for _PyUnicode_IsLinebreak()
+    print("/* Returns 1 for Unicode characters having the line break", file=fp)
+    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
+    print(" * type 'B', 0 otherwise.", file=fp)
+    print(" */", file=fp)
+    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
+    print('{', file=fp)
+    print('    switch (ch) {', file=fp)
+    for codepoint in sorted(linebreaks):
+        print('    case 0x%04X:' % (codepoint,), file=fp)
+    print('        return 1;', file=fp)
+
+    print('    }', file=fp)
+    print('    return 0;', file=fp)
+    print('}', file=fp)
+    print(file=fp)
+
     fp.close()
 
 # --------------------------------------------------------------------
@@ -670,12 +745,11 @@ def merge_old_version(version, new, old):
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
-                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
-                            assert re.match("^[0-9]+$", value)
-                            numeric_changes[i] = int(value)
+                            numeric_changes[i] = float(value)
+                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
@@ -696,6 +770,10 @@ def merge_old_version(version, new, old):
                    elif k == 16:
                        # derived property changes; not yet
                        pass
+                    elif k == 17:
+                        # normalization quickchecks are not performed
+                        # for older versions
+                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
@@ -704,6 +782,21 @@ def merge_old_version(version, new, old):
                             numeric_changes)),
                    normalization_changes))
 
+def open_data(template, version):
+    local = template % ('-'+version,)
+    if not os.path.exists(local):
+        import urllib.request
+        if version == '3.2.0':
+            # irregular url structure
+            url = 'http://www.unicode.org/Public/3.2-Update/' + local
+        else:
+            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
+        urllib.request.urlretrieve(url, filename=local)
+    if local.endswith('.txt'):
+        return open(local, encoding='utf-8')
+    else:
+        # Unihan.zip
+        return open(local, 'rb')
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 
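
The _PyUnicode_ToNumeric() generator above leans on a small textual conversion: each numeric-field string from record[8] is rewritten part by part into float literals, so the emitted C constant expression divides in floating point rather than as integers. A standalone sketch of just that conversion (not part of the patch):

    def c_numeric_literal(value):
        # "1/3" -> "1.0/3.0", "3" -> "3.0"; repr(float(...)) keeps full precision
        parts = value.split('/')
        return '/'.join(repr(float(part)) for part in parts)

    assert c_numeric_literal('3') == '3.0'
    assert c_numeric_literal('1/3') == '1.0/3.0'

For a code point carrying "1/3" the generated case therefore reads "return (double) 1.0/3.0;", which evaluates in floating point to the expected 0.333... value.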
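The new open_data() helper downloads a UCD file on first use and caches it next to the script. The template's %s slot is filled with "-<version>" for the local cache name but with "" for the remote name, which lives under a versioned ucd/ directory. A worked usage sketch under those rules (the paths shown are derived by hand, not program output):

    UNICODE_DATA = "UnicodeData%s.txt"
    version = '6.0.0'

    local = UNICODE_DATA % ('-'+version,)
    assert local == 'UnicodeData-6.0.0.txt'

    url = ('http://www.unicode.org/Public/%s/ucd/' + UNICODE_DATA) % (version, '')
    assert url == 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt'
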
@@ -711,8 +804,6 @@ def merge_old_version(version, new, old):
 
 # load a unicode-data file from disk
 
-import sys
-
 class UnicodeData:
     # Record structure:
     # [ID, name, category, combining, bidi, decomp,  (6)
@@ -720,10 +811,12 @@ class UnicodeData:
     #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
 
-    def __init__(self, filename, exclusions, eastasianwidth,
-                 derivedprops, derivednormalizationprops=None, expand=1):
+    def __init__(self, version,
+                 linebreakprops=False,
+                 expand=1,
+                 cjk_check=True):
         self.changed = []
-        file = open(filename)
+        file = open_data(UNICODE_DATA, version)
         table = [None] * 0x110000
         while 1:
             s = file.readline()
@@ -733,6 +826,8 @@ class UnicodeData:
             char = int(s[0], 16)
             table[char] = s
 
+        cjk_ranges_found = []
+
         # expand first-last ranges
         if expand:
             field = None
@@ -743,19 +838,24 @@ class UnicodeData:
                         s[1] = ""
                         field = s
                     elif s[1][-5:] == "Last>":
+                        if s[1].startswith("<CJK Ideograph"):
+                            cjk_ranges_found.append((field[0],
+                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
+            if cjk_check and cjk_ranges != cjk_ranges_found:
+                raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
 
        # public attributes
-        self.filename = filename
+        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2
 
-        file = open(exclusions)
+        file = open_data(COMPOSITION_EXCLUSIONS, version)
        self.exclusions = {}
        for s in file:
            s = s.strip()
@@ -767,7 +867,7 @@ class UnicodeData:
                self.exclusions[char] = 1
 
        widths = [None] * 0x110000
-        for s in open(eastasianwidth):
+        for s in open_data(EASTASIAN_WIDTH, version):
            s = s.strip()
            if not s:
                continue
@@ -788,7 +888,7 @@ class UnicodeData:
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
-        for s in open(derivedprops):
+        for s in open_data(DERIVED_CORE_PROPERTIES, version):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue
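
The expansion logic above deals with UnicodeData.txt's compressed CJK blocks: each block appears only as a <..., First>/<..., Last> pair of records, every code point in between inherits a clone of the First record, and the endpoints are collected so the cjk_check pass can compare them against the hard-coded cjk_ranges. A self-contained miniature of that loop over a hypothetical two-record table:

    records = {
        0x4E00: ['4E00', '<CJK Ideograph, First>', 'Lo'],
        0x9FCB: ['9FCB', '<CJK Ideograph, Last>', 'Lo'],
    }
    cjk_ranges_found = []
    field = None
    for i in range(0x4E00, 0x9FCC):
        s = records.get(i)
        if s:
            if s[1][-6:] == "First>":
                s[1] = ""
                field = s
            elif s[1][-5:] == "Last>":
                if s[1].startswith("<CJK Ideograph"):
                    cjk_ranges_found.append((field[0], s[0]))
                s[1] = ""
                field = None
        elif field:
            f2 = field[:]            # clone the First record
            f2[0] = "%X" % i
            records[i] = f2

    assert cjk_ranges_found == [('4E00', '9FCB')]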
@@ -807,28 +907,64 @@ class UnicodeData:
                # apply to unassigned code points; ignore them
                table[char][-1].add(p)
 
-        if derivednormalizationprops:
-            quickchecks = [0] * 0x110000 # default is Yes
-            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-            for s in open(derivednormalizationprops):
-                if '#' in s:
-                    s = s[:s.index('#')]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in qc_order:
-                    continue
-                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
-                quickcheck_shift = qc_order.index(s[1])*2
-                quickcheck <<= quickcheck_shift
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    assert not (quickchecks[char]>>quickcheck_shift)&3
-                    quickchecks[char] |= quickcheck
-            for i in range(0, 0x110000):
-                if table[i] is not None:
-                    table[i].append(quickchecks[i])
+        for s in open_data(LINE_BREAK, version):
+            s = s.partition('#')[0]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                continue
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                table[char][-1].add('Line_Break')
+
+        # We only want the quickcheck properties
+        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
+        # Yes is the default, hence only N and M occur
+        # In 3.2.0, the format was different (NF?_NO)
+        # The parsing will incorrectly determine these as
+        # "yes", however, unicodedata.c will not perform quickchecks
+        # for older versions, and no delta records will be created.
+        quickchecks = [0] * 0x110000
+        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
+            if '#' in s:
+                s = s[:s.index('#')]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in qc_order:
+                continue
+            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+            quickcheck_shift = qc_order.index(s[1])*2
+            quickcheck <<= quickcheck_shift
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                assert not (quickchecks[char]>>quickcheck_shift)&3
+                quickchecks[char] |= quickcheck
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(quickchecks[i])
+
+        zip = zipfile.ZipFile(open_data(UNIHAN, version))
+        if version == '3.2.0':
+            data = zip.open('Unihan-3.2.0.txt').read()
+        else:
+            data = zip.open('Unihan_NumericValues.txt').read()
+        for line in data.decode("utf-8").splitlines():
+            if not line.startswith('U+'):
+                continue
+            code, tag, value = line.split(None, 3)[:3]
+            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
+                           'kOtherNumeric'):
+                continue
+            value = value.strip().replace(',', '')
+            i = int(code[2:], 16)
+            # Patch the numeric field
+            if table[i] is not None:
+                table[i][8] = value
 
    def uselatin1(self):
        # restrict character range to ISO Latin 1
@@ -979,7 +1115,6 @@ def splitbins(t, trace=0):
    you'll get.
    """
 
-    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
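
The quickcheck loop above packs four normalization answers into one integer per code point: two bits per form, in qc_order, where 0 means Yes (the default, never written), 1 Maybe and 2 No ('MN'.index(s[2]) + 1). A sketch of the matching decoder; the helper name is invented for illustration:

    QC_ORDER = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()

    def decode_quickchecks(packed):
        # two bits per form, least-significant pair first (NFD_QC)
        names = {0: 'Yes', 1: 'Maybe', 2: 'No'}
        return {form: names[(packed >> 2*i) & 3]
                for i, form in enumerate(QC_ORDER)}

    # NFC_QC=Maybe with everything else Yes is stored as 1 << (2*2):
    assert decode_quickchecks(1 << 4) == {'NFD_QC': 'Yes', 'NFKD_QC': 'Yes',
                                          'NFC_QC': 'Maybe', 'NFKC_QC': 'Yes'}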
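
The Unihan pass reads numeric values straight from the zipped database and patches them into field 8 of the main table, the same field that later drives NUMERIC_MASK and _PyUnicode_ToNumeric(). A sketch of the per-line parsing on a single record in the documented tab-separated format (the concrete code point and value are a hand-picked sample, not program output):

    line = 'U+4E07\tkPrimaryNumeric\t10000'
    code, tag, value = line.split(None, 3)[:3]
    i = int(code[2:], 16)                    # 0x4E07
    value = value.strip().replace(',', '')   # large values may carry commas
    assert (i, tag, value) == (0x4E07, 'kPrimaryNumeric', '10000')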