From a65678c5c90002c5e40fa82746de07e6217df625 Mon Sep 17 00:00:00 2001
From: Greg Price
Date: Thu, 12 Sep 2019 02:23:43 -0700
Subject: bpo-37760: Convert from length-18 lists to a dataclass, in
 makeunicodedata. (GH-15265)

Now the fields have names!  Much easier to keep straight as a reader
than the elements of an 18-tuple.

Runs about 10-15% slower: from 10.8s to 12.3s, on my laptop.
Fortunately that's perfectly fine for this maintenance script.
---
 .../Build/2019-08-24-17-39-09.bpo-37760.f3jXuH.rst |   6 +
 Tools/unicode/makeunicodedata.py                   | 150 ++++++++++++---------
 2 files changed, 94 insertions(+), 62 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Build/2019-08-24-17-39-09.bpo-37760.f3jXuH.rst

diff --git a/Misc/NEWS.d/next/Build/2019-08-24-17-39-09.bpo-37760.f3jXuH.rst b/Misc/NEWS.d/next/Build/2019-08-24-17-39-09.bpo-37760.f3jXuH.rst
new file mode 100644
index 0000000..0498173
--- /dev/null
+++ b/Misc/NEWS.d/next/Build/2019-08-24-17-39-09.bpo-37760.f3jXuH.rst
@@ -0,0 +1,6 @@
+The :file:`Tools/unicode/makeunicodedata.py` script, which is used for
+converting information from the Unicode Character Database into generated
+code and data used by the methods of :class:`str` and by the
+:mod:`unicodedata` module, now handles each character's data as a
+``dataclass`` with named attributes, rather than a length-18 list of
+different fields.
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 464a4eb..a8e92be 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -26,13 +26,14 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #

+import dataclasses
 import os
 import sys
 import zipfile

 from functools import partial
 from textwrap import dedent
-from typing import Iterator, List, Tuple
+from typing import Iterator, List, Optional, Set, Tuple

 SCRIPT = sys.argv[0]
 VERSION = "3.3"
@@ -148,12 +149,12 @@ def makeunicodedata(unicode, trace):
         record = unicode.table[char]
         if record:
             # extract database properties
-            category = CATEGORY_NAMES.index(record[2])
-            combining = int(record[3])
-            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
-            mirrored = record[9] == "Y"
-            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
-            normalizationquickcheck = record[17]
+            category = CATEGORY_NAMES.index(record.general_category)
+            combining = int(record.canonical_combining_class)
+            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
+            mirrored = record.bidi_mirrored == "Y"
+            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
+            normalizationquickcheck = record.quick_check
             item = (
                 category, combining, bidirectional, mirrored, eastasianwidth,
                 normalizationquickcheck
@@ -179,8 +180,8 @@ def makeunicodedata(unicode, trace):
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
-            if record[5]:
-                decomp = record[5].split()
+            if record.decomposition_type:
+                decomp = record.decomposition_type.split()
                 if len(decomp) > 19:
                     raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                 # prefix
@@ -200,7 +201,7 @@ def makeunicodedata(unicode, trace):
                 # Collect NFC pairs
                 if not prefix and len(decomp) == 3 and \
                    char not in unicode.exclusions and \
-                   unicode.table[decomp[1]][3] == "0":
+                   unicode.table[decomp[1]].canonical_combining_class == "0":
                     p, l, r = decomp
                     comp_first[l] = 1
                     comp_last[r] = 1
@@ -404,9 +405,9 @@ def makeunicodetype(unicode, trace):
         record = unicode.table[char]
         if record:
             # extract database properties
-            category = record[2]
-            bidirectional = record[4]
-            properties = record[16]
+            category = record.general_category
+            bidirectional = record.bidi_class
+            properties = record.binary_properties
             flags = 0
             if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                 flags |= ALPHA_MASK
@@ -434,16 +435,16 @@ def makeunicodetype(unicode, trace):
                 flags |= CASE_IGNORABLE_MASK
             sc = unicode.special_casing.get(char)
             cf = unicode.case_folding.get(char, [char])
-            if record[12]:
-                upper = int(record[12], 16)
+            if record.simple_uppercase_mapping:
+                upper = int(record.simple_uppercase_mapping, 16)
             else:
                 upper = char
-            if record[13]:
-                lower = int(record[13], 16)
+            if record.simple_lowercase_mapping:
+                lower = int(record.simple_lowercase_mapping, 16)
             else:
                 lower = char
-            if record[14]:
-                title = int(record[14], 16)
+            if record.simple_titlecase_mapping:
+                title = int(record.simple_titlecase_mapping, 16)
             else:
                 title = upper
             if sc is None and cf != [lower]:
@@ -480,16 +481,16 @@ def makeunicodetype(unicode, trace):
                 extra_casing.extend(sc[1])
             # decimal digit, integer digit
             decimal = 0
-            if record[6]:
+            if record.decomposition_mapping:
                 flags |= DECIMAL_MASK
-                decimal = int(record[6])
+                decimal = int(record.decomposition_mapping)
             digit = 0
-            if record[7]:
+            if record.numeric_type:
                 flags |= DIGIT_MASK
-                digit = int(record[7])
-            if record[8]:
+                digit = int(record.numeric_type)
+            if record.numeric_value:
                 flags |= NUMERIC_MASK
-                numeric.setdefault(record[8], []).append(char)
+                numeric.setdefault(record.numeric_value, []).append(char)
             item = (
                 upper, lower, title, decimal, digit, flags
                 )
@@ -609,7 +610,7 @@ def makeunicodename(unicode, trace):
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
-            name = record[1].strip()
+            name = record.name.strip()
             if name and name[0] != "<":
                 names[char] = name + chr(0)

@@ -719,7 +720,7 @@ def makeunicodename(unicode, trace):
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
-            name = record[1].strip()
+            name = record.name.strip()
             if name and name[0] != "<":
                 data.append((name, char))

@@ -819,31 +820,27 @@ def merge_old_version(version, new, old):
             continue
         # check characters that differ
         if old.table[i] != new.table[i]:
-            for k in range(len(old.table[i])):
-                if old.table[i][k] != new.table[i][k]:
-                    value = old.table[i][k]
+            for k, field in enumerate(dataclasses.fields(UcdRecord)):
+                value = getattr(old.table[i], field.name)
+                new_value = getattr(new.table[i], field.name)
+                if value != new_value:
                     if k == 1 and i in PUA_15:
                         # the name is not set in the old.table, but in the
                         # new.table we are using it for aliases and named seq
                         assert value == ''
                     elif k == 2:
-                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                         category_changes[i] = CATEGORY_NAMES.index(value)
                     elif k == 4:
-                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                         bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                     elif k == 5:
-                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                         # We assume that all normalization changes are in 1:1 mappings
                         assert " " not in value
                         normalization_changes.append((i, value))
                     elif k == 6:
-                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                         # we only support changes where the old value is a single digit
                         assert value in "0123456789"
                         decimal_changes[i] = int(value)
                     elif k == 8:
-                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                         # Since 0 encodes "no change", the old value is better not 0
                         if not value:
                             numeric_changes[i] = -1
@@ -952,6 +949,45 @@ class UcdFile:
                 yield char, rest


+@dataclasses.dataclass
+class UcdRecord:
+    # 15 fields from UnicodeData.txt .  See:
+    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
+    codepoint: str
+    name: str
+    general_category: str
+    canonical_combining_class: str
+    bidi_class: str
+    decomposition_type: str
+    decomposition_mapping: str
+    numeric_type: str
+    numeric_value: str
+    bidi_mirrored: str
+    unicode_1_name: str  # obsolete
+    iso_comment: str  # obsolete
+    simple_uppercase_mapping: str
+    simple_lowercase_mapping: str
+    simple_titlecase_mapping: str
+
+    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
+    east_asian_width: Optional[str]
+
+    # Binary properties, as a set of those that are true.
+    # Taken from multiple files:
+    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
+    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
+    binary_properties: Set[str]
+
+    # The Quick_Check properties related to normalization:
+    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
+    # We store them as a bitmask.
+    quick_check: int
+
+
+def from_row(row: List[str]) -> UcdRecord:
+    return UcdRecord(*row, None, set(), 0)
+
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
@@ -959,18 +995,14 @@ class UcdFile:

 # load a unicode-data file from disk

 class UnicodeData:
-    # Record structure:
-    # [ID, name, category, combining, bidi, decomp,  (6)
-    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
-    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
-    #  derived-props] (17)
+    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

     def __init__(self, version, cjk_check=True):
         self.changed = []
         table = [None] * 0x110000
         for s in UcdFile(UNICODE_DATA, version):
             char = int(s[0], 16)
-            table[char] = s
+            table[char] = from_row(s)

         cjk_ranges_found = []
@@ -982,19 +1014,17 @@ class UnicodeData:
             # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
             s = table[i]
             if s:
-                if s[1][-6:] == "First>":
-                    s[1] = ""
-                    field = s
-                elif s[1][-5:] == "Last>":
-                    if s[1].startswith("<CJK"):
+                if s.name[-6:] == "First>":
+                    s.name = ""
+                    field = dataclasses.astuple(s)[:15]
+                elif s.name[-5:] == "Last>":
+                    if s.name.startswith("<CJK"):
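
Aside for readers of this patch: the core pattern of the conversion is easy to
see in isolation. The sketch below is illustrative only and not part of the
commit; the cut-down `Record` class and the sample row are made-up stand-ins
for `UcdRecord` and a real UnicodeData.txt row.

    import dataclasses
    from typing import List

    @dataclasses.dataclass
    class Record:
        # Hypothetical stand-in for UcdRecord: just three of the
        # fifteen UnicodeData.txt fields, in file order.
        codepoint: str
        name: str
        general_category: str

    def from_row(row: List[str]) -> Record:
        # Same splat technique as the patch's from_row(): positional
        # row values become named attributes.
        return Record(*row)

    rec = from_row(["0041", "LATIN CAPITAL LETTER A", "Lu"])
    assert rec.general_category == "Lu"   # was row[2] before the change

The trade-off noted in the commit message follows directly from this pattern:
attribute access on a dataclass is somewhat slower than indexing into a list,
which is acceptable for a build-time maintenance script.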