diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2009-04-27 22:31:40 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2009-04-27 22:31:40 (GMT) |
commit | 7a0fedfd1d387424c9e19059a126939d5f7bdea2 (patch) | |
tree | 0a180a7793e5b590a846869d01379a41e3078565 /Tools/unicode/makeunicodedata.py | |
parent | 57f3d93552edf5f4f5d5e8fad5aff9f72788bc7a (diff) | |
download | cpython-7a0fedfd1d387424c9e19059a126939d5f7bdea2.zip cpython-7a0fedfd1d387424c9e19059a126939d5f7bdea2.tar.gz cpython-7a0fedfd1d387424c9e19059a126939d5f7bdea2.tar.bz2 |
Merged revisions 72054 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r72054 | antoine.pitrou | 2009-04-27 23:53:26 +0200 (lun., 27 avril 2009) | 5 lines
Issue #1734234: Massively speedup `unicodedata.normalize()` when the
string is already in normalized form, by performing a quick check beforehand.
Original patch by Rauli Ruohonen.
........
Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 37 |
1 files changed, 32 insertions, 5 deletions
```diff
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 930a0df..52cb365 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -36,6 +36,7 @@ UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
+DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 
 old_versions = ["3.2.0"]
 
@@ -72,7 +73,8 @@ def maketables(trace=0):
     unicode = UnicodeData(UNICODE_DATA % version,
                           COMPOSITION_EXCLUSIONS % version,
                           EASTASIAN_WIDTH % version,
-                          DERIVED_CORE_PROPERTIES % version)
+                          DERIVED_CORE_PROPERTIES % version,
+                          DERIVEDNORMALIZATION_PROPS % version)
 
     print(len(list(filter(None, unicode.table))), "characters")
 
@@ -94,7 +96,7 @@ def maketables(trace=0):
 
 def makeunicodedata(unicode, trace):
 
-    dummy = (0, 0, 0, 0, 0)
+    dummy = (0, 0, 0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
@@ -114,8 +116,10 @@ def makeunicodedata(unicode, trace):
             bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
             mirrored = record[9] == "Y"
             eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
+            normalizationquickcheck = record[17]
             item = (
-                category, combining, bidirectional, mirrored, eastasianwidth
+                category, combining, bidirectional, mirrored, eastasianwidth,
+                normalizationquickcheck
                 )
             # add entry to index and item tables
             i = cache.get(item)
@@ -227,7 +231,7 @@ def makeunicodedata(unicode, trace):
     print("/* a list of unique database records */", file=fp)
     print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
     for item in table:
-        print(" {%d, %d, %d, %d, %d}," % item, file=fp)
+        print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
     print("};", file=fp)
     print(file=fp)
 
@@ -717,7 +721,7 @@ class UnicodeData:
     # derived-props] (17)
 
     def __init__(self, filename, exclusions, eastasianwidth,
-                 derivedprops, expand=1):
+                 derivedprops, derivednormalizationprops=None, expand=1):
         self.changed = []
         file = open(filename)
         table = [None] * 0x110000
@@ -803,6 +807,29 @@ class UnicodeData:
                     # apply to unassigned code points; ignore them
                     table[char][-1].add(p)
 
+        if derivednormalizationprops:
+            quickchecks = [0] * 0x110000 # default is Yes
+            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+            for s in open(derivednormalizationprops):
+                if '#' in s:
+                    s = s[:s.index('#')]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in qc_order:
+                    continue
+                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+                quickcheck_shift = qc_order.index(s[1])*2
+                quickcheck <<= quickcheck_shift
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    assert not (quickchecks[char]>>quickcheck_shift)&3
+                    quickchecks[char] |= quickcheck
+            for i in range(0, 0x110000):
+                if table[i] is not None:
+                    table[i].append(quickchecks[i])
+
     def uselatin1(self):
         # restrict character range to ISO Latin 1
         self.chars = list(range(256))
```