summary | refs | log | tree | commit | diff | stats
path: root/Tools/unicode/makeunicodedata.py
diff options
context:
space:
mode:
author: Antoine Pitrou <solipsis@pitrou.net> 2009-04-27 22:31:40 (GMT)
committer: Antoine Pitrou <solipsis@pitrou.net> 2009-04-27 22:31:40 (GMT)
commit: 7a0fedfd1d387424c9e19059a126939d5f7bdea2 (patch)
tree: 0a180a7793e5b590a846869d01379a41e3078565 /Tools/unicode/makeunicodedata.py
parent: 57f3d93552edf5f4f5d5e8fad5aff9f72788bc7a (diff)
download: cpython-7a0fedfd1d387424c9e19059a126939d5f7bdea2.zip
cpython-7a0fedfd1d387424c9e19059a126939d5f7bdea2.tar.gz
cpython-7a0fedfd1d387424c9e19059a126939d5f7bdea2.tar.bz2
Merged revisions 72054 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
r72054 | antoine.pitrou | 2009-04-27 23:53:26 +0200 (lun., 27 avril 2009) | 5 lines

Issue #1734234: Massively speedup `unicodedata.normalize()` when the
string is already in normalized form, by performing a quick check beforehand.
Original patch by Rauli Ruohonen.
........
Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r-- Tools/unicode/makeunicodedata.py | 37
1 files changed, 32 insertions, 5 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 930a0df..52cb365 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -36,6 +36,7 @@ UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
+DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
old_versions = ["3.2.0"]
@@ -72,7 +73,8 @@ def maketables(trace=0):
unicode = UnicodeData(UNICODE_DATA % version,
COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version,
- DERIVED_CORE_PROPERTIES % version)
+ DERIVED_CORE_PROPERTIES % version,
+ DERIVEDNORMALIZATION_PROPS % version)
print(len(list(filter(None, unicode.table))), "characters")
@@ -94,7 +96,7 @@ def maketables(trace=0):
def makeunicodedata(unicode, trace):
- dummy = (0, 0, 0, 0, 0)
+ dummy = (0, 0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
@@ -114,8 +116,10 @@ def makeunicodedata(unicode, trace):
bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
mirrored = record[9] == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
+ normalizationquickcheck = record[17]
item = (
- category, combining, bidirectional, mirrored, eastasianwidth
+ category, combining, bidirectional, mirrored, eastasianwidth,
+ normalizationquickcheck
)
# add entry to index and item tables
i = cache.get(item)
@@ -227,7 +231,7 @@ def makeunicodedata(unicode, trace):
print("/* a list of unique database records */", file=fp)
print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
for item in table:
- print(" {%d, %d, %d, %d, %d}," % item, file=fp)
+ print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
print("};", file=fp)
print(file=fp)
@@ -717,7 +721,7 @@ class UnicodeData:
# derived-props] (17)
def __init__(self, filename, exclusions, eastasianwidth,
- derivedprops, expand=1):
+ derivedprops, derivednormalizationprops=None, expand=1):
self.changed = []
file = open(filename)
table = [None] * 0x110000
@@ -803,6 +807,29 @@ class UnicodeData:
# apply to unassigned code points; ignore them
table[char][-1].add(p)
+ if derivednormalizationprops:
+ quickchecks = [0] * 0x110000 # default is Yes
+ qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+ for s in open(derivednormalizationprops):
+ if '#' in s:
+ s = s[:s.index('#')]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in qc_order:
+ continue
+ quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+ quickcheck_shift = qc_order.index(s[1])*2
+ quickcheck <<= quickcheck_shift
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ assert not (quickchecks[char]>>quickcheck_shift)&3
+ quickchecks[char] |= quickcheck
+ for i in range(0, 0x110000):
+ if table[i] is not None:
+ table[i].append(quickchecks[i])
+
def uselatin1(self):
# restrict character range to ISO Latin 1
self.chars = list(range(256))