Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r--  Tools/unicode/makeunicodedata.py  231
1 file changed, 183 insertions(+), 48 deletions(-)
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index c35170c..d503190 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -25,18 +25,20 @@
# written by Fredrik Lundh (fredrik@pythonware.com)
#
-import sys
+import sys, os, zipfile
SCRIPT = sys.argv[0]
-VERSION = "2.6"
+VERSION = "3.2"
# The Unicode Database
-UNIDATA_VERSION = "5.1.0"
+UNIDATA_VERSION = "6.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
+LINE_BREAK = "LineBreak%s.txt"
old_versions = ["3.2.0"]
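
The "%s" slot in each filename template takes either an empty string (the current data files) or a dash-prefixed version string for the old_versions entries; the open_data() helper added later in this patch relies on that when it builds local file names. A minimal sketch of the expansion:

    >>> "UnicodeData%s.txt" % ""
    'UnicodeData.txt'
    >>> "UnicodeData%s.txt" % ("-" + "3.2.0")
    'UnicodeData-3.2.0.txt'
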
@@ -51,6 +53,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
+MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
+
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
@@ -64,26 +68,29 @@ XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
+NUMERIC_MASK = 0x1000
+
+# these ranges need to match unicodedata.c:is_unified_ideograph
+cjk_ranges = [
+ ('3400', '4DB5'),
+ ('4E00', '9FCB'),
+ ('20000', '2A6D6'),
+ ('2A700', '2B734'),
+ ('2B740', '2B81D')
+]
def maketables(trace=0):
print("--- Reading", UNICODE_DATA % "", "...")
version = ""
- unicode = UnicodeData(UNICODE_DATA % version,
- COMPOSITION_EXCLUSIONS % version,
- EASTASIAN_WIDTH % version,
- DERIVED_CORE_PROPERTIES % version,
- DERIVEDNORMALIZATION_PROPS % version)
+ unicode = UnicodeData(UNIDATA_VERSION)
print(len(list(filter(None, unicode.table))), "characters")
for version in old_versions:
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
- old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
- COMPOSITION_EXCLUSIONS % ("-"+version),
- EASTASIAN_WIDTH % ("-"+version),
- DERIVED_CORE_PROPERTIES % ("-"+version))
+ old_unicode = UnicodeData(version, cjk_check=False)
print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode)
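
Per the comment in the hunk above, cjk_ranges must stay in sync with is_unified_ideograph() in unicodedata.c. A rough Python rendering of that membership test, assuming only that the C side checks the same inclusive ranges (a sketch, not the actual C source):

    cjk_ranges = [('3400', '4DB5'), ('4E00', '9FCB'), ('20000', '2A6D6'),
                  ('2A700', '2B734'), ('2B740', '2B81D')]

    def is_unified_ideograph(code):
        # inclusive range check against the table from the patch
        return any(int(first, 16) <= code <= int(last, 16)
                   for first, last in cjk_ranges)

    # e.g. is_unified_ideograph(0x4E00) -> True, is_unified_ideograph(0x41) -> False
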
@@ -357,6 +364,9 @@ def makeunicodetype(unicode, trace):
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
+ numeric = {}
+ spaces = []
+ linebreaks = []
for char in unicode.chars:
record = unicode.table[char]
@@ -371,10 +381,12 @@ def makeunicodetype(unicode, trace):
flags |= ALPHA_MASK
if category == "Ll":
flags |= LOWER_MASK
- if category == "Zl" or bidirectional == "B":
+ if 'Line_Break' in properties or bidirectional == "B":
flags |= LINEBREAK_MASK
+ linebreaks.append(char)
if category == "Zs" or bidirectional in ("WS", "B", "S"):
flags |= SPACE_MASK
+ spaces.append(char)
if category == "Lt":
flags |= TITLE_MASK
if category == "Lu":
@@ -423,6 +435,9 @@ def makeunicodetype(unicode, trace):
if record[7]:
flags |= DIGIT_MASK
digit = int(record[7])
+ if record[8]:
+ flags |= NUMERIC_MASK
+ numeric.setdefault(record[8], []).append(char)
item = (
upper, lower, title, decimal, digit, flags
)
@@ -434,6 +449,9 @@ def makeunicodetype(unicode, trace):
index[char] = i
print(len(table), "unique character type entries")
+ print(sum(map(len, numeric.values())), "numeric code points")
+ print(len(spaces), "whitespace code points")
+ print(len(linebreaks), "linebreak code points")
print("--- Writing", FILE, "...")
@@ -455,6 +473,63 @@ def makeunicodetype(unicode, trace):
Array("index1", index1).dump(fp, trace)
Array("index2", index2).dump(fp, trace)
+ # Generate code for _PyUnicode_ToNumeric()
+ numeric_items = sorted(numeric.items())
+ print('/* Returns the numeric value as double for Unicode characters', file=fp)
+ print(' * having this property, -1.0 otherwise.', file=fp)
+ print(' */', file=fp)
+ print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
+ print('{', file=fp)
+ print(' switch (ch) {', file=fp)
+ for value, codepoints in numeric_items:
+ # Turn text into float literals
+ parts = value.split('/')
+ parts = [repr(float(part)) for part in parts]
+ value = '/'.join(parts)
+
+ codepoints.sort()
+ for codepoint in codepoints:
+ print(' case 0x%04X:' % (codepoint,), file=fp)
+ print(' return (double) %s;' % (value,), file=fp)
+ print(' }', file=fp)
+ print(' return -1.0;', file=fp)
+ print('}', file=fp)
+ print(file=fp)
+
+ # Generate code for _PyUnicode_IsWhitespace()
+ print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
+ print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
+ print(" */", file=fp)
+ print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
+ print('{', file=fp)
+ print(' switch (ch) {', file=fp)
+
+ for codepoint in sorted(spaces):
+ print(' case 0x%04X:' % (codepoint,), file=fp)
+ print(' return 1;', file=fp)
+
+ print(' }', file=fp)
+ print(' return 0;', file=fp)
+ print('}', file=fp)
+ print(file=fp)
+
+ # Generate code for _PyUnicode_IsLinebreak()
+ print("/* Returns 1 for Unicode characters having the line break", file=fp)
+ print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
+ print(" * type 'B', 0 otherwise.", file=fp)
+ print(" */", file=fp)
+ print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
+ print('{', file=fp)
+ print(' switch (ch) {', file=fp)
+ for codepoint in sorted(linebreaks):
+ print(' case 0x%04X:' % (codepoint,), file=fp)
+ print(' return 1;', file=fp)
+
+ print(' }', file=fp)
+ print(' return 0;', file=fp)
+ print('}', file=fp)
+ print(file=fp)
+
fp.close()
# --------------------------------------------------------------------
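
Keeping each numeric value as text and converting its '/'-separated parts with repr(float(...)) is what lets fractions survive into C: the emitted literal divides two doubles instead of two ints. The conversion step from the loop above, in isolation:

    value = '1/4'
    parts = [repr(float(part)) for part in value.split('/')]
    assert '/'.join(parts) == '1.0/4.0'
    # emitted as:  case 0x00BC: ... return (double) 1.0/4.0;
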
@@ -670,12 +745,11 @@ def merge_old_version(version, new, old):
elif k == 8:
# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
# Since 0 encodes "no change" and -1 "no value", the old value must be neither
- assert value != "0" and value != "-1"
if not value:
numeric_changes[i] = -1
else:
- assert re.match("^[0-9]+$", value)
- numeric_changes[i] = int(value)
+ numeric_changes[i] = float(value)
+ assert numeric_changes[i] not in (0, -1)
elif k == 9:
if value == 'Y':
mirrored_changes[i] = '1'
@@ -696,6 +770,10 @@ def merge_old_version(version, new, old):
elif k == 16:
# derived property changes; not yet
pass
+ elif k == 17:
+ # normalization quickchecks are not performed
+ # for older versions
+ pass
else:
class Difference(Exception):pass
raise Difference(hex(i), k, old.table[i], new.table[i])
@@ -704,6 +782,21 @@ def merge_old_version(version, new, old):
numeric_changes)),
normalization_changes))
+def open_data(template, version):
+ local = template % ('-'+version,)
+ if not os.path.exists(local):
+ import urllib.request
+ if version == '3.2.0':
+ # irregular url structure
+ url = 'http://www.unicode.org/Public/3.2-Update/' + local
+ else:
+ url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
+ urllib.request.urlretrieve(url, filename=local)
+ if local.endswith('.txt'):
+ return open(local, encoding='utf-8')
+ else:
+ # Unihan.zip
+ return open(local, 'rb')
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
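
open_data() above acts as a small download cache: a data file is fetched from unicode.org only when it is not already present locally, then reopened as UTF-8 text, or as binary for the Unihan zip. A usage sketch (the file name is what the template expands to for this version; the sample line is the first record of UnicodeData.txt):

    f = open_data(UNICODE_DATA, "6.0.0")  # downloads UnicodeData-6.0.0.txt if missing
    f.readline()   # '0000;<control>;Cc;0;BN;;;;;N;NULL;;;;\n'
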
@@ -711,8 +804,6 @@ def merge_old_version(version, new, old):
# load a unicode-data file from disk
-import sys
-
class UnicodeData:
# Record structure:
# [ID, name, category, combining, bidi, decomp, (6)
@@ -720,10 +811,12 @@ class UnicodeData:
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
# derived-props] (17)
- def __init__(self, filename, exclusions, eastasianwidth,
- derivedprops, derivednormalizationprops=None, expand=1):
+ def __init__(self, version,
+ linebreakprops=False,
+ expand=1,
+ cjk_check=True):
self.changed = []
- file = open(filename)
+ file = open_data(UNICODE_DATA, version)
table = [None] * 0x110000
while 1:
s = file.readline()
@@ -733,6 +826,8 @@ class UnicodeData:
char = int(s[0], 16)
table[char] = s
+ cjk_ranges_found = []
+
# expand first-last ranges
if expand:
field = None
@@ -743,19 +838,24 @@ class UnicodeData:
s[1] = ""
field = s
elif s[1][-5:] == "Last>":
+ if s[1].startswith("<CJK Ideograph"):
+ cjk_ranges_found.append((field[0],
+ s[0]))
s[1] = ""
field = None
elif field:
f2 = field[:]
f2[0] = "%X" % i
table[i] = f2
+ if cjk_check and cjk_ranges != cjk_ranges_found:
+ raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
# public attributes
- self.filename = filename
+ self.filename = UNICODE_DATA % ''
self.table = table
self.chars = list(range(0x110000)) # unicode 3.2
- file = open(exclusions)
+ file = open_data(COMPOSITION_EXCLUSIONS, version)
self.exclusions = {}
for s in file:
s = s.strip()
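
UnicodeData.txt encodes large blocks as a <..., First>/<..., Last> record pair rather than one line per code point; the expansion loop above clones the First record across the whole range and now also collects the CJK ranges it encounters for the consistency check. A sketch of what such an opener looks like to the parser (line content as in the published 6.0.0 data):

    line = "4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;"
    fields = line.split(";")
    # fields[0] == "4E00"; fields[1] ending in "First>" marks a range opener
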
@@ -767,7 +867,7 @@ class UnicodeData:
self.exclusions[char] = 1
widths = [None] * 0x110000
- for s in open(eastasianwidth):
+ for s in open_data(EASTASIAN_WIDTH, version):
s = s.strip()
if not s:
continue
@@ -788,7 +888,7 @@ class UnicodeData:
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(set())
- for s in open(derivedprops):
+ for s in open_data(DERIVED_CORE_PROPERTIES, version):
s = s.split('#', 1)[0].strip()
if not s:
continue
@@ -807,28 +907,64 @@ class UnicodeData:
# apply to unassigned code points; ignore them
table[char][-1].add(p)
- if derivednormalizationprops:
- quickchecks = [0] * 0x110000 # default is Yes
- qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
- for s in open(derivednormalizationprops):
- if '#' in s:
- s = s[:s.index('#')]
- s = [i.strip() for i in s.split(';')]
- if len(s) < 2 or s[1] not in qc_order:
- continue
- quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
- quickcheck_shift = qc_order.index(s[1])*2
- quickcheck <<= quickcheck_shift
- if '..' not in s[0]:
- first = last = int(s[0], 16)
- else:
- first, last = [int(c, 16) for c in s[0].split('..')]
- for char in range(first, last+1):
- assert not (quickchecks[char]>>quickcheck_shift)&3
- quickchecks[char] |= quickcheck
- for i in range(0, 0x110000):
- if table[i] is not None:
- table[i].append(quickchecks[i])
+ for s in open_data(LINE_BREAK, version):
+ s = s.partition('#')[0]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+ continue
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ table[char][-1].add('Line_Break')
+
+ # We only want the quickcheck properties
+ # Format: NF?_QC; Y(es)/N(o)/M(aybe)
+ # Yes is the default, hence only N and M occur
+ # In 3.2.0, the format was different (NF?_NO)
+ # The parsing will incorrectly determine these as
+    # "yes"; however, unicodedata.c will not perform quickchecks
+ # for older versions, and no delta records will be created.
+ quickchecks = [0] * 0x110000
+ qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+ for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
+ if '#' in s:
+ s = s[:s.index('#')]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in qc_order:
+ continue
+ quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+ quickcheck_shift = qc_order.index(s[1])*2
+ quickcheck <<= quickcheck_shift
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ assert not (quickchecks[char]>>quickcheck_shift)&3
+ quickchecks[char] |= quickcheck
+ for i in range(0, 0x110000):
+ if table[i] is not None:
+ table[i].append(quickchecks[i])
+
+ zip = zipfile.ZipFile(open_data(UNIHAN, version))
+ if version == '3.2.0':
+ data = zip.open('Unihan-3.2.0.txt').read()
+ else:
+ data = zip.open('Unihan_NumericValues.txt').read()
+ for line in data.decode("utf-8").splitlines():
+ if not line.startswith('U+'):
+ continue
+ code, tag, value = line.split(None, 3)[:3]
+ if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
+ 'kOtherNumeric'):
+ continue
+ value = value.strip().replace(',', '')
+ i = int(code[2:], 16)
+ # Patch the numeric field
+ if table[i] is not None:
+ table[i][8] = value
def uselatin1(self):
# restrict character range to ISO Latin 1
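
Each Unihan_NumericValues.txt record parsed above is a tab-separated code point/tag/value triple; only the three numeric tags are kept, commas in large values are stripped, and the value string is patched into field 8 so the regular numeric machinery picks it up. A sketch of the per-line parse under those assumptions (U+4E09 is the ideograph for "three"):

    line = "U+4E09\tkPrimaryNumeric\t3"
    code, tag, value = line.split(None, 3)[:3]
    i = int(code[2:], 16)      # 0x4E09
    # table[i][8] would become '3'
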
@@ -979,7 +1115,6 @@ def splitbins(t, trace=0):
you'll get.
"""
- import sys
if trace:
def dump(t1, t2, shift, bytes):
print("%d+%d bins at shift %d; %d bytes" % (