diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2010-10-11 22:42:28 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2010-10-11 22:42:28 (GMT) |
commit | baecd7243ad2d671fa6f675777dfe7094e8c5c39 (patch) | |
tree | a4e6b946feafacd5c5f1bccd8a30d69fa7dd6922 /Tools | |
parent | e8930228c7482d39faf24f4a805c37ff7ca9223a (diff) | |
download | cpython-baecd7243ad2d671fa6f675777dfe7094e8c5c39.zip cpython-baecd7243ad2d671fa6f675777dfe7094e8c5c39.tar.gz cpython-baecd7243ad2d671fa6f675777dfe7094e8c5c39.tar.bz2 |
Upgrade to Unicode 6.0.0.
makeunicodedata.py: download all data files from unicode.org,
switch to extracting Unihan data from zip file.
Read linebreakprops and derivednormalizationprops even for
old versions, even though they are not used in delta records.
test:unicode.py: U+11000 is now assigned, use U+14000 instead.
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 135 |
1 files changed, 77 insertions, 58 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index b2615ee..0783f17 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -25,17 +25,17 @@ # written by Fredrik Lundh (fredrik@pythonware.com) # -import sys +import sys, os, zipfile SCRIPT = sys.argv[0] VERSION = "3.2" # The Unicode Database -UNIDATA_VERSION = "5.2.0" +UNIDATA_VERSION = "6.0.0" UNICODE_DATA = "UnicodeData%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt" -UNIHAN = "Unihan%s.txt" +UNIHAN = "Unihan%s.zip" DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt" DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" LINE_BREAK = "LineBreak%s.txt" @@ -75,23 +75,13 @@ def maketables(trace=0): print("--- Reading", UNICODE_DATA % "", "...") version = "" - unicode = UnicodeData(UNICODE_DATA % version, - COMPOSITION_EXCLUSIONS % version, - EASTASIAN_WIDTH % version, - UNIHAN % version, - DERIVED_CORE_PROPERTIES % version, - DERIVEDNORMALIZATION_PROPS % version, - LINE_BREAK % version) + unicode = UnicodeData(UNIDATA_VERSION) print(len(list(filter(None, unicode.table))), "characters") for version in old_versions: print("--- Reading", UNICODE_DATA % ("-"+version), "...") - old_unicode = UnicodeData(UNICODE_DATA % ("-"+version), - COMPOSITION_EXCLUSIONS % ("-"+version), - EASTASIAN_WIDTH % ("-"+version), - UNIHAN % ("-"+version), - DERIVED_CORE_PROPERTIES % ("-"+version)) + old_unicode = UnicodeData(version) print(len(list(filter(None, old_unicode.table))), "characters") merge_old_version(version, unicode, old_unicode) @@ -771,6 +761,10 @@ def merge_old_version(version, new, old): elif k == 16: # derived property changes; not yet pass + elif k == 17: + # normalization quickchecks are not performed + # for older versions + pass else: class Difference(Exception):pass raise Difference(hex(i), k, old.table[i], new.table[i]) @@ -779,6 +773,21 @@ def merge_old_version(version, new, old): numeric_changes)), normalization_changes)) +def open_data(template, version): + local = template % ('-'+version,) + if not os.path.exists(local): + import urllib.request + if version == '3.2.0': + # irregular url structure + url = 'http://www.unicode.org/Public/3.2-Update/' + local + else: + url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '') + urllib.request.urlretrieve(url, filename=local) + if local.endswith('.txt'): + return open(local, encoding='utf-8') + else: + # Unihan.zip + return open(local, 'rb') # -------------------------------------------------------------------- # the following support code is taken from the unidb utilities @@ -793,11 +802,11 @@ class UnicodeData: # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16) # derived-props] (17) - def __init__(self, filename, exclusions, eastasianwidth, unihan, - derivedprops, derivednormalizationprops=None, linebreakprops=None, + def __init__(self, version, + linebreakprops=False, expand=1): self.changed = [] - file = open(filename) + file = open_data(UNICODE_DATA, version) table = [None] * 0x110000 while 1: s = file.readline() @@ -825,11 +834,11 @@ class UnicodeData: table[i] = f2 # public attributes - self.filename = filename + self.filename = UNICODE_DATA % '' self.table = table self.chars = list(range(0x110000)) # unicode 3.2 - file = open(exclusions) + file = open_data(COMPOSITION_EXCLUSIONS, version) self.exclusions = {} for s in file: s = s.strip() @@ -841,7 +850,7 @@ class UnicodeData: self.exclusions[char] = 1 widths = [None] * 0x110000 - for s in open(eastasianwidth): + for s in open_data(EASTASIAN_WIDTH, version): s = s.strip() if not s: continue @@ -862,7 +871,7 @@ class UnicodeData: for i in range(0, 0x110000): if table[i] is not None: table[i].append(set()) - for s in open(derivedprops): + for s in open_data(DERIVED_CORE_PROPERTIES, version): s = s.split('#', 1)[0].strip() if not s: continue @@ -881,43 +890,53 @@ class UnicodeData: # apply to unassigned code points; ignore them table[char][-1].add(p) - if linebreakprops: - for s in open(linebreakprops): - s = s.partition('#')[0] - s = [i.strip() for i in s.split(';')] - if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS: - continue - if '..' not in s[0]: - first = last = int(s[0], 16) - else: - first, last = [int(c, 16) for c in s[0].split('..')] - for char in range(first, last+1): - table[char][-1].add('Line_Break') - - if derivednormalizationprops: - quickchecks = [0] * 0x110000 # default is Yes - qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() - for s in open(derivednormalizationprops): - if '#' in s: - s = s[:s.index('#')] - s = [i.strip() for i in s.split(';')] - if len(s) < 2 or s[1] not in qc_order: - continue - quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No - quickcheck_shift = qc_order.index(s[1])*2 - quickcheck <<= quickcheck_shift - if '..' not in s[0]: - first = last = int(s[0], 16) - else: - first, last = [int(c, 16) for c in s[0].split('..')] - for char in range(first, last+1): - assert not (quickchecks[char]>>quickcheck_shift)&3 - quickchecks[char] |= quickcheck - for i in range(0, 0x110000): - if table[i] is not None: - table[i].append(quickchecks[i]) + for s in open_data(LINE_BREAK, version): + s = s.partition('#')[0] + s = [i.strip() for i in s.split(';')] + if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS: + continue + if '..' not in s[0]: + first = last = int(s[0], 16) + else: + first, last = [int(c, 16) for c in s[0].split('..')] + for char in range(first, last+1): + table[char][-1].add('Line_Break') + + # We only want the quickcheck properties + # Format: NF?_QC; Y(es)/N(o)/M(aybe) + # Yes is the default, hence only N and M occur + # In 3.2.0, the format was different (NF?_NO) + # The parsing will incorrectly determine these as + # "yes", however, unicodedata.c will not perform quickchecks + # for older versions, and no delta records will be created. + quickchecks = [0] * 0x110000 + qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() + for s in open_data(DERIVEDNORMALIZATION_PROPS, version): + if '#' in s: + s = s[:s.index('#')] + s = [i.strip() for i in s.split(';')] + if len(s) < 2 or s[1] not in qc_order: + continue + quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No + quickcheck_shift = qc_order.index(s[1])*2 + quickcheck <<= quickcheck_shift + if '..' not in s[0]: + first = last = int(s[0], 16) + else: + first, last = [int(c, 16) for c in s[0].split('..')] + for char in range(first, last+1): + assert not (quickchecks[char]>>quickcheck_shift)&3 + quickchecks[char] |= quickcheck + for i in range(0, 0x110000): + if table[i] is not None: + table[i].append(quickchecks[i]) - for line in open(unihan, encoding='utf-8'): + zip = zipfile.ZipFile(open_data(UNIHAN, version)) + if version == '3.2.0': + data = zip.open('Unihan-3.2.0.txt').read() + else: + data = zip.open('Unihan_NumericValues.txt').read() + for line in data.decode("utf-8").splitlines(): if not line.startswith('U+'): continue code, tag, value = line.split(None, 3)[:3] |