Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r--  Tools/unicode/makeunicodedata.py  1187
1 file changed, 481 insertions, 706 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index a8e92be..3f5ad51 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -1,8 +1,9 @@
#
# (re)generate unicode property and type databases
#
-# This script converts Unicode database files to Modules/unicodedata_db.h,
-# Modules/unicodename_db.h, and Objects/unicodetype_db.h
+# this script converts a unicode 3.2 database file to
+# Modules/unicodedata_db.h, Modules/unicodename_db.h,
+# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
@@ -19,52 +20,23 @@
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
-# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
-# 2011-10-21 ezio add support for name aliases and named sequences
-# 2012-01 benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
-import dataclasses
-import os
import sys
-import zipfile
-
-from functools import partial
-from textwrap import dedent
-from typing import Iterator, List, Optional, Set, Tuple
SCRIPT = sys.argv[0]
-VERSION = "3.3"
+VERSION = "2.6"
# The Unicode Database
-# --------------------
-# When changing UCD version please update
-# * Doc/library/stdtypes.rst, and
-# * Doc/library/unicodedata.rst
-# * Doc/reference/lexical_analysis.rst (two occurrences)
-UNIDATA_VERSION = "12.1.0"
+UNIDATA_VERSION = "5.2.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
-UNIHAN = "Unihan%s.zip"
-DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
+UNIHAN = "Unihan%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
-NAME_ALIASES = "NameAliases%s.txt"
-NAMED_SEQUENCES = "NamedSequences%s.txt"
-SPECIAL_CASING = "SpecialCasing%s.txt"
-CASE_FOLDING = "CaseFolding%s.txt"
-
-# Private Use Areas -- in plane 0 (BMP) and planes 15, 16
-PUA_1 = range(0xE000, 0xF900)
-PUA_15 = range(0xF0000, 0xFFFFE)
-PUA_16 = range(0x100000, 0x10FFFE)
-
-# we use these ranges of PUA_15 to store name aliases and named sequences
-NAME_ALIASES_START = 0xF0000
-NAMED_SEQUENCES_START = 0xF0200
old_versions = ["3.2.0"]
@@ -75,7 +47,7 @@ CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
- "ON", "LRI", "RLI", "FSI", "PDI" ]
+ "ON" ]
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
@@ -90,45 +62,36 @@ LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
-XID_START_MASK = 0x100
-XID_CONTINUE_MASK = 0x200
-PRINTABLE_MASK = 0x400
-NUMERIC_MASK = 0x800
-CASE_IGNORABLE_MASK = 0x1000
-CASED_MASK = 0x2000
-EXTENDED_CASE_MASK = 0x4000
-
-# these ranges need to match unicodedata.c:is_unified_ideograph
-cjk_ranges = [
- ('3400', '4DB5'),
- ('4E00', '9FEF'),
- ('20000', '2A6D6'),
- ('2A700', '2B734'),
- ('2B740', '2B81D'),
- ('2B820', '2CEA1'),
- ('2CEB0', '2EBE0'),
-]
-
+NODELTA_MASK = 0x100
+NUMERIC_MASK = 0x200
def maketables(trace=0):
- print("--- Reading", UNICODE_DATA % "", "...")
+ print "--- Reading", UNICODE_DATA % "", "..."
- unicode = UnicodeData(UNIDATA_VERSION)
+ version = ""
+ unicode = UnicodeData(UNICODE_DATA % version,
+ COMPOSITION_EXCLUSIONS % version,
+ EASTASIAN_WIDTH % version,
+ UNIHAN % version,
+ DERIVEDNORMALIZATION_PROPS % version,
+ LINE_BREAK % version)
- print(len(list(filter(None, unicode.table))), "characters")
+ print len(filter(None, unicode.table)), "characters"
for version in old_versions:
- print("--- Reading", UNICODE_DATA % ("-"+version), "...")
- old_unicode = UnicodeData(version, cjk_check=False)
- print(len(list(filter(None, old_unicode.table))), "characters")
+ print "--- Reading", UNICODE_DATA % ("-"+version), "..."
+ old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
+ COMPOSITION_EXCLUSIONS % ("-"+version),
+ EASTASIAN_WIDTH % ("-"+version),
+ UNIHAN % ("-"+version))
+ print len(filter(None, old_unicode.table)), "characters"
merge_old_version(version, unicode, old_unicode)
makeunicodename(unicode, trace)
makeunicodedata(unicode, trace)
makeunicodetype(unicode, trace)
-
# --------------------------------------------------------------------
# unicode character properties
@@ -141,7 +104,7 @@ def makeunicodedata(unicode, trace):
FILE = "Modules/unicodedata_db.h"
- print("--- Preparing", FILE, "...")
+ print "--- Preparing", FILE, "..."
# 1) database properties
@@ -149,12 +112,12 @@ def makeunicodedata(unicode, trace):
record = unicode.table[char]
if record:
# extract database properties
- category = CATEGORY_NAMES.index(record.general_category)
- combining = int(record.canonical_combining_class)
- bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
- mirrored = record.bidi_mirrored == "Y"
- eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
- normalizationquickcheck = record.quick_check
+ category = CATEGORY_NAMES.index(record[2])
+ combining = int(record[3])
+ bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
+ mirrored = record[9] == "Y"
+ eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
+ normalizationquickcheck = record[17]
item = (
category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck
@@ -180,10 +143,10 @@ def makeunicodedata(unicode, trace):
for char in unicode.chars:
record = unicode.table[char]
if record:
- if record.decomposition_type:
- decomp = record.decomposition_type.split()
+ if record[5]:
+ decomp = record[5].split()
if len(decomp) > 19:
- raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
+ raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
# prefix
if decomp[0][0] == "<":
prefix = decomp.pop(0)
@@ -201,7 +164,7 @@ def makeunicodedata(unicode, trace):
# Collect NFC pairs
if not prefix and len(decomp) == 3 and \
char not in unicode.exclusions and \
- unicode.table[decomp[1]].canonical_combining_class == "0":
+ unicode.table[decomp[1]][3] == "0":
p, l, r = decomp
comp_first[l] = 1
comp_last[r] = 1
@@ -252,135 +215,135 @@ def makeunicodedata(unicode, trace):
l = comp_last[l]
comp_data[f*total_last+l] = char
- print(len(table), "unique properties")
- print(len(decomp_prefix), "unique decomposition prefixes")
- print(len(decomp_data), "unique decomposition entries:", end=' ')
- print(decomp_size, "bytes")
- print(total_first, "first characters in NFC")
- print(total_last, "last characters in NFC")
- print(len(comp_pairs), "NFC pairs")
-
- print("--- Writing", FILE, "...")
-
- with open(FILE, "w") as fp:
- fprint = partial(print, file=fp)
-
- fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
- fprint()
- fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
- fprint("/* a list of unique database records */")
- fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
- for item in table:
- fprint(" {%d, %d, %d, %d, %d, %d}," % item)
- fprint("};")
- fprint()
-
- fprint("/* Reindexing of NFC first characters. */")
- fprint("#define TOTAL_FIRST",total_first)
- fprint("#define TOTAL_LAST",total_last)
- fprint("struct reindex{int start;short count,index;};")
- fprint("static struct reindex nfc_first[] = {")
- for start,end in comp_first_ranges:
- fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start]))
- fprint(" {0,0,0}")
- fprint("};\n")
- fprint("static struct reindex nfc_last[] = {")
- for start,end in comp_last_ranges:
- fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start]))
- fprint(" {0,0,0}")
- fprint("};\n")
-
- # FIXME: <fl> the following tables could be made static, and
- # the support code moved into unicodedatabase.c
-
- fprint("/* string literals */")
- fprint("const char *_PyUnicode_CategoryNames[] = {")
- for name in CATEGORY_NAMES:
- fprint(" \"%s\"," % name)
- fprint(" NULL")
- fprint("};")
-
- fprint("const char *_PyUnicode_BidirectionalNames[] = {")
- for name in BIDIRECTIONAL_NAMES:
- fprint(" \"%s\"," % name)
- fprint(" NULL")
- fprint("};")
-
- fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
- for name in EASTASIANWIDTH_NAMES:
- fprint(" \"%s\"," % name)
- fprint(" NULL")
- fprint("};")
-
- fprint("static const char *decomp_prefix[] = {")
- for name in decomp_prefix:
- fprint(" \"%s\"," % name)
- fprint(" NULL")
- fprint("};")
-
- # split record index table
+ print len(table), "unique properties"
+ print len(decomp_prefix), "unique decomposition prefixes"
+ print len(decomp_data), "unique decomposition entries:",
+ print decomp_size, "bytes"
+ print total_first, "first characters in NFC"
+ print total_last, "last characters in NFC"
+ print len(comp_pairs), "NFC pairs"
+
+ print "--- Writing", FILE, "..."
+
+ fp = open(FILE, "w")
+ print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+ print >>fp
+ print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
+ print >>fp, "/* a list of unique database records */"
+ print >>fp, \
+ "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
+ for item in table:
+ print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
+ print >>fp, "};"
+ print >>fp
+
+ print >>fp, "/* Reindexing of NFC first characters. */"
+ print >>fp, "#define TOTAL_FIRST",total_first
+ print >>fp, "#define TOTAL_LAST",total_last
+ print >>fp, "struct reindex{int start;short count,index;};"
+ print >>fp, "static struct reindex nfc_first[] = {"
+ for start,end in comp_first_ranges:
+ print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
+ print >>fp," {0,0,0}"
+ print >>fp,"};\n"
+ print >>fp, "static struct reindex nfc_last[] = {"
+ for start,end in comp_last_ranges:
+ print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
+ print >>fp," {0,0,0}"
+ print >>fp,"};\n"
+
+ # FIXME: <fl> the following tables could be made static, and
+ # the support code moved into unicodedatabase.c
+
+ print >>fp, "/* string literals */"
+ print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
+ for name in CATEGORY_NAMES:
+ print >>fp, " \"%s\"," % name
+ print >>fp, " NULL"
+ print >>fp, "};"
+
+ print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
+ for name in BIDIRECTIONAL_NAMES:
+ print >>fp, " \"%s\"," % name
+ print >>fp, " NULL"
+ print >>fp, "};"
+
+ print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
+ for name in EASTASIANWIDTH_NAMES:
+ print >>fp, " \"%s\"," % name
+ print >>fp, " NULL"
+ print >>fp, "};"
+
+ print >>fp, "static const char *decomp_prefix[] = {"
+ for name in decomp_prefix:
+ print >>fp, " \"%s\"," % name
+ print >>fp, " NULL"
+ print >>fp, "};"
+
+ # split record index table
+ index1, index2, shift = splitbins(index, trace)
+
+ print >>fp, "/* index tables for the database records */"
+ print >>fp, "#define SHIFT", shift
+ Array("index1", index1).dump(fp, trace)
+ Array("index2", index2).dump(fp, trace)
+
+ # split decomposition index table
+ index1, index2, shift = splitbins(decomp_index, trace)
+
+ print >>fp, "/* decomposition data */"
+ Array("decomp_data", decomp_data).dump(fp, trace)
+
+ print >>fp, "/* index tables for the decomposition data */"
+ print >>fp, "#define DECOMP_SHIFT", shift
+ Array("decomp_index1", index1).dump(fp, trace)
+ Array("decomp_index2", index2).dump(fp, trace)
+
+ index, index2, shift = splitbins(comp_data, trace)
+ print >>fp, "/* NFC pairs */"
+ print >>fp, "#define COMP_SHIFT", shift
+ Array("comp_index", index).dump(fp, trace)
+ Array("comp_data", index2).dump(fp, trace)
+
+ # Generate delta tables for old versions
+ for version, table, normalization in unicode.changed:
+ cversion = version.replace(".","_")
+ records = [table[0]]
+ cache = {table[0]:0}
+ index = [0] * len(table)
+ for i, record in enumerate(table):
+ try:
+ index[i] = cache[record]
+ except KeyError:
+ index[i] = cache[record] = len(records)
+ records.append(record)
index1, index2, shift = splitbins(index, trace)
-
- fprint("/* index tables for the database records */")
- fprint("#define SHIFT", shift)
- Array("index1", index1).dump(fp, trace)
- Array("index2", index2).dump(fp, trace)
-
- # split decomposition index table
- index1, index2, shift = splitbins(decomp_index, trace)
-
- fprint("/* decomposition data */")
- Array("decomp_data", decomp_data).dump(fp, trace)
-
- fprint("/* index tables for the decomposition data */")
- fprint("#define DECOMP_SHIFT", shift)
- Array("decomp_index1", index1).dump(fp, trace)
- Array("decomp_index2", index2).dump(fp, trace)
-
- index, index2, shift = splitbins(comp_data, trace)
- fprint("/* NFC pairs */")
- fprint("#define COMP_SHIFT", shift)
- Array("comp_index", index).dump(fp, trace)
- Array("comp_data", index2).dump(fp, trace)
-
- # Generate delta tables for old versions
- for version, table, normalization in unicode.changed:
- cversion = version.replace(".","_")
- records = [table[0]]
- cache = {table[0]:0}
- index = [0] * len(table)
- for i, record in enumerate(table):
- try:
- index[i] = cache[record]
- except KeyError:
- index[i] = cache[record] = len(records)
- records.append(record)
- index1, index2, shift = splitbins(index, trace)
- fprint("static const change_record change_records_%s[] = {" % cversion)
- for record in records:
- fprint(" { %s }," % ", ".join(map(str,record)))
- fprint("};")
- Array("changes_%s_index" % cversion, index1).dump(fp, trace)
- Array("changes_%s_data" % cversion, index2).dump(fp, trace)
- fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
- fprint("{")
- fprint(" int index;")
- fprint(" if (n >= 0x110000) index = 0;")
- fprint(" else {")
- fprint(" index = changes_%s_index[n>>%d];" % (cversion, shift))
- fprint(" index = changes_%s_data[(index<<%d)+(n & %d)];" % \
- (cversion, shift, ((1<<shift)-1)))
- fprint(" }")
- fprint(" return change_records_%s+index;" % cversion)
- fprint("}\n")
- fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
- fprint("{")
- fprint(" switch(n) {")
- for k, v in normalization:
- fprint(" case %s: return 0x%s;" % (hex(k), v))
- fprint(" default: return 0;")
- fprint(" }\n}\n")
-
+ print >>fp, "static const change_record change_records_%s[] = {" % cversion
+ for record in records:
+ print >>fp, "\t{ %s }," % ", ".join(map(str,record))
+ print >>fp, "};"
+ Array("changes_%s_index" % cversion, index1).dump(fp, trace)
+ Array("changes_%s_data" % cversion, index2).dump(fp, trace)
+ print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
+ print >>fp, "{"
+ print >>fp, "\tint index;"
+ print >>fp, "\tif (n >= 0x110000) index = 0;"
+ print >>fp, "\telse {"
+ print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
+ print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
+ (cversion, shift, ((1<<shift)-1))
+ print >>fp, "\t}"
+ print >>fp, "\treturn change_records_%s+index;" % cversion
+ print >>fp, "}\n"
+ print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
+ print >>fp, "{"
+ print >>fp, "\tswitch(n) {"
+ for k, v in normalization:
+ print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
+ print >>fp, "\tdefault: return 0;"
+ print >>fp, "\t}\n}\n"
+
+ fp.close()
# --------------------------------------------------------------------
# unicode character type tables
@@ -389,7 +352,7 @@ def makeunicodetype(unicode, trace):
FILE = "Objects/unicodetype_db.h"
- print("--- Preparing", FILE, "...")
+ print "--- Preparing", FILE, "..."
# extract unicode types
dummy = (0, 0, 0, 0, 0, 0)
@@ -399,19 +362,19 @@ def makeunicodetype(unicode, trace):
numeric = {}
spaces = []
linebreaks = []
- extra_casing = []
for char in unicode.chars:
record = unicode.table[char]
if record:
# extract database properties
- category = record.general_category
- bidirectional = record.bidi_class
- properties = record.binary_properties
+ category = record[2]
+ bidirectional = record[4]
+ properties = record[16]
flags = 0
+ delta = True
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK
- if "Lowercase" in properties:
+ if category == "Ll":
flags |= LOWER_MASK
if 'Line_Break' in properties or bidirectional == "B":
flags |= LINEBREAK_MASK
@@ -421,76 +384,49 @@ def makeunicodetype(unicode, trace):
spaces.append(char)
if category == "Lt":
flags |= TITLE_MASK
- if "Uppercase" in properties:
+ if category == "Lu":
flags |= UPPER_MASK
- if char == ord(" ") or category[0] not in ("C", "Z"):
- flags |= PRINTABLE_MASK
- if "XID_Start" in properties:
- flags |= XID_START_MASK
- if "XID_Continue" in properties:
- flags |= XID_CONTINUE_MASK
- if "Cased" in properties:
- flags |= CASED_MASK
- if "Case_Ignorable" in properties:
- flags |= CASE_IGNORABLE_MASK
- sc = unicode.special_casing.get(char)
- cf = unicode.case_folding.get(char, [char])
- if record.simple_uppercase_mapping:
- upper = int(record.simple_uppercase_mapping, 16)
+ # use delta predictor for upper/lower/title if it fits
+ if record[12]:
+ upper = int(record[12], 16)
else:
upper = char
- if record.simple_lowercase_mapping:
- lower = int(record.simple_lowercase_mapping, 16)
+ if record[13]:
+ lower = int(record[13], 16)
else:
lower = char
- if record.simple_titlecase_mapping:
- title = int(record.simple_titlecase_mapping, 16)
+ if record[14]:
+ title = int(record[14], 16)
else:
+ # UCD.html says that a missing title char means that
+ # it defaults to the uppercase character, not to the
+ # character itself. Apparently, in the current UCD (5.x)
+ # this feature is never used
title = upper
- if sc is None and cf != [lower]:
- sc = ([lower], [title], [upper])
- if sc is None:
- if upper == lower == title:
- upper = lower = title = 0
- else:
- upper = upper - char
- lower = lower - char
- title = title - char
- assert (abs(upper) <= 2147483647 and
- abs(lower) <= 2147483647 and
- abs(title) <= 2147483647)
+ upper_d = upper - char
+ lower_d = lower - char
+ title_d = title - char
+ if -32768 <= upper_d <= 32767 and \
+ -32768 <= lower_d <= 32767 and \
+ -32768 <= title_d <= 32767:
+ # use deltas
+ upper = upper_d & 0xffff
+ lower = lower_d & 0xffff
+ title = title_d & 0xffff
else:
- # This happens either when some character maps to more than one
- # character in uppercase, lowercase, or titlecase or the
- # casefolded version of the character is different from the
- # lowercase. The extra characters are stored in a different
- # array.
- flags |= EXTENDED_CASE_MASK
- lower = len(extra_casing) | (len(sc[0]) << 24)
- extra_casing.extend(sc[0])
- if cf != sc[0]:
- lower |= len(cf) << 20
- extra_casing.extend(cf)
- upper = len(extra_casing) | (len(sc[2]) << 24)
- extra_casing.extend(sc[2])
- # Title is probably equal to upper.
- if sc[1] == sc[2]:
- title = upper
- else:
- title = len(extra_casing) | (len(sc[1]) << 24)
- extra_casing.extend(sc[1])
+ flags |= NODELTA_MASK
# decimal digit, integer digit
decimal = 0
- if record.decomposition_mapping:
+ if record[6]:
flags |= DECIMAL_MASK
- decimal = int(record.decomposition_mapping)
+ decimal = int(record[6])
digit = 0
- if record.numeric_type:
+ if record[7]:
flags |= DIGIT_MASK
- digit = int(record.numeric_type)
- if record.numeric_value:
+ digit = int(record[7])
+ if record[8]:
flags |= NUMERIC_MASK
- numeric.setdefault(record.numeric_value, []).append(char)
+ numeric.setdefault(record[8], []).append(char)
item = (
upper, lower, title, decimal, digit, flags
)
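
The delta predictor in this hunk packs each case mapping as a 16-bit two's-complement offset from the code point itself ("upper_d & 0xffff"), setting NODELTA_MASK when any offset overflows that range. A minimal round-trip sketch of the encoding; the helper names are illustrative, and the sign-extension on decode is an assumption about what the C consumer has to do:

def encode_delta(char, mapped):
    # store the case mapping as a 16-bit two's-complement delta,
    # exactly like "upper_d & 0xffff" above
    d = mapped - char
    if not (-32768 <= d <= 32767):
        return None  # caller falls back to NODELTA_MASK
    return d & 0xffff

def decode_delta(char, stored):
    # sign-extend the 16-bit field back into a signed delta
    d = stored - 0x10000 if stored & 0x8000 else stored
    return char + d

# U+0131 (dotless i) uppercases to U+0049, a delta of -232
assert decode_delta(0x0131, encode_delta(0x0131, 0x0049)) == 0x0049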
@@ -501,99 +437,126 @@ def makeunicodetype(unicode, trace):
table.append(item)
index[char] = i
- print(len(table), "unique character type entries")
- print(sum(map(len, numeric.values())), "numeric code points")
- print(len(spaces), "whitespace code points")
- print(len(linebreaks), "linebreak code points")
- print(len(extra_casing), "extended case array")
-
- print("--- Writing", FILE, "...")
-
- with open(FILE, "w") as fp:
- fprint = partial(print, file=fp)
-
- fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
- fprint()
- fprint("/* a list of unique character type descriptors */")
- fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
- for item in table:
- fprint(" {%d, %d, %d, %d, %d, %d}," % item)
- fprint("};")
- fprint()
-
- fprint("/* extended case mappings */")
- fprint()
- fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
- for c in extra_casing:
- fprint(" %d," % c)
- fprint("};")
- fprint()
-
- # split decomposition index table
- index1, index2, shift = splitbins(index, trace)
-
- fprint("/* type indexes */")
- fprint("#define SHIFT", shift)
- Array("index1", index1).dump(fp, trace)
- Array("index2", index2).dump(fp, trace)
-
- # Generate code for _PyUnicode_ToNumeric()
- numeric_items = sorted(numeric.items())
- fprint('/* Returns the numeric value as double for Unicode characters')
- fprint(' * having this property, -1.0 otherwise.')
- fprint(' */')
- fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
- fprint('{')
- fprint(' switch (ch) {')
- for value, codepoints in numeric_items:
- # Turn text into float literals
- parts = value.split('/')
- parts = [repr(float(part)) for part in parts]
- value = '/'.join(parts)
-
- codepoints.sort()
- for codepoint in codepoints:
- fprint(' case 0x%04X:' % (codepoint,))
- fprint(' return (double) %s;' % (value,))
- fprint(' }')
- fprint(' return -1.0;')
- fprint('}')
- fprint()
-
- # Generate code for _PyUnicode_IsWhitespace()
- fprint("/* Returns 1 for Unicode characters having the bidirectional")
- fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
- fprint(" */")
- fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
- fprint('{')
- fprint(' switch (ch) {')
-
- for codepoint in sorted(spaces):
- fprint(' case 0x%04X:' % (codepoint,))
- fprint(' return 1;')
-
- fprint(' }')
- fprint(' return 0;')
- fprint('}')
- fprint()
-
- # Generate code for _PyUnicode_IsLinebreak()
- fprint("/* Returns 1 for Unicode characters having the line break")
- fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
- fprint(" * type 'B', 0 otherwise.")
- fprint(" */")
- fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
- fprint('{')
- fprint(' switch (ch) {')
- for codepoint in sorted(linebreaks):
- fprint(' case 0x%04X:' % (codepoint,))
- fprint(' return 1;')
-
- fprint(' }')
- fprint(' return 0;')
- fprint('}')
- fprint()
-
+ print len(table), "unique character type entries"
+ print sum(map(len, numeric.values())), "numeric code points"
+ print len(spaces), "whitespace code points"
+ print len(linebreaks), "linebreak code points"
+
+ print "--- Writing", FILE, "..."
+
+ fp = open(FILE, "w")
+ print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+ print >>fp
+ print >>fp, "/* a list of unique character type descriptors */"
+ print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
+ for item in table:
+ print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
+ print >>fp, "};"
+ print >>fp
+
+ # split decomposition index table
+ index1, index2, shift = splitbins(index, trace)
+
+ print >>fp, "/* type indexes */"
+ print >>fp, "#define SHIFT", shift
+ Array("index1", index1).dump(fp, trace)
+ Array("index2", index2).dump(fp, trace)
+
+ # Generate code for _PyUnicode_ToNumeric()
+ numeric_items = sorted(numeric.items())
+ print >>fp, '/* Returns the numeric value as double for Unicode characters'
+ print >>fp, ' * having this property, -1.0 otherwise.'
+ print >>fp, ' */'
+ print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
+ print >>fp, '{'
+ print >>fp, ' switch (ch) {'
+ for value, codepoints in numeric_items:
+ # Turn text into float literals
+ parts = value.split('/')
+ parts = [repr(float(part)) for part in parts]
+ value = '/'.join(parts)
+
+ haswide = False
+ hasnonewide = False
+ codepoints.sort()
+ for codepoint in codepoints:
+ if codepoint < 0x10000:
+ hasnonewide = True
+ if codepoint >= 0x10000 and not haswide:
+ print >>fp, '#ifdef Py_UNICODE_WIDE'
+ haswide = True
+ print >>fp, ' case 0x%04X:' % (codepoint,)
+ if haswide and hasnonewide:
+ print >>fp, '#endif'
+ print >>fp, ' return (double) %s;' % (value,)
+ if haswide and not hasnonewide:
+ print >>fp, '#endif'
+ print >>fp,' }'
+ print >>fp,' return -1.0;'
+ print >>fp,'}'
+ print >>fp
+
+ # Generate code for _PyUnicode_IsWhitespace()
+ print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
+ print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
+ print >>fp, " */"
+ print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
+ print >>fp, '{'
+ print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
+ print >>fp, ' return iswspace(ch);'
+ print >>fp, '#else'
+ print >>fp, ' switch (ch) {'
+
+ haswide = False
+ hasnonewide = False
+ for codepoint in sorted(spaces):
+ if codepoint < 0x10000:
+ hasnonewide = True
+ if codepoint >= 0x10000 and not haswide:
+ print >>fp, '#ifdef Py_UNICODE_WIDE'
+ haswide = True
+ print >>fp, ' case 0x%04X:' % (codepoint,)
+ if haswide and hasnonewide:
+ print >>fp, '#endif'
+ print >>fp, ' return 1;'
+ if haswide and not hasnonewide:
+ print >>fp, '#endif'
+
+ print >>fp,' }'
+ print >>fp,' return 0;'
+ print >>fp, '#endif'
+ print >>fp,'}'
+ print >>fp
+
+ # Generate code for _PyUnicode_IsLinebreak()
+ print >>fp, "/* Returns 1 for Unicode characters having the line break"
+ print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
+ print >>fp, " * type 'B', 0 otherwise."
+ print >>fp, " */"
+ print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
+ print >>fp, '{'
+ print >>fp, ' switch (ch) {'
+ haswide = False
+ hasnonewide = False
+ for codepoint in sorted(linebreaks):
+ if codepoint < 0x10000:
+ hasnonewide = True
+ if codepoint >= 0x10000 and not haswide:
+ print >>fp, '#ifdef Py_UNICODE_WIDE'
+ haswide = True
+ print >>fp, ' case 0x%04X:' % (codepoint,)
+ if haswide and hasnonewide:
+ print >>fp, '#endif'
+ print >>fp, ' return 1;'
+ if haswide and not hasnonewide:
+ print >>fp, '#endif'
+
+ print >>fp,' }'
+ print >>fp,' return 0;'
+ print >>fp,'}'
+ print >>fp
+
+ fp.close()
# --------------------------------------------------------------------
# unicode name database
@@ -602,7 +565,7 @@ def makeunicodename(unicode, trace):
FILE = "Modules/unicodename_db.h"
- print("--- Preparing", FILE, "...")
+ print "--- Preparing", FILE, "..."
# collect names
names = [None] * len(unicode.chars)
@@ -610,11 +573,11 @@ def makeunicodename(unicode, trace):
for char in unicode.chars:
record = unicode.table[char]
if record:
- name = record.name.strip()
+ name = record[1].strip()
if name and name[0] != "<":
names[char] = name + chr(0)
- print(len([n for n in names if n is not None]), "distinct names")
+ print len(filter(lambda n: n is not None, names)), "distinct names"
# collect unique words from names (note that we differ between
# words inside a sentence, and words ending a sentence. the
@@ -635,9 +598,9 @@ def makeunicodename(unicode, trace):
else:
words[w] = [len(words)]
- print(n, "words in text;", b, "bytes")
+ print n, "words in text;", b, "bytes"
- wordlist = list(words.items())
+ wordlist = words.items()
# sort on falling frequency, then by name
def word_key(a):
@@ -649,19 +612,19 @@ def makeunicodename(unicode, trace):
escapes = 0
while escapes * 256 < len(wordlist):
escapes = escapes + 1
- print(escapes, "escapes")
+ print escapes, "escapes"
short = 256 - escapes
assert short > 0
- print(short, "short indexes in lexicon")
+ print short, "short indexes in lexicon"
# statistics
n = 0
for i in range(short):
n = n + len(wordlist[i][1])
- print(n, "short indexes in phrasebook")
+ print n, "short indexes in phrasebook"
# pick the most commonly used words, and sort the rest on falling
# length (to maximize overlap)
@@ -691,7 +654,7 @@ def makeunicodename(unicode, trace):
words[w] = len(lexicon_offset)
lexicon_offset.append(o)
- lexicon = list(map(ord, lexicon))
+ lexicon = map(ord, lexicon)
# generate phrasebook from names and lexicon
phrasebook = [0]
@@ -720,7 +683,7 @@ def makeunicodename(unicode, trace):
for char in unicode.chars:
record = unicode.table[char]
if record:
- name = record.name.strip()
+ name = record[1].strip()
if name and name[0] != "<":
data.append((name, char))
@@ -730,78 +693,44 @@ def makeunicodename(unicode, trace):
codehash = Hash("code", data, 47)
- print("--- Writing", FILE, "...")
-
- with open(FILE, "w") as fp:
- fprint = partial(print, file=fp)
-
- fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
- fprint()
- fprint("#define NAME_MAXLEN", 256)
- fprint()
- fprint("/* lexicon */")
- Array("lexicon", lexicon).dump(fp, trace)
- Array("lexicon_offset", lexicon_offset).dump(fp, trace)
-
- # split decomposition index table
- offset1, offset2, shift = splitbins(phrasebook_offset, trace)
-
- fprint("/* code->name phrasebook */")
- fprint("#define phrasebook_shift", shift)
- fprint("#define phrasebook_short", short)
-
- Array("phrasebook", phrasebook).dump(fp, trace)
- Array("phrasebook_offset1", offset1).dump(fp, trace)
- Array("phrasebook_offset2", offset2).dump(fp, trace)
-
- fprint("/* name->code dictionary */")
- codehash.dump(fp, trace)
-
- fprint()
- fprint('static const unsigned int aliases_start = %#x;' %
- NAME_ALIASES_START)
- fprint('static const unsigned int aliases_end = %#x;' %
- (NAME_ALIASES_START + len(unicode.aliases)))
-
- fprint('static const unsigned int name_aliases[] = {')
- for name, codepoint in unicode.aliases:
- fprint(' 0x%04X,' % codepoint)
- fprint('};')
-
- # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
- # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
- # sequences or sequences with non-BMP chars are added.
- # unicodedata_lookup should be adapted too.
- fprint(dedent("""
- typedef struct NamedSequence {
- int seqlen;
- Py_UCS2 seq[4];
- } named_sequence;
- """))
-
- fprint('static const unsigned int named_sequences_start = %#x;' %
- NAMED_SEQUENCES_START)
- fprint('static const unsigned int named_sequences_end = %#x;' %
- (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
-
- fprint('static const named_sequence named_sequences[] = {')
- for name, sequence in unicode.named_sequences:
- seq_str = ', '.join('0x%04X' % cp for cp in sequence)
- fprint(' {%d, {%s}},' % (len(sequence), seq_str))
- fprint('};')
+ print "--- Writing", FILE, "..."
+
+ fp = open(FILE, "w")
+ print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+ print >>fp
+ print >>fp, "#define NAME_MAXLEN", 256
+ print >>fp
+ print >>fp, "/* lexicon */"
+ Array("lexicon", lexicon).dump(fp, trace)
+ Array("lexicon_offset", lexicon_offset).dump(fp, trace)
+
+ # split decomposition index table
+ offset1, offset2, shift = splitbins(phrasebook_offset, trace)
+
+ print >>fp, "/* code->name phrasebook */"
+ print >>fp, "#define phrasebook_shift", shift
+ print >>fp, "#define phrasebook_short", short
+
+ Array("phrasebook", phrasebook).dump(fp, trace)
+ Array("phrasebook_offset1", offset1).dump(fp, trace)
+ Array("phrasebook_offset2", offset2).dump(fp, trace)
+
+ print >>fp, "/* name->code dictionary */"
+ codehash.dump(fp, trace)
+
+ fp.close()
def merge_old_version(version, new, old):
# Changes to exclusion file not implemented yet
if old.exclusions != new.exclusions:
- raise NotImplementedError("exclusions differ")
+ raise NotImplementedError, "exclusions differ"
# In these change records, 0xFF means "no change"
bidir_changes = [0xFF]*0x110000
category_changes = [0xFF]*0x110000
decimal_changes = [0xFF]*0x110000
mirrored_changes = [0xFF]*0x110000
- east_asian_width_changes = [0xFF]*0x110000
# In numeric data, 0 means "no change",
# -1 means "did not have a numeric value"
numeric_changes = [0] * 0x110000
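
These sentinel conventions (0xFF for "no change" in the categorical fields, 0 and -1 in the numeric field) are what the generated get_change_* tables hand back to unicodedata.c. A small sketch of how a consumer could decode one change record, with field order matching the zip() that builds new.changed later in this function (bidir, category, decimal, mirrored, numeric on the 2.x side); the dict layout is illustrative:

NO_CHANGE = 0xFF

def apply_change(record, props):
    # props: the new-database properties for one code point;
    # returns what the old database version would have reported
    bidir, category, decimal, mirrored, numeric = record
    old = dict(props)
    if bidir != NO_CHANGE:
        old['bidirectional'] = BIDIRECTIONAL_NAMES[bidir]
    if category != NO_CHANGE:
        old['category'] = CATEGORY_NAMES[category]
    if decimal != NO_CHANGE:
        old['decimal'] = decimal
    if mirrored != NO_CHANGE:
        old['mirrored'] = mirrored
    if numeric == -1:
        old['numeric'] = None   # no numeric value in the old version
    elif numeric != 0:
        old['numeric'] = numeric
    return old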
@@ -820,27 +749,27 @@ def merge_old_version(version, new, old):
continue
# check characters that differ
if old.table[i] != new.table[i]:
- for k, field in enumerate(dataclasses.fields(UcdRecord)):
- value = getattr(old.table[i], field.name)
- new_value = getattr(new.table[i], field.name)
- if value != new_value:
- if k == 1 and i in PUA_15:
- # the name is not set in the old.table, but in the
- # new.table we are using it for aliases and named seq
- assert value == ''
- elif k == 2:
+ for k in range(len(old.table[i])):
+ if old.table[i][k] != new.table[i][k]:
+ value = old.table[i][k]
+ if k == 2:
+ #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
category_changes[i] = CATEGORY_NAMES.index(value)
elif k == 4:
+ #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
elif k == 5:
+ #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
# We assume that all normalization changes are in 1:1 mappings
assert " " not in value
normalization_changes.append((i, value))
elif k == 6:
+ #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
# we only support changes where the old value is a single digit
assert value in "0123456789"
decimal_changes[i] = int(value)
elif k == 8:
+ # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
# Since 0 encodes "no change", the old value is better not 0
if not value:
numeric_changes[i] = -1
@@ -864,130 +793,18 @@ def merge_old_version(version, new, old):
elif k == 14:
# change to simple titlecase mapping; ignore
pass
- elif k == 15:
- # change to east asian width
- east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
elif k == 16:
- # derived property changes; not yet
- pass
- elif k == 17:
- # normalization quickchecks are not performed
- # for older versions
+ # change to properties; not yet
pass
else:
class Difference(Exception):pass
- raise Difference(hex(i), k, old.table[i], new.table[i])
- new.changed.append((version, list(zip(bidir_changes, category_changes,
- decimal_changes, mirrored_changes,
- east_asian_width_changes,
- numeric_changes)),
+ raise Difference, (hex(i), k, old.table[i], new.table[i])
+ new.changed.append((version, zip(bidir_changes, category_changes,
+ decimal_changes, mirrored_changes,
+ numeric_changes),
normalization_changes))
-DATA_DIR = os.path.join('Tools', 'unicode', 'data')
-
-def open_data(template, version):
- local = os.path.join(DATA_DIR, template % ('-'+version,))
- if not os.path.exists(local):
- import urllib.request
- if version == '3.2.0':
- # irregular url structure
- url = ('http://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
- else:
- url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
- os.makedirs(DATA_DIR, exist_ok=True)
- urllib.request.urlretrieve(url, filename=local)
- if local.endswith('.txt'):
- return open(local, encoding='utf-8')
- else:
- # Unihan.zip
- return open(local, 'rb')
-
-
-def expand_range(char_range: str) -> Iterator[int]:
- '''
- Parses ranges of code points, as described in UAX #44:
- https://www.unicode.org/reports/tr44/#Code_Point_Ranges
- '''
- if '..' in char_range:
- first, last = [int(c, 16) for c in char_range.split('..')]
- else:
- first = last = int(char_range, 16)
- for char in range(first, last+1):
- yield char
-
-
-class UcdFile:
- '''
- A file in the standard format of the UCD.
-
- See: https://www.unicode.org/reports/tr44/#Format_Conventions
-
- Note that, as described there, the Unihan data files have their
- own separate format.
- '''
-
- def __init__(self, template: str, version: str) -> None:
- self.template = template
- self.version = version
-
- def records(self) -> Iterator[List[str]]:
- with open_data(self.template, self.version) as file:
- for line in file:
- line = line.split('#', 1)[0].strip()
- if not line:
- continue
- yield [field.strip() for field in line.split(';')]
-
- def __iter__(self) -> Iterator[List[str]]:
- return self.records()
-
- def expanded(self) -> Iterator[Tuple[int, List[str]]]:
- for record in self.records():
- char_range, rest = record[0], record[1:]
- for char in expand_range(char_range):
- yield char, rest
-
-
-@dataclasses.dataclass
-class UcdRecord:
- # 15 fields from UnicodeData.txt . See:
- # https://www.unicode.org/reports/tr44/#UnicodeData.txt
- codepoint: str
- name: str
- general_category: str
- canonical_combining_class: str
- bidi_class: str
- decomposition_type: str
- decomposition_mapping: str
- numeric_type: str
- numeric_value: str
- bidi_mirrored: str
- unicode_1_name: str # obsolete
- iso_comment: str # obsolete
- simple_uppercase_mapping: str
- simple_lowercase_mapping: str
- simple_titlecase_mapping: str
-
- # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
- east_asian_width: Optional[str]
-
- # Binary properties, as a set of those that are true.
- # Taken from multiple files:
- # https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
- # https://www.unicode.org/reports/tr44/#LineBreak.txt
- binary_properties: Set[str]
-
- # The Quick_Check properties related to normalization:
- # https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
- # We store them as a bitmask.
- quick_check: int
-
-
-def from_row(row: List[str]) -> UcdRecord:
- return UcdRecord(*row, None, set(), 0)
-
-
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB
@@ -995,133 +812,118 @@ def from_row(row: List[str]) -> UcdRecord:
# load a unicode-data file from disk
class UnicodeData:
- # table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
-
- def __init__(self, version, cjk_check=True):
+ # Record structure:
+ # [ID, name, category, combining, bidi, decomp, (6)
+ # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
+ # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
+ # properties] (17)
+
+ def __init__(self, filename, exclusions, eastasianwidth, unihan,
+ derivednormalizationprops=None, linebreakprops=None,
+ expand=1):
self.changed = []
+ file = open(filename)
table = [None] * 0x110000
- for s in UcdFile(UNICODE_DATA, version):
+ while 1:
+ s = file.readline()
+ if not s:
+ break
+ s = s.strip().split(";")
char = int(s[0], 16)
- table[char] = from_row(s)
-
- cjk_ranges_found = []
+ table[char] = s
# expand first-last ranges
- field = None
- for i in range(0, 0x110000):
- # The file UnicodeData.txt has its own distinct way of
- # expressing ranges. See:
- # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
- s = table[i]
- if s:
- if s.name[-6:] == "First>":
- s.name = ""
- field = dataclasses.astuple(s)[:15]
- elif s.name[-5:] == "Last>":
- if s.name.startswith("<CJK Ideograph"):
- cjk_ranges_found.append((field[0],
- s.codepoint))
- s.name = ""
- field = None
- elif field:
- table[i] = from_row(('%X' % i,) + field[1:])
- if cjk_check and cjk_ranges != cjk_ranges_found:
- raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
+ if expand:
+ field = None
+ for i in range(0, 0x110000):
+ s = table[i]
+ if s:
+ if s[1][-6:] == "First>":
+ s[1] = ""
+ field = s
+ elif s[1][-5:] == "Last>":
+ s[1] = ""
+ field = None
+ elif field:
+ f2 = field[:]
+ f2[0] = "%X" % i
+ table[i] = f2
# public attributes
- self.filename = UNICODE_DATA % ''
+ self.filename = filename
self.table = table
- self.chars = list(range(0x110000)) # unicode 3.2
-
- # check for name aliases and named sequences, see #12753
- # aliases and named sequences are not in 3.2.0
- if version != '3.2.0':
- self.aliases = []
- # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
- # in order to take advantage of the compression and lookup
- # algorithms used for the other characters
- pua_index = NAME_ALIASES_START
- for char, name, abbrev in UcdFile(NAME_ALIASES, version):
- char = int(char, 16)
- self.aliases.append((name, char))
- # also store the name in the PUA 1
- self.table[pua_index].name = name
- pua_index += 1
- assert pua_index - NAME_ALIASES_START == len(self.aliases)
-
- self.named_sequences = []
- # store named sequences in the PUA 1, in range U+F0100..,
- # in order to take advantage of the compression and lookup
- # algorithms used for the other characters.
-
- assert pua_index < NAMED_SEQUENCES_START
- pua_index = NAMED_SEQUENCES_START
- for name, chars in UcdFile(NAMED_SEQUENCES, version):
- chars = tuple(int(char, 16) for char in chars.split())
- # check that the structure defined in makeunicodename is OK
- assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
- assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
- "the NamedSequence struct and in unicodedata_lookup")
- self.named_sequences.append((name, chars))
- # also store these in the PUA 1
- self.table[pua_index].name = name
- pua_index += 1
- assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
+ self.chars = range(0x110000) # unicode 3.2
+ file = open(exclusions)
self.exclusions = {}
- for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
- char = int(char, 16)
+ for s in file:
+ s = s.strip()
+ if not s:
+ continue
+ if s[0] == '#':
+ continue
+ char = int(s.split()[0],16)
self.exclusions[char] = 1
widths = [None] * 0x110000
- for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
- widths[char] = width
-
- for i in range(0, 0x110000):
- if table[i] is not None:
- table[i].east_asian_width = widths[i]
-
- for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
- if table[char]:
- # Some properties (e.g. Default_Ignorable_Code_Point)
- # apply to unassigned code points; ignore them
- table[char].binary_properties.add(p)
-
- for char_range, value in UcdFile(LINE_BREAK, version):
- if value not in MANDATORY_LINE_BREAKS:
+ for s in open(eastasianwidth):
+ s = s.strip()
+ if not s:
continue
- for char in expand_range(char_range):
- table[char].binary_properties.add('Line_Break')
-
- # We only want the quickcheck properties
- # Format: NF?_QC; Y(es)/N(o)/M(aybe)
- # Yes is the default, hence only N and M occur
- # In 3.2.0, the format was different (NF?_NO)
- # The parsing will incorrectly determine these as
- # "yes", however, unicodedata.c will not perform quickchecks
- # for older versions, and no delta records will be created.
- quickchecks = [0] * 0x110000
- qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
- for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
- if len(s) < 2 or s[1] not in qc_order:
+ if s[0] == '#':
continue
- quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
- quickcheck_shift = qc_order.index(s[1])*2
- quickcheck <<= quickcheck_shift
- for char in expand_range(s[0]):
- assert not (quickchecks[char]>>quickcheck_shift)&3
- quickchecks[char] |= quickcheck
+ s = s.split()[0].split(';')
+ if '..' in s[0]:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ chars = range(first, last+1)
+ else:
+ chars = [int(s[0], 16)]
+ for char in chars:
+ widths[char] = s[1]
for i in range(0, 0x110000):
if table[i] is not None:
- table[i].quick_check = quickchecks[i]
+ table[i].append(widths[i])
- with open_data(UNIHAN, version) as file:
- zip = zipfile.ZipFile(file)
- if version == '3.2.0':
- data = zip.open('Unihan-3.2.0.txt').read()
- else:
- data = zip.open('Unihan_NumericValues.txt').read()
- for line in data.decode("utf-8").splitlines():
+ for i in range(0, 0x110000):
+ if table[i] is not None:
+ table[i].append(set())
+ if linebreakprops:
+ for s in open(linebreakprops):
+ s = s.partition('#')[0]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+ continue
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ table[char][-1].add('Line_Break')
+
+ if derivednormalizationprops:
+ quickchecks = [0] * 0x110000 # default is Yes
+ qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+ for s in open(derivednormalizationprops):
+ if '#' in s:
+ s = s[:s.index('#')]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in qc_order:
+ continue
+ quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+ quickcheck_shift = qc_order.index(s[1])*2
+ quickcheck <<= quickcheck_shift
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ assert not (quickchecks[char]>>quickcheck_shift)&3
+ quickchecks[char] |= quickcheck
+ for i in range(0, 0x110000):
+ if table[i] is not None:
+ table[i].append(quickchecks[i])
+
+ for line in open(unihan):
if not line.startswith('U+'):
continue
code, tag, value = line.split(None, 3)[:3]
@@ -1132,32 +934,11 @@ class UnicodeData:
i = int(code[2:], 16)
# Patch the numeric field
if table[i] is not None:
- table[i].numeric_value = value
-
- sc = self.special_casing = {}
- for data in UcdFile(SPECIAL_CASING, version):
- if data[4]:
- # We ignore all conditionals (since they depend on
- # languages) except for one, which is hardcoded. See
- # handle_capital_sigma in unicodeobject.c.
- continue
- c = int(data[0], 16)
- lower = [int(char, 16) for char in data[1].split()]
- title = [int(char, 16) for char in data[2].split()]
- upper = [int(char, 16) for char in data[3].split()]
- sc[c] = (lower, title, upper)
-
- cf = self.case_folding = {}
- if version != '3.2.0':
- for data in UcdFile(CASE_FOLDING, version):
- if data[1] in "CF":
- c = int(data[0], 16)
- cf[c] = [int(char, 16) for char in data[2].split()]
+ table[i][8] = value
def uselatin1(self):
# restrict character range to ISO Latin 1
- self.chars = list(range(256))
-
+ self.chars = range(256)
# hash table tools
@@ -1169,12 +950,11 @@ def myhash(s, magic):
h = 0
for c in map(ord, s.upper()):
h = (h * magic) + c
- ix = h & 0xff000000
+ ix = h & 0xff000000L
if ix:
h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
return h
-
SIZES = [
(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
@@ -1182,7 +962,6 @@ SIZES = [
(2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
-
class Hash:
def __init__(self, name, data, magic):
# turn a (key, value) list into a static hash table structure
@@ -1193,9 +972,9 @@ class Hash:
poly = size + poly
break
else:
- raise AssertionError("ran out of polynomials")
+ raise AssertionError, "ran out of polynomials"
- print(size, "slots in hash table")
+ print size, "slots in hash table"
table = [None] * size
@@ -1213,7 +992,7 @@ class Hash:
if v is None:
table[i] = value
continue
- incr = (h ^ (h >> 3)) & mask
+ incr = (h ^ (h >> 3)) & mask;
if not incr:
incr = mask
while 1:
@@ -1227,7 +1006,7 @@ class Hash:
if incr > mask:
incr = incr ^ poly
- print(n, "collisions")
+ print n, "collisions"
self.collisions = n
for i in range(len(table)):
@@ -1247,7 +1026,6 @@ class Hash:
file.write("#define %s_size %d\n" % (self.name, self.size))
file.write("#define %s_poly %d\n" % (self.name, self.poly))
-
# stuff to deal with arrays of unsigned integers
class Array:
@@ -1260,8 +1038,8 @@ class Array:
# write data to file, as a C array
size = getsize(self.data)
if trace:
- print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
- file.write("static const ")
+ print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
+ file.write("static ")
if size == 1:
file.write("unsigned char")
elif size == 2:
@@ -1274,15 +1052,14 @@ class Array:
for item in self.data:
i = str(item) + ", "
if len(s) + len(i) > 78:
- file.write(s.rstrip() + "\n")
+ file.write(s + "\n")
s = " " + i
else:
s = s + i
if s.strip():
- file.write(s.rstrip() + "\n")
+ file.write(s + "\n")
file.write("};\n\n")
-
def getsize(data):
# return smallest possible integer size for the given array
maxdata = max(data)
@@ -1293,7 +1070,6 @@ def getsize(data):
else:
return 4
-
def splitbins(t, trace=0):
"""t, trace=0 -> (t1, t2, shift). Split a table to save space.
@@ -1311,10 +1087,10 @@ def splitbins(t, trace=0):
if trace:
def dump(t1, t2, shift, bytes):
- print("%d+%d bins at shift %d; %d bytes" % (
- len(t1), len(t2), shift, bytes), file=sys.stderr)
- print("Size of original table:", len(t)*getsize(t), "bytes",
- file=sys.stderr)
+ print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
+ len(t1), len(t2), shift, bytes)
+ print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
+ "bytes"
n = len(t)-1 # last valid index
maxshift = 0 # the most we can shift n and still have something left
if n > 0:
@@ -1322,7 +1098,7 @@ def splitbins(t, trace=0):
n >>= 1
maxshift += 1
del n
- bytes = sys.maxsize # smallest total size so far
+ bytes = sys.maxint # smallest total size so far
t = tuple(t) # so slices can be dict keys
for shift in range(maxshift + 1):
t1 = []
@@ -1346,15 +1122,14 @@ def splitbins(t, trace=0):
bytes = b
t1, t2, shift = best
if trace:
- print("Best:", end=' ', file=sys.stderr)
+ print >>sys.stderr, "Best:",
dump(t1, t2, shift, bytes)
if __debug__:
# exhaustively verify that the decomposition is correct
mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
- for i in range(len(t)):
+ for i in xrange(len(t)):
assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
return best
-
if __name__ == "__main__":
maketables(1)
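
splitbins() trades one big table for a first-level index into shared second-level chunks; the __debug__ assertion at its end states the exact reconstruction. A minimal sketch of the round trip, using only what splitbins() itself guarantees:

def lookup(t1, t2, shift, i):
    # recover t[i] from the two-level table, mirroring the
    # assertion inside splitbins()
    mask = (1 << shift) - 1
    return t2[(t1[i >> shift] << shift) + (i & mask)]

table = [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]
t1, t2, shift = splitbins(table)
assert all(lookup(t1, t2, shift, i) == table[i] for i in range(len(table)))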