Diffstat (limited to 'Tools/unicode')
 Tools/unicode/makeunicodedata.py | 242 +++++++++++++++++------------------
 1 file changed, 109 insertions(+), 133 deletions(-)
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 5b9427a..1be93ec 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -30,8 +30,9 @@ import os
import sys
import zipfile
-from textwrap import dedent
from functools import partial
+from textwrap import dedent
+from typing import *
SCRIPT = sys.argv[0]
VERSION = "3.3"
@@ -903,6 +904,32 @@ def open_data(template, version):
return open(local, 'rb')
+class UcdFile:
+ '''
+ A file in the standard format of the UCD.
+
+ See: https://www.unicode.org/reports/tr44/#Format_Conventions
+
+ Note that, as described there, the Unihan data files have their
+ own separate format.
+ '''
+
+ def __init__(self, template: str, version: str) -> None:
+ self.template = template
+ self.version = version
+
+ def records(self) -> Iterator[List[str]]:
+ with open_data(self.template, self.version) as file:
+ for line in file:
+ line = line.split('#', 1)[0].strip()
+ if not line:
+ continue
+ yield [field.strip() for field in line.split(';')]
+
+ def __iter__(self) -> Iterator[List[str]]:
+ return self.records()
+
+
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB
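
A minimal sketch of what the new UcdFile.records() does with a typical
UCD-style line (the sample line below is illustrative, not taken from the
commit): comments after '#' are dropped, blank lines are skipped, and each
';'-separated field is stripped of surrounding whitespace.

    # Illustrative only: records() applies exactly this transformation
    # to every non-blank, non-comment line of the data file.
    line = '1F600    ; Emoji   # grinning face\n'
    fields = [field.strip() for field in line.split('#', 1)[0].strip().split(';')]
    assert fields == ['1F600', 'Emoji']
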
@@ -922,14 +949,9 @@ class UnicodeData:
cjk_check=True):
self.changed = []
table = [None] * 0x110000
- with open_data(UNICODE_DATA, version) as file:
- while 1:
- s = file.readline()
- if not s:
- break
- s = s.strip().split(";")
- char = int(s[0], 16)
- table[char] = s
+ for s in UcdFile(UNICODE_DATA, version):
+ char = int(s[0], 16)
+ table[char] = s
cjk_ranges_found = []
@@ -968,17 +990,12 @@ class UnicodeData:
# in order to take advantage of the compression and lookup
# algorithms used for the other characters
pua_index = NAME_ALIASES_START
- with open_data(NAME_ALIASES, version) as file:
- for s in file:
- s = s.strip()
- if not s or s.startswith('#'):
- continue
- char, name, abbrev = s.split(';')
- char = int(char, 16)
- self.aliases.append((name, char))
- # also store the name in the PUA 1
- self.table[pua_index][1] = name
- pua_index += 1
+ for char, name, abbrev in UcdFile(NAME_ALIASES, version):
+ char = int(char, 16)
+ self.aliases.append((name, char))
+ # also store the name in the PUA 1
+ self.table[pua_index][1] = name
+ pua_index += 1
assert pua_index - NAME_ALIASES_START == len(self.aliases)
self.named_sequences = []
@@ -988,50 +1005,32 @@ class UnicodeData:
assert pua_index < NAMED_SEQUENCES_START
pua_index = NAMED_SEQUENCES_START
- with open_data(NAMED_SEQUENCES, version) as file:
- for s in file:
- s = s.strip()
- if not s or s.startswith('#'):
- continue
- name, chars = s.split(';')
- chars = tuple(int(char, 16) for char in chars.split())
- # check that the structure defined in makeunicodename is OK
- assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
- assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
- "the NamedSequence struct and in unicodedata_lookup")
- self.named_sequences.append((name, chars))
- # also store these in the PUA 1
- self.table[pua_index][1] = name
- pua_index += 1
+ for name, chars in UcdFile(NAMED_SEQUENCES, version):
+ chars = tuple(int(char, 16) for char in chars.split())
+ # check that the structure defined in makeunicodename is OK
+ assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
+ assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
+ "the NamedSequence struct and in unicodedata_lookup")
+ self.named_sequences.append((name, chars))
+ # also store these in the PUA 1
+ self.table[pua_index][1] = name
+ pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
self.exclusions = {}
- with open_data(COMPOSITION_EXCLUSIONS, version) as file:
- for s in file:
- s = s.strip()
- if not s:
- continue
- if s[0] == '#':
- continue
- char = int(s.split()[0],16)
- self.exclusions[char] = 1
+ for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
+ char = int(char, 16)
+ self.exclusions[char] = 1
widths = [None] * 0x110000
- with open_data(EASTASIAN_WIDTH, version) as file:
- for s in file:
- s = s.strip()
- if not s:
- continue
- if s[0] == '#':
- continue
- s = s.split()[0].split(';')
- if '..' in s[0]:
- first, last = [int(c, 16) for c in s[0].split('..')]
- chars = list(range(first, last+1))
- else:
- chars = [int(s[0], 16)]
- for char in chars:
- widths[char] = s[1]
+ for s in UcdFile(EASTASIAN_WIDTH, version):
+ if '..' in s[0]:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ chars = list(range(first, last+1))
+ else:
+ chars = [int(s[0], 16)]
+ for char in chars:
+ widths[char] = s[1]
for i in range(0, 0x110000):
if table[i] is not None:
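
The 'first..last' expansion just above recurs in several of these loops
(EastAsianWidth, DerivedCoreProperties, LineBreak, and the quickcheck data
below). A small helper along these lines would capture the idiom; note that
expand_range is a hypothetical name used for illustration here, not something
this commit adds:

    from typing import Iterator

    def expand_range(value: str) -> Iterator[int]:
        # Expand a UCD code-point field: either a single point such as
        # '0041', or an inclusive range such as '3400..4DB5'.
        if '..' in value:
            first, last = [int(c, 16) for c in value.split('..')]
        else:
            first = last = int(value, 16)
        yield from range(first, last + 1)

With it, the width loop above could read
"for char in expand_range(s[0]): widths[char] = s[1]".
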
@@ -1041,38 +1040,27 @@ class UnicodeData:
if table[i] is not None:
table[i].append(set())
- with open_data(DERIVED_CORE_PROPERTIES, version) as file:
- for s in file:
- s = s.split('#', 1)[0].strip()
- if not s:
- continue
-
- r, p = s.split(";")
- r = r.strip()
- p = p.strip()
- if ".." in r:
- first, last = [int(c, 16) for c in r.split('..')]
- chars = list(range(first, last+1))
- else:
- chars = [int(r, 16)]
- for char in chars:
- if table[char]:
- # Some properties (e.g. Default_Ignorable_Code_Point)
- # apply to unassigned code points; ignore them
- table[char][-1].add(p)
-
- with open_data(LINE_BREAK, version) as file:
- for s in file:
- s = s.partition('#')[0]
- s = [i.strip() for i in s.split(';')]
- if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
- continue
- if '..' not in s[0]:
- first = last = int(s[0], 16)
- else:
- first, last = [int(c, 16) for c in s[0].split('..')]
- for char in range(first, last+1):
- table[char][-1].add('Line_Break')
+ for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
+ if ".." in r:
+ first, last = [int(c, 16) for c in r.split('..')]
+ chars = list(range(first, last+1))
+ else:
+ chars = [int(r, 16)]
+ for char in chars:
+ if table[char]:
+ # Some properties (e.g. Default_Ignorable_Code_Point)
+ # apply to unassigned code points; ignore them
+ table[char][-1].add(p)
+
+ for s in UcdFile(LINE_BREAK, version):
+ if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+ continue
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ table[char][-1].add('Line_Break')
# We only want the quickcheck properties
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -1083,23 +1071,19 @@ class UnicodeData:
# for older versions, and no delta records will be created.
quickchecks = [0] * 0x110000
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
- with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
- for s in file:
- if '#' in s:
- s = s[:s.index('#')]
- s = [i.strip() for i in s.split(';')]
- if len(s) < 2 or s[1] not in qc_order:
- continue
- quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
- quickcheck_shift = qc_order.index(s[1])*2
- quickcheck <<= quickcheck_shift
- if '..' not in s[0]:
- first = last = int(s[0], 16)
- else:
- first, last = [int(c, 16) for c in s[0].split('..')]
- for char in range(first, last+1):
- assert not (quickchecks[char]>>quickcheck_shift)&3
- quickchecks[char] |= quickcheck
+ for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
+ if len(s) < 2 or s[1] not in qc_order:
+ continue
+ quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+ quickcheck_shift = qc_order.index(s[1])*2
+ quickcheck <<= quickcheck_shift
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ assert not (quickchecks[char]>>quickcheck_shift)&3
+ quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(quickchecks[i])
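
To make the packing concrete, a worked example (a hypothetical character, not
extra data from the file): each property in qc_order owns a 2-bit field, where
0 means Yes (the default), 1 Maybe, and 2 No. A character that is
NFC_QC=Maybe and NFKC_QC=No, and Yes for both decomposed forms, packs as:

    qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
    packed = 0
    packed |= ('MN'.index('M') + 1) << (qc_order.index('NFC_QC') * 2)   # 1 << 4
    packed |= ('MN'.index('N') + 1) << (qc_order.index('NFKC_QC') * 2)  # 2 << 6
    assert packed == 0b10010000  # 144; the NFD/NFKD fields stay 0 (Yes)
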
@@ -1122,34 +1106,26 @@ class UnicodeData:
# Patch the numeric field
if table[i] is not None:
table[i][8] = value
+
sc = self.special_casing = {}
- with open_data(SPECIAL_CASING, version) as file:
- for s in file:
- s = s[:-1].split('#', 1)[0]
- if not s:
- continue
- data = s.split("; ")
- if data[4]:
- # We ignore all conditionals (since they depend on
- # languages) except for one, which is hardcoded. See
- # handle_capital_sigma in unicodeobject.c.
- continue
- c = int(data[0], 16)
- lower = [int(char, 16) for char in data[1].split()]
- title = [int(char, 16) for char in data[2].split()]
- upper = [int(char, 16) for char in data[3].split()]
- sc[c] = (lower, title, upper)
+ for data in UcdFile(SPECIAL_CASING, version):
+ if data[4]:
+ # We ignore all conditionals (since they depend on
+ # languages) except for one, which is hardcoded. See
+ # handle_capital_sigma in unicodeobject.c.
+ continue
+ c = int(data[0], 16)
+ lower = [int(char, 16) for char in data[1].split()]
+ title = [int(char, 16) for char in data[2].split()]
+ upper = [int(char, 16) for char in data[3].split()]
+ sc[c] = (lower, title, upper)
+
cf = self.case_folding = {}
if version != '3.2.0':
- with open_data(CASE_FOLDING, version) as file:
- for s in file:
- s = s[:-1].split('#', 1)[0]
- if not s:
- continue
- data = s.split("; ")
- if data[1] in "CF":
- c = int(data[0], 16)
- cf[c] = [int(char, 16) for char in data[2].split()]
+ for data in UcdFile(CASE_FOLDING, version):
+ if data[1] in "CF":
+ c = int(data[0], 16)
+ cf[c] = [int(char, 16) for char in data[2].split()]
def uselatin1(self):
# restrict character range to ISO Latin 1
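
As a sanity check on the SPECIAL_CASING loop above, consider the one famous
unconditional entry, U+00DF LATIN SMALL LETTER SHARP S. Its line in
SpecialCasing.txt reads (quoted here for illustration):

    00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S

UcdFile strips the comment and yields ['00DF', '00DF', '0053 0073',
'0053 0053', '']; data[4] is empty (no condition list), so the entry is kept
and the loop records:

    # lower stays U+00DF; title is 'Ss'; upper is 'SS'
    assert sc[0x00DF] == ([0x00DF], [0x0053, 0x0073], [0x0053, 0x0053])
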