summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Greg Price <gnprice@gmail.com> 2019-08-14 02:28:38 (GMT)
committer Benjamin Peterson <benjamin@python.org> 2019-08-14 02:28:38 (GMT)
commit c03e698c344dfc557555b6b07a3ee2702e45f6ee (patch)
tree 947a399fe68f91cab7a96937a500d23443b765b6
parent 38c7199beb30ae9a5005c0f0d9df9fae0da3680a (diff)
download cpython-c03e698c344dfc557555b6b07a3ee2702e45f6ee.zip
cpython-c03e698c344dfc557555b6b07a3ee2702e45f6ee.tar.gz
cpython-c03e698c344dfc557555b6b07a3ee2702e45f6ee.tar.bz2
bpo-37760: Factor out standard range-expanding logic in makeunicodedata. (GH-15248)
Much like the lower-level logic in commit ef2af1ad4, we had 4 copies of this logic, written in a couple of different ways. They're all implementing the same standard, so write it just once.
-rw-r--r-- Tools/unicode/makeunicodedata.py 68
1 file changed, 35 insertions, 33 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 38c1f19..cc2b298 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -32,7 +32,7 @@ import zipfile
from functools import partial
from textwrap import dedent
-from typing import *
+from typing import Iterator, List, Tuple
SCRIPT = sys.argv[0]
VERSION = "3.3"
@@ -904,6 +904,19 @@ def open_data(template, version):
return open(local, 'rb')
+def expand_range(char_range: str) -> Iterator[int]:
+ '''
+ Parses ranges of code points, as described in UAX #44:
+ https://www.unicode.org/reports/tr44/#Code_Point_Ranges
+ '''
+ if '..' in char_range:
+ first, last = [int(c, 16) for c in char_range.split('..')]
+ else:
+ first = last = int(char_range, 16)
+ for char in range(first, last+1):
+ yield char
+
+
class UcdFile:
'''
A file in the standard format of the UCD.
@@ -929,6 +942,12 @@ class UcdFile:
def __iter__(self) -> Iterator[List[str]]:
return self.records()
+ def expanded(self) -> Iterator[Tuple[int, List[str]]]:
+ for record in self.records():
+ char_range, rest = record[0], record[1:]
+ for char in expand_range(char_range):
+ yield char, rest
+
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
@@ -955,6 +974,9 @@ class UnicodeData:
# expand first-last ranges
field = None
for i in range(0, 0x110000):
+ # The file UnicodeData.txt has its own distinct way of
+ # expressing ranges. See:
+ # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
s = table[i]
if s:
if s[1][-6:] == "First>":
@@ -1019,14 +1041,8 @@ class UnicodeData:
self.exclusions[char] = 1
widths = [None] * 0x110000
- for s in UcdFile(EASTASIAN_WIDTH, version):
- if '..' in s[0]:
- first, last = [int(c, 16) for c in s[0].split('..')]
- chars = list(range(first, last+1))
- else:
- chars = [int(s[0], 16)]
- for char in chars:
- widths[char] = s[1]
+ for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
+ widths[char] = width
for i in range(0, 0x110000):
if table[i] is not None:
@@ -1036,26 +1052,16 @@ class UnicodeData:
if table[i] is not None:
table[i].append(set())
- for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
- if ".." in r:
- first, last = [int(c, 16) for c in r.split('..')]
- chars = list(range(first, last+1))
- else:
- chars = [int(r, 16)]
- for char in chars:
- if table[char]:
- # Some properties (e.g. Default_Ignorable_Code_Point)
- # apply to unassigned code points; ignore them
- table[char][-1].add(p)
-
- for s in UcdFile(LINE_BREAK, version):
- if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+ for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
+ if table[char]:
+ # Some properties (e.g. Default_Ignorable_Code_Point)
+ # apply to unassigned code points; ignore them
+ table[char][-1].add(p)
+
+ for char_range, value in UcdFile(LINE_BREAK, version):
+ if value not in MANDATORY_LINE_BREAKS:
continue
- if '..' not in s[0]:
- first = last = int(s[0], 16)
- else:
- first, last = [int(c, 16) for c in s[0].split('..')]
- for char in range(first, last+1):
+ for char in expand_range(char_range):
table[char][-1].add('Line_Break')
# We only want the quickcheck properties
@@ -1073,11 +1079,7 @@ class UnicodeData:
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
quickcheck_shift = qc_order.index(s[1])*2
quickcheck <<= quickcheck_shift
- if '..' not in s[0]:
- first = last = int(s[0], 16)
- else:
- first, last = [int(c, 16) for c in s[0].split('..')]
- for char in range(first, last+1):
+ for char in expand_range(s[0]):
assert not (quickchecks[char]>>quickcheck_shift)&3
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):