1 files changed, 109 insertions, 133 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 5b9427a..1be93ec 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -30,8 +30,9 @@ import os
 import sys
 import zipfile
 
-from textwrap import dedent
 from functools import partial
+from textwrap import dedent
+from typing import *
 
 SCRIPT = sys.argv[0]
 VERSION = "3.3"
@@ -903,6 +904,32 @@ def open_data(template, version):
         return open(local, 'rb')
 
 
+class UcdFile:
+    '''
+    A file in the standard format of the UCD.
+
+    See: https://www.unicode.org/reports/tr44/#Format_Conventions
+
+    Note that, as described there, the Unihan data files have their
+    own separate format.
+    '''
+
+    def __init__(self, template: str, version: str) -> None:
+        self.template = template
+        self.version = version
+
+    def records(self) -> Iterator[List[str]]:
+        with open_data(self.template, self.version) as file:
+            for line in file:
+                line = line.split('#', 1)[0].strip()
+                if not line:
+                    continue
+                yield [field.strip() for field in line.split(';')]
+
+    def __iter__(self) -> Iterator[List[str]]:
+        return self.records()
+
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
@@ -922,14 +949,9 @@ class UnicodeData:
                  cjk_check=True):
         self.changed = []
         table = [None] * 0x110000
-        with open_data(UNICODE_DATA, version) as file:
-            while 1:
-                s = file.readline()
-                if not s:
-                    break
-                s = s.strip().split(";")
-                char = int(s[0], 16)
-                table[char] = s
+        for s in UcdFile(UNICODE_DATA, version):
+            char = int(s[0], 16)
+            table[char] = s
 
         cjk_ranges_found = []
 
@@ -968,17 +990,12 @@ class UnicodeData:
             # in order to take advantage of the compression and lookup
             # algorithms used for the other characters
             pua_index = NAME_ALIASES_START
-            with open_data(NAME_ALIASES, version) as file:
-                for s in file:
-                    s = s.strip()
-                    if not s or s.startswith('#'):
-                        continue
-                    char, name, abbrev = s.split(';')
-                    char = int(char, 16)
-                    self.aliases.append((name, char))
-                    # also store the name in the PUA 1
-                    self.table[pua_index][1] = name
-                    pua_index += 1
+            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
+                char = int(char, 16)
+                self.aliases.append((name, char))
+                # also store the name in the PUA 1
+                self.table[pua_index][1] = name
+                pua_index += 1
             assert pua_index - NAME_ALIASES_START == len(self.aliases)
 
             self.named_sequences = []
@@ -988,50 +1005,32 @@ class UnicodeData:
 
             assert pua_index < NAMED_SEQUENCES_START
             pua_index = NAMED_SEQUENCES_START
-            with open_data(NAMED_SEQUENCES, version) as file:
-                for s in file:
-                    s = s.strip()
-                    if not s or s.startswith('#'):
-                        continue
-                    name, chars = s.split(';')
-                    chars = tuple(int(char, 16) for char in chars.split())
-                    # check that the structure defined in makeunicodename is OK
-                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
-                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
-                        "the NamedSequence struct and in unicodedata_lookup")
-                    self.named_sequences.append((name, chars))
-                    # also store these in the PUA 1
-                    self.table[pua_index][1] = name
-                    pua_index += 1
+            for name, chars in UcdFile(NAMED_SEQUENCES, version):
+                chars = tuple(int(char, 16) for char in chars.split())
+                # check that the structure defined in makeunicodename is OK
+                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
+                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
+                    "the NamedSequence struct and in unicodedata_lookup")
+                self.named_sequences.append((name, chars))
+                # also store these in the PUA 1
+                self.table[pua_index][1] = name
+                pua_index += 1
             assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
 
         self.exclusions = {}
-        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
-            for s in file:
-                s = s.strip()
-                if not s:
-                    continue
-                if s[0] == '#':
-                    continue
-                char = int(s.split()[0],16)
-                self.exclusions[char] = 1
+        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
+            char = int(char, 16)
+            self.exclusions[char] = 1
 
         widths = [None] * 0x110000
-        with open_data(EASTASIAN_WIDTH, version) as file:
-            for s in file:
-                s = s.strip()
-                if not s:
-                    continue
-                if s[0] == '#':
-                    continue
-                s = s.split()[0].split(';')
-                if '..' in s[0]:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                    chars = list(range(first, last+1))
-                else:
-                    chars = [int(s[0], 16)]
-                for char in chars:
-                    widths[char] = s[1]
+        for s in UcdFile(EASTASIAN_WIDTH, version):
+            if '..' in s[0]:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+                chars = list(range(first, last+1))
+            else:
+                chars = [int(s[0], 16)]
+            for char in chars:
+                widths[char] = s[1]
 
         for i in range(0, 0x110000):
             if table[i] is not None:
@@ -1041,38 +1040,27 @@ class UnicodeData:
             if table[i] is not None:
                 table[i].append(set())
 
-        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
-            for s in file:
-                s = s.split('#', 1)[0].strip()
-                if not s:
-                    continue
-
-                r, p = s.split(";")
-                r = r.strip()
-                p = p.strip()
-                if ".." in r:
-                    first, last = [int(c, 16) for c in r.split('..')]
-                    chars = list(range(first, last+1))
-                else:
-                    chars = [int(r, 16)]
-                for char in chars:
-                    if table[char]:
-                        # Some properties (e.g. Default_Ignorable_Code_Point)
-                        # apply to unassigned code points; ignore them
-                        table[char][-1].add(p)
-
-        with open_data(LINE_BREAK, version) as file:
-            for s in file:
-                s = s.partition('#')[0]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
-                    continue
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    table[char][-1].add('Line_Break')
+        for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
+            if ".." in r:
+                first, last = [int(c, 16) for c in r.split('..')]
+                chars = list(range(first, last+1))
+            else:
+                chars = [int(r, 16)]
+            for char in chars:
+                if table[char]:
+                    # Some properties (e.g. Default_Ignorable_Code_Point)
+                    # apply to unassigned code points; ignore them
+                    table[char][-1].add(p)
+
+        for s in UcdFile(LINE_BREAK, version):
+            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                continue
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                table[char][-1].add('Line_Break')
 
         # We only want the quickcheck properties
         # Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -1083,23 +1071,19 @@ class UnicodeData:
         # for older versions, and no delta records will be created.
         quickchecks = [0] * 0x110000
         qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
-            for s in file:
-                if '#' in s:
-                    s = s[:s.index('#')]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in qc_order:
-                    continue
-                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
-                quickcheck_shift = qc_order.index(s[1])*2
-                quickcheck <<= quickcheck_shift
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    assert not (quickchecks[char]>>quickcheck_shift)&3
-                    quickchecks[char] |= quickcheck
+        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
+            if len(s) < 2 or s[1] not in qc_order:
+                continue
+            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+            quickcheck_shift = qc_order.index(s[1])*2
+            quickcheck <<= quickcheck_shift
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                assert not (quickchecks[char]>>quickcheck_shift)&3
+                quickchecks[char] |= quickcheck
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(quickchecks[i])
@@ -1122,34 +1106,26 @@ class UnicodeData:
             # Patch the numeric field
             if table[i] is not None:
                 table[i][8] = value
+
         sc = self.special_casing = {}
-        with open_data(SPECIAL_CASING, version) as file:
-            for s in file:
-                s = s[:-1].split('#', 1)[0]
-                if not s:
-                    continue
-                data = s.split("; ")
-                if data[4]:
-                    # We ignore all conditionals (since they depend on
-                    # languages) except for one, which is hardcoded. See
-                    # handle_capital_sigma in unicodeobject.c.
-                    continue
-                c = int(data[0], 16)
-                lower = [int(char, 16) for char in data[1].split()]
-                title = [int(char, 16) for char in data[2].split()]
-                upper = [int(char, 16) for char in data[3].split()]
-                sc[c] = (lower, title, upper)
+        for data in UcdFile(SPECIAL_CASING, version):
+            if data[4]:
+                # We ignore all conditionals (since they depend on
+                # languages) except for one, which is hardcoded. See
+                # handle_capital_sigma in unicodeobject.c.
+                continue
+            c = int(data[0], 16)
+            lower = [int(char, 16) for char in data[1].split()]
+            title = [int(char, 16) for char in data[2].split()]
+            upper = [int(char, 16) for char in data[3].split()]
+            sc[c] = (lower, title, upper)
+
         cf = self.case_folding = {}
         if version != '3.2.0':
-            with open_data(CASE_FOLDING, version) as file:
-                for s in file:
-                    s = s[:-1].split('#', 1)[0]
-                    if not s:
-                        continue
-                    data = s.split("; ")
-                    if data[1] in "CF":
-                        c = int(data[0], 16)
-                        cf[c] = [int(char, 16) for char in data[2].split()]
+            for data in UcdFile(CASE_FOLDING, version):
+                if data[1] in "CF":
+                    c = int(data[0], 16)
+                    cf[c] = [int(char, 16) for char in data[2].split()]
 
     def uselatin1(self):
         # restrict character range to ISO Latin 1