GH-96172 fix unicodedata.east_asian_width being wrong on unassigned code points (#96207)

author: Carl Friedrich Bolz-Tereick <cfbolz@gmx.de> 2022-08-26 16:29:39 (GMT)
committer: GitHub <noreply@github.com> 2022-08-26 16:29:39 (GMT)
commit: 9c197bc8bfa153522927aa03ff854bbc4dce437f (patch)
tree: a6cc9960a08893c201a7c0da13ec17d0aba95c40 /Tools
parent: c1581a928cc153076593c5c433a8dd36ce10fbfb (diff)
download: cpython-9c197bc8bfa153522927aa03ff854bbc4dce437f.zip
cpython-9c197bc8bfa153522927aa03ff854bbc4dce437f.tar.gz
cpython-9c197bc8bfa153522927aa03ff854bbc4dce437f.tar.bz2
1 files changed, 25 insertions, 7 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index f28266f..c301759 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -77,7 +77,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
     "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
     "ON", "LRI", "RLI", "FSI", "PDI" ]
 
-EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
+# "N" needs to be the first entry, see the comment in makeunicodedata
+EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]
 
 MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
 
@@ -135,6 +136,14 @@ def maketables(trace=0):
 
 def makeunicodedata(unicode, trace):
 
+    # the default value of east_asian_width is "N", for unassigned code points
+    # not mentioned in EastAsianWidth.txt
+    # in addition there are some reserved but unassigned code points in CJK
+    # ranges that are classified as "W". code points in private use areas
+    # have a width of "A". both of these have entries in
+    # EastAsianWidth.txt
+    # see https://unicode.org/reports/tr11/#Unassigned
+    assert EASTASIANWIDTH_NAMES[0] == "N"
     dummy = (0, 0, 0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
@@ -160,12 +169,20 @@ def makeunicodedata(unicode, trace):
                 category, combining, bidirectional, mirrored, eastasianwidth,
                 normalizationquickcheck
                 )
-            # add entry to index and item tables
-            i = cache.get(item)
-            if i is None:
-                cache[item] = i = len(table)
-                table.append(item)
-            index[char] = i
+        elif unicode.widths[char] is not None:
+            # an unassigned but reserved character, with a known
+            # east_asian_width
+            eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
+            item = (0, 0, 0, 0, eastasianwidth, 0)
+        else:
+            continue
+
+        # add entry to index and item tables
+        i = cache.get(item)
+        if i is None:
+            cache[item] = i = len(table)
+            table.append(item)
+        index[char] = i
 
     # 2) decomposition data
 
@@ -1085,6 +1102,7 @@ class UnicodeData:
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].east_asian_width = widths[i]
+        self.widths = widths
 
         for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
             if table[char]:
author	Carl Friedrich Bolz-Tereick <cfbolz@gmx.de>	2022-08-26 16:29:39 (GMT)
committer	GitHub <noreply@github.com>	2022-08-26 16:29:39 (GMT)
commit	9c197bc8bfa153522927aa03ff854bbc4dce437f (patch)
tree	a6cc9960a08893c201a7c0da13ec17d0aba95c40 /Tools
parent	c1581a928cc153076593c5c433a8dd36ce10fbfb (diff)
download	cpython-9c197bc8bfa153522927aa03ff854bbc4dce437f.zip cpython-9c197bc8bfa153522927aa03ff854bbc4dce437f.tar.gz cpython-9c197bc8bfa153522927aa03ff854bbc4dce437f.tar.bz2