diff options
author | Carl Friedrich Bolz-Tereick <cfbolz@gmx.de> | 2022-08-26 16:29:39 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-08-26 16:29:39 (GMT) |
commit | 9c197bc8bfa153522927aa03ff854bbc4dce437f (patch) | |
tree | a6cc9960a08893c201a7c0da13ec17d0aba95c40 /Tools | |
parent | c1581a928cc153076593c5c433a8dd36ce10fbfb (diff) | |
download | cpython-9c197bc8bfa153522927aa03ff854bbc4dce437f.zip cpython-9c197bc8bfa153522927aa03ff854bbc4dce437f.tar.gz cpython-9c197bc8bfa153522927aa03ff854bbc4dce437f.tar.bz2 |
GH-96172 fix unicodedata.east_asian_width being wrong on unassigned code points (#96207)
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 32 |
1 files changed, 25 insertions, 7 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index f28266f..c301759 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -77,7 +77,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", "ON", "LRI", "RLI", "FSI", "PDI" ] -EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ] +# "N" needs to be the first entry, see the comment in makeunicodedata +EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ] MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] @@ -135,6 +136,14 @@ def maketables(trace=0): def makeunicodedata(unicode, trace): + # the default value of east_asian_width is "N", for unassigned code points + # not mentioned in EastAsianWidth.txt + # in addition there are some reserved but unassigned code points in CJK + # ranges that are classified as "W". code points in private use areas + # have a width of "A". both of these have entries in + # EastAsianWidth.txt + # see https://unicode.org/reports/tr11/#Unassigned + assert EASTASIANWIDTH_NAMES[0] == "N" dummy = (0, 0, 0, 0, 0, 0) table = [dummy] cache = {0: dummy} @@ -160,12 +169,20 @@ def makeunicodedata(unicode, trace): category, combining, bidirectional, mirrored, eastasianwidth, normalizationquickcheck ) - # add entry to index and item tables - i = cache.get(item) - if i is None: - cache[item] = i = len(table) - table.append(item) - index[char] = i + elif unicode.widths[char] is not None: + # an unassigned but reserved character, with a known + # east_asian_width + eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char]) + item = (0, 0, 0, 0, eastasianwidth, 0) + else: + continue + + # add entry to index and item tables + i = cache.get(item) + if i is None: + cache[item] = i = len(table) + table.append(item) + index[char] = i # 2) decomposition data @@ -1085,6 +1102,7 @@ class UnicodeData: for i in range(0, 0x110000): if table[i] is not None: table[i].east_asian_width = widths[i] + self.widths = widths for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded(): if table[char]: |