From 5cbc71e50ab6fa8c3cb0cfc64ca4bf5fb174ffcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Mon, 22 Nov 2010 09:00:02 +0000 Subject: Issue #10459: Update CJK character names to Unicode 6.0. --- Lib/test/test_ucn.py | 6 +++++- Misc/NEWS | 2 ++ Modules/unicodedata.c | 11 +++++++---- Tools/unicode/makeunicodedata.py | 21 +++++++++++++++++++-- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py index 2d48179..fd620f0 100644 --- a/Lib/test/test_ucn.py +++ b/Lib/test/test_ucn.py @@ -88,9 +88,13 @@ class UnicodeNamesTest(unittest.TestCase): self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400") self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5") self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00") - self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", "\u9fa5") + self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB") self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000") self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6") + self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700") + self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734") + self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740") + self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D") def test_bmp_characters(self): import unicodedata diff --git a/Misc/NEWS b/Misc/NEWS index f5307ef..efbae8b 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -32,6 +32,8 @@ Core and Builtins Library ------- +- Issue #10459: Update CJK character names to Unicode 6.0. + - Issue #4493: urllib.request adds '/' in front of path components which does not start with '/. Common behavior exhibited by browsers and other clients. diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 2926c35..233f8e0 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -866,13 +866,16 @@ static char *hangul_syllables[][3] = { { 0, 0, "H" } }; +/* These ranges need to match makeunicodedata.py:cjk_ranges. */ static int is_unified_ideograph(Py_UCS4 code) { - return ( - (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ - (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */ - (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */ + return + (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ + (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */ + (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */ + (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */ + (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */ } static int diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 0783f17..02b6892 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -70,6 +70,15 @@ PRINTABLE_MASK = 0x400 NODELTA_MASK = 0x800 NUMERIC_MASK = 0x1000 +# these ranges need to match unicodedata.c:is_unified_ideograph +cjk_ranges = [ + ('3400', '4DB5'), + ('4E00', '9FCB'), + ('20000', '2A6D6'), + ('2A700', '2B734'), + ('2B740', '2B81D') +] + def maketables(trace=0): print("--- Reading", UNICODE_DATA % "", "...") @@ -81,7 +90,7 @@ def maketables(trace=0): for version in old_versions: print("--- Reading", UNICODE_DATA % ("-"+version), "...") - old_unicode = UnicodeData(version) + old_unicode = UnicodeData(version, cjk_check=False) print(len(list(filter(None, old_unicode.table))), "characters") merge_old_version(version, unicode, old_unicode) @@ -804,7 +813,8 @@ class UnicodeData: def __init__(self, version, linebreakprops=False, - expand=1): + expand=1, + cjk_check=True): self.changed = [] file = open_data(UNICODE_DATA, version) table = [None] * 0x110000 @@ -816,6 +826,8 @@ class UnicodeData: char = int(s[0], 16) table[char] = s + cjk_ranges_found = [] + # expand first-last ranges if expand: field = None @@ -826,12 +838,17 @@ class UnicodeData: s[1] = "" field = s elif s[1][-5:] == "Last>": + if s[1].startswith("