From ef7fe2e8137824248cf45d316535b76dae302b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Sat, 23 Nov 2002 18:01:32 +0000 Subject: Implement names for CJK unified ideographs. Add name to KeyError output. Verify that the lookup for an existing name succeeds. --- Lib/test/output/test_ucn | 5 +++-- Lib/test/test_ucn.py | 20 ++++++++++++++++---- Misc/NEWS | 2 +- Modules/unicodedata.c | 40 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/Lib/test/output/test_ucn b/Lib/test/output/test_ucn index 1006c07..c41017b 100644 --- a/Lib/test/output/test_ucn +++ b/Lib/test/output/test_ucn @@ -2,7 +2,8 @@ test_ucn Testing General Unicode Character Name, and case insensitivity... done. Testing name to code mapping.... done. Testing hangul syllable names.... done. -Testing code to name mapping for all characters.... done. -Found 22728 characters in the unicode name database +Testing names of CJK unified ideographs.... done. +Testing code to name mapping for all BMP characters.... done. +Found 50212 characters in the unicode name database Testing misc. symbols for unicode character name expansion.... done. Testing unicode character name expansion strict error handling.... done. diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py index 6f2b022..e7b8bbd 100644 --- a/Lib/test/test_ucn.py +++ b/Lib/test/test_ucn.py @@ -80,16 +80,28 @@ else: raise AssertionError, "Found name for U+D7A4" print "done." -print "Testing code to name mapping for all characters....", +print "Testing names of CJK unified ideographs....", +exec r""" +verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400") +verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5") +verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00") +verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5") +verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000") +verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6") +""" +print "done." + +print "Testing code to name mapping for all BMP characters....", count = 0 -for code in range(65536): +for code in range(0x10000): try: char = unichr(code) name = unicodedata.name(char) - verify(unicodedata.lookup(name) == char) - count += 1 except (KeyError, ValueError): pass + else: + verify(unicodedata.lookup(name) == char) + count += 1 print "done." print "Found", count, "characters in the unicode name database" diff --git a/Misc/NEWS b/Misc/NEWS index 71da82d..8fdd1f6 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -318,7 +318,7 @@ Extension modules is now named bsddb185. - unicodedata was updated to Unicode 3.2. In now also supports names - for Hangul syllables. + for Hangul syllables and CJK unified ideographs. - resource.getrlimit() now returns longs instead of ints. diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 330b376..3620936 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -348,6 +348,16 @@ _getucname(Py_UCS4 code, char* buffer, int buflen) return 1; } + if ((0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ + (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */ + (0x20000 <= code && code <= 0x2A6D6)) {/* CJK Ideograph Extension B */ + if (buflen < 28) + /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */ + return 0; + sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code); + return 1; + } + if (code >= 0x110000) return 0; @@ -449,6 +459,30 @@ _getcode(const char* name, int namelen, Py_UCS4* code) *code = SBase + (L*VCount+V)*TCount + T; return 1; } + /* Otherwise, it's an illegal syllable name. */ + return 0; + } + + /* Check for unified ideographs. */ + if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) { + /* Four or five hexdigits must follow. */ + v = 0; + name += 22; + namelen -= 22; + if (namelen != 4 && namelen != 5) + return 0; + while (namelen--) { + v *= 16; + if (*name >= '0' && *name <= '9') + v += *name - '0'; + else if (*name >= 'A' && *name <= 'F') + v += *name - 'A' + 10; + else + return 0; + name++; + } + *code = v; + return 1; } /* the following is the same as python's dictionary lookup, with @@ -535,7 +569,11 @@ unicodedata_lookup(PyObject* self, PyObject* args) return NULL; if (!_getcode(name, namelen, &code)) { - PyErr_SetString(PyExc_KeyError, "undefined character name"); + char fmt[] = "undefined character name '%s'"; + char *buf = PyMem_MALLOC(sizeof(fmt) + namelen); + sprintf(buf, fmt, name); + PyErr_SetString(PyExc_KeyError, buf); + PyMem_FREE(buf); return NULL; } -- cgit v0.12