summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de> 2002-11-23 18:01:32 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2002-11-23 18:01:32 (GMT)
commit: ef7fe2e8137824248cf45d316535b76dae302b5a (patch)
tree: 01a0b67fae5af758f8c8cc18459266756f4e1074
parent: 8579efc86c18b7b824ec080582f032674e2f8a5e (diff)
download: cpython-ef7fe2e8137824248cf45d316535b76dae302b5a.zip
cpython-ef7fe2e8137824248cf45d316535b76dae302b5a.tar.gz
cpython-ef7fe2e8137824248cf45d316535b76dae302b5a.tar.bz2
Implement names for CJK unified ideographs. Add name to KeyError output.
Verify that the lookup for an existing name succeeds.
-rw-r--r-- Lib/test/output/test_ucn 5
-rw-r--r-- Lib/test/test_ucn.py 20
-rw-r--r-- Misc/NEWS 2
-rw-r--r-- Modules/unicodedata.c 40
4 files changed, 59 insertions, 8 deletions
diff --git a/Lib/test/output/test_ucn b/Lib/test/output/test_ucn
index 1006c07..c41017b 100644
--- a/Lib/test/output/test_ucn
+++ b/Lib/test/output/test_ucn
@@ -2,7 +2,8 @@ test_ucn
Testing General Unicode Character Name, and case insensitivity... done.
Testing name to code mapping.... done.
Testing hangul syllable names.... done.
-Testing code to name mapping for all characters.... done.
-Found 22728 characters in the unicode name database
+Testing names of CJK unified ideographs.... done.
+Testing code to name mapping for all BMP characters.... done.
+Found 50212 characters in the unicode name database
Testing misc. symbols for unicode character name expansion.... done.
Testing unicode character name expansion strict error handling.... done.
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
index 6f2b022..e7b8bbd 100644
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -80,16 +80,28 @@ else:
raise AssertionError, "Found name for U+D7A4"
print "done."
-print "Testing code to name mapping for all characters....",
+print "Testing names of CJK unified ideographs....",
+exec r"""
+verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000")
+verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6")
+"""
+print "done."
+
+print "Testing code to name mapping for all BMP characters....",
count = 0
-for code in range(65536):
+for code in range(0x10000):
try:
char = unichr(code)
name = unicodedata.name(char)
- verify(unicodedata.lookup(name) == char)
- count += 1
except (KeyError, ValueError):
pass
+ else:
+ verify(unicodedata.lookup(name) == char)
+ count += 1
print "done."
print "Found", count, "characters in the unicode name database"
diff --git a/Misc/NEWS b/Misc/NEWS
index 71da82d..8fdd1f6 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -318,7 +318,7 @@ Extension modules
is now named bsddb185.
- unicodedata was updated to Unicode 3.2. In now also supports names
- for Hangul syllables.
+ for Hangul syllables and CJK unified ideographs.
- resource.getrlimit() now returns longs instead of ints.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 330b376..3620936 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -348,6 +348,16 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
return 1;
}
+ if ((0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
+ (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
+ (0x20000 <= code && code <= 0x2A6D6)) {/* CJK Ideograph Extension B */
+ if (buflen < 28)
+ /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
+ return 0;
+ sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
+ return 1;
+ }
+
if (code >= 0x110000)
return 0;
@@ -449,6 +459,30 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
*code = SBase + (L*VCount+V)*TCount + T;
return 1;
}
+ /* Otherwise, it's an illegal syllable name. */
+ return 0;
+ }
+
+ /* Check for unified ideographs. */
+ if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
+ /* Four or five hexdigits must follow. */
+ v = 0;
+ name += 22;
+ namelen -= 22;
+ if (namelen != 4 && namelen != 5)
+ return 0;
+ while (namelen--) {
+ v *= 16;
+ if (*name >= '0' && *name <= '9')
+ v += *name - '0';
+ else if (*name >= 'A' && *name <= 'F')
+ v += *name - 'A' + 10;
+ else
+ return 0;
+ name++;
+ }
+ *code = v;
+ return 1;
}
/* the following is the same as python's dictionary lookup, with
@@ -535,7 +569,11 @@ unicodedata_lookup(PyObject* self, PyObject* args)
return NULL;
if (!_getcode(name, namelen, &code)) {
- PyErr_SetString(PyExc_KeyError, "undefined character name");
+ char fmt[] = "undefined character name '%s'";
+ char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
+ sprintf(buf, fmt, name);
+ PyErr_SetString(PyExc_KeyError, buf);
+ PyMem_FREE(buf);
return NULL;
}