diff options
-rw-r--r-- | Lib/test/test_ucn.py | 15 | ||||
-rw-r--r-- | Modules/ucnhash.c | 196 | ||||
-rw-r--r-- | Modules/unicodedata.c | 245 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 20 |
4 files changed, 248 insertions, 228 deletions
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py index 0797f2c..f7d3ce4 100644 --- a/Lib/test/test_ucn.py +++ b/Lib/test/test_ucn.py @@ -1,6 +1,7 @@ """ Test script for the Unicode implementation. Written by Bill Tutt. +Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. @@ -46,23 +47,24 @@ except UnicodeError, v: print v print "done." -import ucnhash +import unicodedata print "Testing name to code mapping....", for char in "SPAM": name = "LATIN SMALL LETTER %s" % char - code = ucnhash.getcode(name) - verify(ucnhash.getname(code) == name) + code = unicodedata.lookup(name) + verify(unicodedata.name(code) == name) print "done." print "Testing code to name mapping for all characters....", count = 0 for code in range(65536): try: - name = ucnhash.getname(code) - verify(ucnhash.getcode(name) == code) + char = unichr(code) + name = unicodedata.name(char) + verify(unicodedata.lookup(name) == char) count += 1 - except ValueError: + except (KeyError, ValueError): pass print "done." @@ -78,7 +80,6 @@ verify(u"\N{FULLWIDTH LATIN SMALL LETTER A}" == u"\uFF41") """ print "done." - # strict error testing: print "Testing unicode character name expansion strict error handling....", try: diff --git a/Modules/ucnhash.c b/Modules/ucnhash.c index bdcdab1..424b6c5 100644 --- a/Modules/ucnhash.c +++ b/Modules/ucnhash.c @@ -1,212 +1,22 @@ -/* unicode character name tables */ -/* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */ +/* obsolete -- remove this file! */ #include "Python.h" -#include "ucnhash.h" - -/* data file generated by Tools/unicode/makeunicodedata.py */ -#include "unicodename_db.h" - -/* -------------------------------------------------------------------- */ -/* database code (cut and pasted from the unidb package) */ - -static unsigned long -gethash(const char *s, int len, int scale) -{ - int i; - unsigned long h = 0; - unsigned long ix; - for (i = 0; i < len; i++) { - h = (h * scale) + (unsigned char) toupper(s[i]); - ix = h & 0xff000000; - if (ix) - h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; - } - return h; -} - -static int -getname(Py_UCS4 code, char* buffer, int buflen) -{ - int offset; - int i; - int word; - unsigned char* w; - - if (code < 0 || code >= 65536) - return 0; - - /* get offset into phrasebook */ - offset = phrasebook_offset1[(code>>phrasebook_shift)]; - offset = phrasebook_offset2[(offset<<phrasebook_shift) + - (code&((1<<phrasebook_shift)-1))]; - if (!offset) - return 0; - - i = 0; - - for (;;) { - /* get word index */ - word = phrasebook[offset] - phrasebook_short; - if (word >= 0) { - word = (word << 8) + phrasebook[offset+1]; - offset += 2; - } else - word = phrasebook[offset++]; - if (i) { - if (i > buflen) - return 0; /* buffer overflow */ - buffer[i++] = ' '; - } - /* copy word string from lexicon. the last character in the - word has bit 7 set. the last word in a string ends with - 0x80 */ - w = lexicon + lexicon_offset[word]; - while (*w < 128) { - if (i >= buflen) - return 0; /* buffer overflow */ - buffer[i++] = *w++; - } - if (i >= buflen) - return 0; /* buffer overflow */ - buffer[i++] = *w & 127; - if (*w == 128) - break; /* end of word */ - } - - return 1; -} - -static int -cmpname(int code, const char* name, int namelen) -{ - /* check if code corresponds to the given name */ - int i; - char buffer[NAME_MAXLEN]; - if (!getname(code, buffer, sizeof(buffer))) - return 0; - for (i = 0; i < namelen; i++) { - if (toupper(name[i]) != buffer[i]) - return 0; - } - return buffer[namelen] == '\0'; -} - -static int -getcode(const char* name, int namelen, Py_UCS4* code) -{ - unsigned int h, v; - unsigned int mask = code_size-1; - unsigned int i, incr; - - /* the following is the same as python's dictionary lookup, with - only minor changes. see the makeunicodedata script for more - details */ - - h = (unsigned int) gethash(name, namelen, code_magic); - i = (~h) & mask; - v = code_hash[i]; - if (!v) - return 0; - if (cmpname(v, name, namelen)) { - *code = v; - return 1; - } - incr = (h ^ (h >> 3)) & mask; - if (!incr) - incr = mask; - for (;;) { - i = (i + incr) & mask; - v = code_hash[i]; - if (!v) - return -1; - if (cmpname(v, name, namelen)) { - *code = v; - return 1; - } - incr = incr << 1; - if (incr > mask) - incr = incr ^ code_poly; - } -} - -static const _PyUnicode_Name_CAPI hashAPI = -{ - sizeof(_PyUnicode_Name_CAPI), - getname, - getcode -}; - -/* -------------------------------------------------------------------- */ -/* Python bindings */ - -static PyObject * -ucnhash_getname(PyObject* self, PyObject* args) -{ - char name[NAME_MAXLEN]; - - int code; - if (!PyArg_ParseTuple(args, "i", &code)) - return NULL; - - if (!getname((Py_UCS4) code, name, sizeof(name))) { - PyErr_SetString(PyExc_ValueError, "undefined character code"); - return NULL; - } - - return Py_BuildValue("s", name); -} - -static PyObject * -ucnhash_getcode(PyObject* self, PyObject* args) -{ - Py_UCS4 code; - - char* name; - int namelen; - if (!PyArg_ParseTuple(args, "s#", &name, &namelen)) - return NULL; - - if (!getcode(name, namelen, &code)) { - PyErr_SetString(PyExc_ValueError, "undefined character name"); - return NULL; - } - - return Py_BuildValue("i", code); -} static PyMethodDef ucnhash_methods[] = { - {"getname", ucnhash_getname, 1}, - {"getcode", ucnhash_getcode, 1}, {NULL, NULL}, }; -static char *ucnhash_docstring = "ucnhash hash function module"; - +static char *ucnhash_docstring = "ucnhash hash function module (obsolete)"; -/* Create PyMethodObjects and register them in the module's dict */ DL_EXPORT(void) initucnhash(void) { - PyObject *m, *d, *v; - - m = Py_InitModule4( + Py_InitModule4( "ucnhash", /* Module name */ ucnhash_methods, /* Method list */ ucnhash_docstring, /* Module doc-string */ (PyObject *)NULL, /* always pass this as *self */ PYTHON_API_VERSION); /* API Version */ - if (!m) - return; - - d = PyModule_GetDict(m); - if (!d) - return; - - /* Export C API */ - v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); - PyDict_SetItemString(d, "Unicode_Names_CAPI", v); - Py_XDECREF(v); } diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 06e5f04..dfe2f7b 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -12,6 +12,9 @@ ------------------------------------------------------------------------ */ #include "Python.h" +#include "ucnhash.h" + +/* character properties */ typedef struct { const unsigned char category; /* index into @@ -52,8 +55,7 @@ unicodedata_decimal(PyObject *self, PyObject *args) PyObject *defobj = NULL; long rc; - if (!PyArg_ParseTuple(args, "O!|O:decimal", - &PyUnicode_Type, &v, &defobj)) + if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, @@ -82,8 +84,7 @@ unicodedata_digit(PyObject *self, PyObject *args) PyObject *defobj = NULL; long rc; - if (!PyArg_ParseTuple(args, "O!|O:digit", - &PyUnicode_Type, &v, &defobj)) + if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj)) return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, @@ -93,8 +94,7 @@ unicodedata_digit(PyObject *self, PyObject *args) rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v)); if (rc < 0) { if (defobj == NULL) { - PyErr_SetString(PyExc_ValueError, - "not a digit"); + PyErr_SetString(PyExc_ValueError, "not a digit"); return NULL; } else { @@ -112,8 +112,7 @@ unicodedata_numeric(PyObject *self, PyObject *args) PyObject *defobj = NULL; double rc; - if (!PyArg_ParseTuple(args, "O!|O:numeric", - &PyUnicode_Type, &v, &defobj)) + if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, @@ -123,8 +122,7 @@ unicodedata_numeric(PyObject *self, PyObject *args) rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); if (rc < 0) { if (defobj == NULL) { - PyErr_SetString(PyExc_ValueError, - "not a numeric character"); + PyErr_SetString(PyExc_ValueError, "not a numeric character"); return NULL; } else { @@ -252,22 +250,231 @@ unicodedata_decomposition(PyObject *self, PyObject *args) return PyString_FromString(decomp); } +/* -------------------------------------------------------------------- */ +/* unicode character name tables */ + +/* data file generated by Tools/unicode/makeunicodedata.py */ +#include "unicodename_db.h" + +/* -------------------------------------------------------------------- */ +/* database code (cut and pasted from the unidb package) */ + +static unsigned long +gethash(const char *s, int len, int scale) +{ + int i; + unsigned long h = 0; + unsigned long ix; + for (i = 0; i < len; i++) { + h = (h * scale) + (unsigned char) toupper(s[i]); + ix = h & 0xff000000; + if (ix) + h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; + } + return h; +} + +static int +getname(Py_UCS4 code, char* buffer, int buflen) +{ + int offset; + int i; + int word; + unsigned char* w; + + if (code < 0 || code >= 65536) + return 0; + + /* get offset into phrasebook */ + offset = phrasebook_offset1[(code>>phrasebook_shift)]; + offset = phrasebook_offset2[(offset<<phrasebook_shift) + + (code&((1<<phrasebook_shift)-1))]; + if (!offset) + return 0; + + i = 0; + + for (;;) { + /* get word index */ + word = phrasebook[offset] - phrasebook_short; + if (word >= 0) { + word = (word << 8) + phrasebook[offset+1]; + offset += 2; + } else + word = phrasebook[offset++]; + if (i) { + if (i > buflen) + return 0; /* buffer overflow */ + buffer[i++] = ' '; + } + /* copy word string from lexicon. the last character in the + word has bit 7 set. the last word in a string ends with + 0x80 */ + w = lexicon + lexicon_offset[word]; + while (*w < 128) { + if (i >= buflen) + return 0; /* buffer overflow */ + buffer[i++] = *w++; + } + if (i >= buflen) + return 0; /* buffer overflow */ + buffer[i++] = *w & 127; + if (*w == 128) + break; /* end of word */ + } + + return 1; +} + +static int +cmpname(int code, const char* name, int namelen) +{ + /* check if code corresponds to the given name */ + int i; + char buffer[NAME_MAXLEN]; + if (!getname(code, buffer, sizeof(buffer))) + return 0; + for (i = 0; i < namelen; i++) { + if (toupper(name[i]) != buffer[i]) + return 0; + } + return buffer[namelen] == '\0'; +} + +static int +getcode(const char* name, int namelen, Py_UCS4* code) +{ + unsigned int h, v; + unsigned int mask = code_size-1; + unsigned int i, incr; + + /* the following is the same as python's dictionary lookup, with + only minor changes. see the makeunicodedata script for more + details */ + + h = (unsigned int) gethash(name, namelen, code_magic); + i = (~h) & mask; + v = code_hash[i]; + if (!v) + return 0; + if (cmpname(v, name, namelen)) { + *code = v; + return 1; + } + incr = (h ^ (h >> 3)) & mask; + if (!incr) + incr = mask; + for (;;) { + i = (i + incr) & mask; + v = code_hash[i]; + if (!v) + return -1; + if (cmpname(v, name, namelen)) { + *code = v; + return 1; + } + incr = incr << 1; + if (incr > mask) + incr = incr ^ code_poly; + } +} + +static const _PyUnicode_Name_CAPI hashAPI = +{ + sizeof(_PyUnicode_Name_CAPI), + getname, + getcode +}; + +/* -------------------------------------------------------------------- */ +/* Python bindings */ + +static PyObject * +unicodedata_name(PyObject* self, PyObject* args) +{ + char name[NAME_MAXLEN]; + + PyUnicodeObject* v; + PyObject* defobj = NULL; + if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj)) + return NULL; + + if (PyUnicode_GET_SIZE(v) != 1) { + PyErr_SetString(PyExc_TypeError, + "need a single Unicode character as parameter"); + return NULL; + } + + if (!getname((Py_UCS4) *PyUnicode_AS_UNICODE(v), name, sizeof(name))) { + if (defobj == NULL) { + PyErr_SetString(PyExc_ValueError, "no such name"); + return NULL; + } + else { + Py_INCREF(defobj); + return defobj; + } + } + + return Py_BuildValue("s", name); +} + +static PyObject * +unicodedata_lookup(PyObject* self, PyObject* args) +{ + Py_UCS4 code; + Py_UNICODE str[1]; + + char* name; + int namelen; + if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) + return NULL; + + if (!getcode(name, namelen, &code)) { + PyErr_SetString(PyExc_KeyError, "undefined character name"); + return NULL; + } + + str[0] = (Py_UNICODE) code; + return PyUnicode_FromUnicode(str, 1); +} + /* XXX Add doc strings. */ static PyMethodDef unicodedata_functions[] = { - {"decimal", unicodedata_decimal, 1}, - {"digit", unicodedata_digit, 1}, - {"numeric", unicodedata_numeric, 1}, - {"category", unicodedata_category, 1}, - {"bidirectional", unicodedata_bidirectional, 1}, - {"combining", unicodedata_combining, 1}, - {"mirrored", unicodedata_mirrored, 1}, - {"decomposition", unicodedata_decomposition, 1}, + {"decimal", unicodedata_decimal, METH_VARARGS}, + {"digit", unicodedata_digit, METH_VARARGS}, + {"numeric", unicodedata_numeric, METH_VARARGS}, + {"category", unicodedata_category, METH_VARARGS}, + {"bidirectional", unicodedata_bidirectional, METH_VARARGS}, + {"combining", unicodedata_combining, METH_VARARGS}, + {"mirrored", unicodedata_mirrored, METH_VARARGS}, + {"decomposition",unicodedata_decomposition, METH_VARARGS}, + {"name", unicodedata_name, METH_VARARGS}, + {"lookup", unicodedata_lookup, METH_VARARGS}, {NULL, NULL} /* sentinel */ }; +static char *unicodedata_docstring = "unicode character database"; + DL_EXPORT(void) initunicodedata(void) { - Py_InitModule("unicodedata", unicodedata_functions); + PyObject *m, *d, *v; + + m = Py_InitModule4( + "unicodedata", unicodedata_functions, + unicodedata_docstring, NULL, PYTHON_API_VERSION); + if (!m) + return; + + d = PyModule_GetDict(m); + if (!d) + return; + + /* Export C API */ + v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); + PyDict_SetItemString(d, "ucnhash_CAPI", v); + Py_XDECREF(v); + } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 585afe6..39ea071 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1103,7 +1103,7 @@ int unicodeescape_decoding_error(const char **source, } } -static _PyUnicode_Name_CAPI *unicode_names = NULL; +static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, int size, @@ -1236,18 +1236,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, /* Ok, we need to deal with Unicode Character Names now, * make sure we've imported the hash table data... */ - if (unicode_names == NULL) { + if (ucnhash_CAPI == NULL) { PyObject *mod = 0, *v = 0; - mod = PyImport_ImportModule("ucnhash"); + mod = PyImport_ImportModule("unicodedata"); if (mod == NULL) goto ucnhashError; - v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI"); + v = PyObject_GetAttrString(mod,"ucnhash_CAPI"); Py_DECREF(mod); if (v == NULL) goto ucnhashError; - unicode_names = PyCObject_AsVoidPtr(v); + ucnhash_CAPI = PyCObject_AsVoidPtr(v); Py_DECREF(v); - if (unicode_names == NULL) + if (ucnhash_CAPI == NULL) goto ucnhashError; } @@ -1259,7 +1259,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, while (*endBrace != '}' && endBrace < end) endBrace++; if (endBrace != end && *endBrace == '}') { - if (!unicode_names->getcode(start, endBrace-start, &chr)) { + if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) { if (unicodeescape_decoding_error( &s, &x, errors, "Invalid Unicode Character Name") @@ -1312,8 +1312,10 @@ store: return (PyObject *)v; ucnhashError: - PyErr_SetString(PyExc_UnicodeError, - "\\N escapes not supported (can't load ucnhash module)"); + PyErr_SetString( + PyExc_UnicodeError, + "\\N escapes not supported (can't load unicodedata module)" + ); return NULL; onError: |