diff options
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r-- | Modules/unicodedata.c | 245 |
1 files changed, 226 insertions, 19 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 06e5f04..dfe2f7b 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -12,6 +12,9 @@ ------------------------------------------------------------------------ */ #include "Python.h" +#include "ucnhash.h" + +/* character properties */ typedef struct { const unsigned char category; /* index into @@ -52,8 +55,7 @@ unicodedata_decimal(PyObject *self, PyObject *args) PyObject *defobj = NULL; long rc; - if (!PyArg_ParseTuple(args, "O!|O:decimal", - &PyUnicode_Type, &v, &defobj)) + if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, @@ -82,8 +84,7 @@ unicodedata_digit(PyObject *self, PyObject *args) PyObject *defobj = NULL; long rc; - if (!PyArg_ParseTuple(args, "O!|O:digit", - &PyUnicode_Type, &v, &defobj)) + if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj)) return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, @@ -93,8 +94,7 @@ unicodedata_digit(PyObject *self, PyObject *args) rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v)); if (rc < 0) { if (defobj == NULL) { - PyErr_SetString(PyExc_ValueError, - "not a digit"); + PyErr_SetString(PyExc_ValueError, "not a digit"); return NULL; } else { @@ -112,8 +112,7 @@ unicodedata_numeric(PyObject *self, PyObject *args) PyObject *defobj = NULL; double rc; - if (!PyArg_ParseTuple(args, "O!|O:numeric", - &PyUnicode_Type, &v, &defobj)) + if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, @@ -123,8 +122,7 @@ unicodedata_numeric(PyObject *self, PyObject *args) rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); if (rc < 0) { if (defobj == NULL) { - PyErr_SetString(PyExc_ValueError, - "not a numeric character"); + PyErr_SetString(PyExc_ValueError, "not a numeric character"); return NULL; } else { @@ -252,22 +250,231 @@ unicodedata_decomposition(PyObject *self, PyObject *args) return PyString_FromString(decomp); } +/* -------------------------------------------------------------------- */ +/* unicode character name tables */ + +/* data file generated by Tools/unicode/makeunicodedata.py */ +#include "unicodename_db.h" + +/* -------------------------------------------------------------------- */ +/* database code (cut and pasted from the unidb package) */ + +static unsigned long +gethash(const char *s, int len, int scale) +{ + int i; + unsigned long h = 0; + unsigned long ix; + for (i = 0; i < len; i++) { + h = (h * scale) + (unsigned char) toupper(s[i]); + ix = h & 0xff000000; + if (ix) + h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; + } + return h; +} + +static int +getname(Py_UCS4 code, char* buffer, int buflen) +{ + int offset; + int i; + int word; + unsigned char* w; + + if (code < 0 || code >= 65536) + return 0; + + /* get offset into phrasebook */ + offset = phrasebook_offset1[(code>>phrasebook_shift)]; + offset = phrasebook_offset2[(offset<<phrasebook_shift) + + (code&((1<<phrasebook_shift)-1))]; + if (!offset) + return 0; + + i = 0; + + for (;;) { + /* get word index */ + word = phrasebook[offset] - phrasebook_short; + if (word >= 0) { + word = (word << 8) + phrasebook[offset+1]; + offset += 2; + } else + word = phrasebook[offset++]; + if (i) { + if (i > buflen) + return 0; /* buffer overflow */ + buffer[i++] = ' '; + } + /* copy word string from lexicon. the last character in the + word has bit 7 set. the last word in a string ends with + 0x80 */ + w = lexicon + lexicon_offset[word]; + while (*w < 128) { + if (i >= buflen) + return 0; /* buffer overflow */ + buffer[i++] = *w++; + } + if (i >= buflen) + return 0; /* buffer overflow */ + buffer[i++] = *w & 127; + if (*w == 128) + break; /* end of word */ + } + + return 1; +} + +static int +cmpname(int code, const char* name, int namelen) +{ + /* check if code corresponds to the given name */ + int i; + char buffer[NAME_MAXLEN]; + if (!getname(code, buffer, sizeof(buffer))) + return 0; + for (i = 0; i < namelen; i++) { + if (toupper(name[i]) != buffer[i]) + return 0; + } + return buffer[namelen] == '\0'; +} + +static int +getcode(const char* name, int namelen, Py_UCS4* code) +{ + unsigned int h, v; + unsigned int mask = code_size-1; + unsigned int i, incr; + + /* the following is the same as python's dictionary lookup, with + only minor changes. see the makeunicodedata script for more + details */ + + h = (unsigned int) gethash(name, namelen, code_magic); + i = (~h) & mask; + v = code_hash[i]; + if (!v) + return 0; + if (cmpname(v, name, namelen)) { + *code = v; + return 1; + } + incr = (h ^ (h >> 3)) & mask; + if (!incr) + incr = mask; + for (;;) { + i = (i + incr) & mask; + v = code_hash[i]; + if (!v) + return -1; + if (cmpname(v, name, namelen)) { + *code = v; + return 1; + } + incr = incr << 1; + if (incr > mask) + incr = incr ^ code_poly; + } +} + +static const _PyUnicode_Name_CAPI hashAPI = +{ + sizeof(_PyUnicode_Name_CAPI), + getname, + getcode +}; + +/* -------------------------------------------------------------------- */ +/* Python bindings */ + +static PyObject * +unicodedata_name(PyObject* self, PyObject* args) +{ + char name[NAME_MAXLEN]; + + PyUnicodeObject* v; + PyObject* defobj = NULL; + if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj)) + return NULL; + + if (PyUnicode_GET_SIZE(v) != 1) { + PyErr_SetString(PyExc_TypeError, + "need a single Unicode character as parameter"); + return NULL; + } + + if (!getname((Py_UCS4) *PyUnicode_AS_UNICODE(v), name, sizeof(name))) { + if (defobj == NULL) { + PyErr_SetString(PyExc_ValueError, "no such name"); + return NULL; + } + else { + Py_INCREF(defobj); + return defobj; + } + } + + return Py_BuildValue("s", name); +} + +static PyObject * +unicodedata_lookup(PyObject* self, PyObject* args) +{ + Py_UCS4 code; + Py_UNICODE str[1]; + + char* name; + int namelen; + if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) + return NULL; + + if (!getcode(name, namelen, &code)) { + PyErr_SetString(PyExc_KeyError, "undefined character name"); + return NULL; + } + + str[0] = (Py_UNICODE) code; + return PyUnicode_FromUnicode(str, 1); +} + /* XXX Add doc strings. */ static PyMethodDef unicodedata_functions[] = { - {"decimal", unicodedata_decimal, 1}, - {"digit", unicodedata_digit, 1}, - {"numeric", unicodedata_numeric, 1}, - {"category", unicodedata_category, 1}, - {"bidirectional", unicodedata_bidirectional, 1}, - {"combining", unicodedata_combining, 1}, - {"mirrored", unicodedata_mirrored, 1}, - {"decomposition", unicodedata_decomposition, 1}, + {"decimal", unicodedata_decimal, METH_VARARGS}, + {"digit", unicodedata_digit, METH_VARARGS}, + {"numeric", unicodedata_numeric, METH_VARARGS}, + {"category", unicodedata_category, METH_VARARGS}, + {"bidirectional", unicodedata_bidirectional, METH_VARARGS}, + {"combining", unicodedata_combining, METH_VARARGS}, + {"mirrored", unicodedata_mirrored, METH_VARARGS}, + {"decomposition",unicodedata_decomposition, METH_VARARGS}, + {"name", unicodedata_name, METH_VARARGS}, + {"lookup", unicodedata_lookup, METH_VARARGS}, {NULL, NULL} /* sentinel */ }; +static char *unicodedata_docstring = "unicode character database"; + DL_EXPORT(void) initunicodedata(void) { - Py_InitModule("unicodedata", unicodedata_functions); + PyObject *m, *d, *v; + + m = Py_InitModule4( + "unicodedata", unicodedata_functions, + unicodedata_docstring, NULL, PYTHON_API_VERSION); + if (!m) + return; + + d = PyModule_GetDict(m); + if (!d) + return; + + /* Export C API */ + v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); + PyDict_SetItemString(d, "ucnhash_CAPI", v); + Py_XDECREF(v); + } |