summary refs log tree commit diff stats
path: root/Modules/unicodedata.c
diff options
context:
space:
mode:
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r-- Modules/unicodedata.c 245
1 files changed, 226 insertions, 19 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 06e5f04..dfe2f7b 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -12,6 +12,9 @@
------------------------------------------------------------------------ */
#include "Python.h"
+#include "ucnhash.h"
+
+/* character properties */
typedef struct {
const unsigned char category; /* index into
@@ -52,8 +55,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
PyObject *defobj = NULL;
long rc;
- if (!PyArg_ParseTuple(args, "O!|O:decimal",
- &PyUnicode_Type, &v, &defobj))
+ if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
@@ -82,8 +84,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
PyObject *defobj = NULL;
long rc;
- if (!PyArg_ParseTuple(args, "O!|O:digit",
- &PyUnicode_Type, &v, &defobj))
+ if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
@@ -93,8 +94,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
- PyErr_SetString(PyExc_ValueError,
- "not a digit");
+ PyErr_SetString(PyExc_ValueError, "not a digit");
return NULL;
}
else {
@@ -112,8 +112,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
PyObject *defobj = NULL;
double rc;
- if (!PyArg_ParseTuple(args, "O!|O:numeric",
- &PyUnicode_Type, &v, &defobj))
+ if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
@@ -123,8 +122,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
- PyErr_SetString(PyExc_ValueError,
- "not a numeric character");
+ PyErr_SetString(PyExc_ValueError, "not a numeric character");
return NULL;
}
else {
@@ -252,22 +250,231 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
return PyString_FromString(decomp);
}
+/* -------------------------------------------------------------------- */
+/* unicode character name tables */
+
+/* data file generated by Tools/unicode/makeunicodedata.py */
+#include "unicodename_db.h"
+
+/* -------------------------------------------------------------------- */
+/* database code (cut and pasted from the unidb package) */
+
+/* Compute a case-insensitive hash of the LEN bytes starting at S.
+
+   Every byte is upper-cased and folded in as h = h * SCALE + byte.
+   Whenever bits 24-31 of the running value become non-zero, that byte is
+   XORed back into the low bits and the value is masked to 24 bits.
+   An empty input (LEN == 0) hashes to 0. */
+static unsigned long
+gethash(const char *s, int len, int scale)
+{
+    int i;
+    unsigned long h = 0;
+    unsigned long ix;
+    for (i = 0; i < len; i++) {
+        /* convert to unsigned char *before* calling toupper: handing it
+           a negative plain char is undefined behaviour */
+        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
+        ix = h & 0xff000000;
+        if (ix)
+            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
+    }
+    return h;
+}
+
+/* Store the name of character CODE in BUFFER as a NUL-terminated string.
+
+   BUFLEN is the size of BUFFER in bytes.  Returns 1 on success.  Returns
+   0 when CODE is outside the table (>= 65536), when its phrasebook offset
+   is 0 (presumably: no name recorded for it), or when the name does not
+   fit in BUFFER. */
+static int
+getname(Py_UCS4 code, char* buffer, int buflen)
+{
+    int offset;
+    int i;
+    int word;
+    unsigned char* w;
+
+    if (code < 0 || code >= 65536)
+        return 0;
+
+    /* get offset into phrasebook (two-level table lookup) */
+    offset = phrasebook_offset1[(code>>phrasebook_shift)];
+    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
+                                (code&((1<<phrasebook_shift)-1))];
+    if (!offset)
+        return 0;
+
+    i = 0;
+
+    for (;;) {
+        /* get word index: one or two phrasebook bytes, depending on
+           how the first byte compares with phrasebook_short */
+        word = phrasebook[offset] - phrasebook_short;
+        if (word >= 0) {
+            word = (word << 8) + phrasebook[offset+1];
+            offset += 2;
+        } else
+            word = phrasebook[offset++];
+        if (i) {
+            /* separate words with a space.  i == buflen is already one
+               past the end of the buffer, so the test must be >= */
+            if (i >= buflen)
+                return 0; /* buffer overflow */
+            buffer[i++] = ' ';
+        }
+        /* copy word string from lexicon.  the last character in the
+           word has bit 7 set.  the last word in a string ends with
+           0x80 */
+        w = lexicon + lexicon_offset[word];
+        while (*w < 128) {
+            if (i >= buflen)
+                return 0; /* buffer overflow */
+            buffer[i++] = *w++;
+        }
+        if (i >= buflen)
+            return 0; /* buffer overflow */
+        buffer[i++] = *w & 127;
+        if (*w == 128)
+            break; /* end of name: the byte just stored is the NUL */
+    }
+
+    return 1;
+}
+
+/* Check whether CODE is the character named by the NAMELEN bytes at NAME.
+
+   NAME is upper-cased byte by byte before it is compared with the name
+   produced by getname().  Returns 1 on an exact match, 0 otherwise
+   (including when getname() fails for CODE). */
+static int
+cmpname(int code, const char* name, int namelen)
+{
+    int i;
+    char buffer[NAME_MAXLEN];
+    /* a name that cannot fit in the buffer cannot match, and it would
+       make the buffer[namelen] read below go out of bounds */
+    if (namelen < 0 || namelen >= (int) sizeof(buffer))
+        return 0;
+    if (!getname(code, buffer, sizeof(buffer)))
+        return 0;
+    for (i = 0; i < namelen; i++) {
+        /* stop at the terminator so bytes getname() never wrote are not
+           read; cast before toupper because a negative char is undefined
+           behaviour there */
+        if (buffer[i] == '\0' ||
+            toupper((unsigned char) name[i]) != buffer[i])
+            return 0;
+    }
+    return buffer[namelen] == '\0';
+}
+
+/* Find the character whose name is the NAMELEN bytes at NAME.
+
+   On success the code is stored in *CODE and 1 is returned.  If no
+   character has that name, 0 is returned and *CODE is left untouched.
+   Callers test the result with "!getcode(...)", so every failure path
+   must return 0. */
+static int
+getcode(const char* name, int namelen, Py_UCS4* code)
+{
+    unsigned int h, v;
+    unsigned int mask = code_size-1;
+    unsigned int i, incr;
+
+    /* the following is the same as python's dictionary lookup, with
+       only minor changes.  see the makeunicodedata script for more
+       details */
+
+    h = (unsigned int) gethash(name, namelen, code_magic);
+    i = (~h) & mask;
+    v = code_hash[i];
+    if (!v)
+        return 0; /* empty slot on the first probe: no such name */
+    if (cmpname(v, name, namelen)) {
+        *code = v;
+        return 1;
+    }
+    incr = (h ^ (h >> 3)) & mask;
+    if (!incr)
+        incr = mask;
+    for (;;) {
+        i = (i + incr) & mask;
+        v = code_hash[i];
+        if (!v)
+            return 0; /* empty slot ends the probe sequence: not found */
+        if (cmpname(v, name, namelen)) {
+            *code = v;
+            return 1;
+        }
+        incr = incr << 1;
+        if (incr > mask)
+            incr = incr ^ code_poly;
+    }
+}
+
+static const _PyUnicode_Name_CAPI hashAPI = /* exported by initunicodedata() as "ucnhash_CAPI" */
+{
+ sizeof(_PyUnicode_Name_CAPI), /* size of this struct */
+ getname, /* character code -> name */
+ getcode /* name -> character code */
+};
+
+/* -------------------------------------------------------------------- */
+/* Python bindings */
+
+/* name(unichr [, default]): return the name of a single Unicode
+   character as a string.  If the character has no name, return DEFAULT
+   when one was given, otherwise raise ValueError.  Raises TypeError when
+   the argument is not exactly one character long. */
+static PyObject *
+unicodedata_name(PyObject* self, PyObject* args)
+{
+    PyUnicodeObject* chr;
+    PyObject* fallback = NULL;
+    char namebuf[NAME_MAXLEN];
+    Py_UCS4 codepoint;
+
+    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &chr, &fallback))
+        return NULL;
+
+    if (PyUnicode_GET_SIZE(chr) != 1) {
+        PyErr_SetString(PyExc_TypeError,
+                        "need a single Unicode character as parameter");
+        return NULL;
+    }
+
+    /* common case first: the character has a name */
+    codepoint = (Py_UCS4) *PyUnicode_AS_UNICODE(chr);
+    if (getname(codepoint, namebuf, sizeof(namebuf)))
+        return Py_BuildValue("s", namebuf);
+
+    /* no name: hand back the caller's default if there is one */
+    if (fallback != NULL) {
+        Py_INCREF(fallback);
+        return fallback;
+    }
+
+    PyErr_SetString(PyExc_ValueError, "no such name");
+    return NULL;
+}
+
+/* lookup(name): return the one-character Unicode string whose character
+   name is NAME.  Raises KeyError if getcode() does not find the name. */
+static PyObject *
+unicodedata_lookup(PyObject* self, PyObject* args)
+{
+    Py_UCS4 code = 0;
+    Py_UNICODE str[1];
+
+    char* name;
+    int namelen;
+    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
+        return NULL;
+
+    /* treat any non-positive result as "not found": getcode() can report
+       failure with a negative value, which a plain "!" test would take
+       for success and then use CODE without it ever being set */
+    if (getcode(name, namelen, &code) <= 0) {
+        PyErr_SetString(PyExc_KeyError, "undefined character name");
+        return NULL;
+    }
+
+    str[0] = (Py_UNICODE) code;
+    return PyUnicode_FromUnicode(str, 1);
+}
+
/* XXX Add doc strings. */
static PyMethodDef unicodedata_functions[] = {
- {"decimal", unicodedata_decimal, 1},
- {"digit", unicodedata_digit, 1},
- {"numeric", unicodedata_numeric, 1},
- {"category", unicodedata_category, 1},
- {"bidirectional", unicodedata_bidirectional, 1},
- {"combining", unicodedata_combining, 1},
- {"mirrored", unicodedata_mirrored, 1},
- {"decomposition", unicodedata_decomposition, 1},
+ {"decimal", unicodedata_decimal, METH_VARARGS},
+ {"digit", unicodedata_digit, METH_VARARGS},
+ {"numeric", unicodedata_numeric, METH_VARARGS},
+ {"category", unicodedata_category, METH_VARARGS},
+ {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
+ {"combining", unicodedata_combining, METH_VARARGS},
+ {"mirrored", unicodedata_mirrored, METH_VARARGS},
+ {"decomposition",unicodedata_decomposition, METH_VARARGS},
+ {"name", unicodedata_name, METH_VARARGS}, /* character -> name (or default) */
+ {"lookup", unicodedata_lookup, METH_VARARGS}, /* name -> character, KeyError if unknown */
{NULL, NULL} /* sentinel */
};
+static char *unicodedata_docstring = "unicode character database";
+
+/* Module initialization: create the "unicodedata" module with its method
+   table and docstring, then publish the name-lookup C API (hashAPI) in
+   the module dict under the key "ucnhash_CAPI".  Returns silently on
+   failure. */
DL_EXPORT(void)
initunicodedata(void)
{
- Py_InitModule("unicodedata", unicodedata_functions);
+    PyObject *m, *d, *v;
+
+    m = Py_InitModule4(
+        "unicodedata", unicodedata_functions,
+        unicodedata_docstring, NULL, PYTHON_API_VERSION);
+    if (!m)
+        return;
+
+    d = PyModule_GetDict(m);
+    if (!d)
+        return;
+
+    /* Export C API.  The wrapper object can fail to allocate; a NULL
+       value must never be handed to PyDict_SetItemString. */
+    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
+    if (v != NULL) {
+        PyDict_SetItemString(d, "ucnhash_CAPI", v);
+        Py_DECREF(v);
+    }
+
}