diff options
author | Fredrik Lundh <fredrik@pythonware.com> | 2001-01-21 22:41:08 (GMT) |
---|---|---|
committer | Fredrik Lundh <fredrik@pythonware.com> | 2001-01-21 22:41:08 (GMT) |
commit | 7b7dd107b3654926fb75215805d6c0c8a15bf89e (patch) | |
tree | 13b2631ae40737b8f28e2466e0a64e1278564220 /Modules/unicodedata.c | |
parent | f75c9d94b4cb9f11f0ed046aa775478e559b4081 (diff) | |
download | cpython-7b7dd107b3654926fb75215805d6c0c8a15bf89e.zip cpython-7b7dd107b3654926fb75215805d6c0c8a15bf89e.tar.gz cpython-7b7dd107b3654926fb75215805d6c0c8a15bf89e.tar.bz2 |
compress unicode decomposition tables (this saves another 55k)
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r-- | Modules/unicodedata.c | 173 |
1 files changed, 94 insertions, 79 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 3a48ba4..d5a1d17 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -14,11 +14,40 @@ #include "Python.h" #include "unicodedatabase.h" +typedef struct { + const unsigned char category; /* index into + _PyUnicode_CategoryNames */ + const unsigned char combining; /* combining class value 0 - 255 */ + const unsigned char bidirectional; /* index into + _PyUnicode_BidirectionalNames */ + const unsigned char mirrored; /* true if mirrored in bidir mode */ +} _PyUnicode_DatabaseRecord; + +/* data file generated by Tools/unicode/makeunicodedata.py */ +#include "unicodedata_db.h" + +static const _PyUnicode_DatabaseRecord* +getrecord(PyUnicodeObject* v) +{ + int code; + int index; + + code = (int) *PyUnicode_AS_UNICODE(v); + + if (code < 0 || code >= 65536) + index = 0; + else { + index = index1[(code>>SHIFT)]; + index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; + } + + return &_PyUnicode_Database_Records[index]; +} + /* --- Module API --------------------------------------------------------- */ static PyObject * -unicodedata_decimal(PyObject *self, - PyObject *args) +unicodedata_decimal(PyObject *self, PyObject *args) { PyUnicodeObject *v; PyObject *defobj = NULL; @@ -26,18 +55,18 @@ unicodedata_decimal(PyObject *self, if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) - goto onError; + return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, "need a single Unicode character as parameter"); - goto onError; + return NULL; } rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v)); if (rc < 0) { if (defobj == NULL) { PyErr_SetString(PyExc_ValueError, "not a decimal"); - goto onError; + return NULL; } else { Py_INCREF(defobj); @@ -45,14 +74,10 @@ unicodedata_decimal(PyObject *self, } } return PyInt_FromLong(rc); - - onError: - return NULL; } static PyObject * -unicodedata_digit(PyObject *self, - PyObject *args) +unicodedata_digit(PyObject *self, PyObject 
*args) { PyUnicodeObject *v; PyObject *defobj = NULL; @@ -60,18 +85,18 @@ unicodedata_digit(PyObject *self, if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj)) - goto onError; + return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, "need a single Unicode character as parameter"); - goto onError; + return NULL; } rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v)); if (rc < 0) { if (defobj == NULL) { PyErr_SetString(PyExc_ValueError, "not a digit"); - goto onError; + return NULL; } else { Py_INCREF(defobj); @@ -79,14 +104,10 @@ unicodedata_digit(PyObject *self, } } return PyInt_FromLong(rc); - - onError: - return NULL; } static PyObject * -unicodedata_numeric(PyObject *self, - PyObject *args) +unicodedata_numeric(PyObject *self, PyObject *args) { PyUnicodeObject *v; PyObject *defobj = NULL; @@ -94,18 +115,18 @@ unicodedata_numeric(PyObject *self, if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) - goto onError; + return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, "need a single Unicode character as parameter"); - goto onError; + return NULL; } rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); if (rc < 0) { if (defobj == NULL) { PyErr_SetString(PyExc_ValueError, "not a numeric character"); - goto onError; + return NULL; } else { Py_INCREF(defobj); @@ -113,129 +134,123 @@ unicodedata_numeric(PyObject *self, } } return PyFloat_FromDouble(rc); - - onError: - return NULL; } static PyObject * -unicodedata_category(PyObject *self, - PyObject *args) +unicodedata_category(PyObject *self, PyObject *args) { PyUnicodeObject *v; int index; if (!PyArg_ParseTuple(args, "O!:category", &PyUnicode_Type, &v)) - goto onError; + return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, "need a single Unicode character as parameter"); - goto onError; + return NULL; } - index = (int) _PyUnicode_Database_GetRecord( - (int) *PyUnicode_AS_UNICODE(v) - )->category; + 
index = (int) getrecord(v)->category; return PyString_FromString(_PyUnicode_CategoryNames[index]); - - onError: - return NULL; } static PyObject * -unicodedata_bidirectional(PyObject *self, - PyObject *args) +unicodedata_bidirectional(PyObject *self, PyObject *args) { PyUnicodeObject *v; int index; if (!PyArg_ParseTuple(args, "O!:bidirectional", &PyUnicode_Type, &v)) - goto onError; + return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, "need a single Unicode character as parameter"); - goto onError; + return NULL; } - index = (int) _PyUnicode_Database_GetRecord( - (int) *PyUnicode_AS_UNICODE(v) - )->bidirectional; + index = (int) getrecord(v)->bidirectional; return PyString_FromString(_PyUnicode_BidirectionalNames[index]); - - onError: - return NULL; } static PyObject * -unicodedata_combining(PyObject *self, - PyObject *args) +unicodedata_combining(PyObject *self, PyObject *args) { PyUnicodeObject *v; - int value; if (!PyArg_ParseTuple(args, "O!:combining", &PyUnicode_Type, &v)) - goto onError; + return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, "need a single Unicode character as parameter"); - goto onError; + return NULL; } - value = (int) _PyUnicode_Database_GetRecord( - (int) *PyUnicode_AS_UNICODE(v) - )->combining; - return PyInt_FromLong(value); - - onError: - return NULL; + return PyInt_FromLong((int) getrecord(v)->combining); } static PyObject * -unicodedata_mirrored(PyObject *self, - PyObject *args) +unicodedata_mirrored(PyObject *self, PyObject *args) { PyUnicodeObject *v; - int value; if (!PyArg_ParseTuple(args, "O!:mirrored", &PyUnicode_Type, &v)) - goto onError; + return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, "need a single Unicode character as parameter"); - goto onError; + return NULL; } - value = (int) _PyUnicode_Database_GetRecord( - (int) *PyUnicode_AS_UNICODE(v) - )->mirrored; - return PyInt_FromLong(value); - - onError: - return NULL; + return 
PyInt_FromLong((int) getrecord(v)->mirrored); } static PyObject * -unicodedata_decomposition(PyObject *self, - PyObject *args) +unicodedata_decomposition(PyObject *self, PyObject *args) { PyUnicodeObject *v; - const char *value; + char decomp[256]; + int code, index, count, i; if (!PyArg_ParseTuple(args, "O!:decomposition", &PyUnicode_Type, &v)) - goto onError; + return NULL; if (PyUnicode_GET_SIZE(v) != 1) { PyErr_SetString(PyExc_TypeError, "need a single Unicode character as parameter"); - goto onError; + return NULL; + } + + code = (int) *PyUnicode_AS_UNICODE(v); + + if (code < 0 || code >= 65536) + index = 0; + else { + index = decomp_index1[(code>>DECOMP_SHIFT)]; + index = decomp_index2[(index<<DECOMP_SHIFT)+ + (code&((1<<DECOMP_SHIFT)-1))]; + } + + /* high byte is the number of hex values that follow (usually one or two), low byte + is the prefix code (index into decomp_prefix) */ + count = decomp_data[index] >> 8; + + /* XXX: could allocate the PyString up front instead + (strlen(prefix) + 5 * count + 1 bytes) */ + + /* copy prefix */ + i = strlen(decomp_prefix[decomp_data[index] & 255]); + memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i); + + while (count-- > 0) { + if (i) + decomp[i++] = ' '; + sprintf(decomp + i, "%04X", decomp_data[++index]); + i += strlen(decomp + i); } - value = _PyUnicode_Database_GetDecomposition( - (int) *PyUnicode_AS_UNICODE(v) - ); - return PyString_FromString(value); - onError: - return NULL; + decomp[i] = '\0'; + + return PyString_FromString(decomp); } /* XXX Add doc strings. */ |