summary | refs | log | tree | commit | diff | stats
path: root/Modules/unicodedata.c
diff options
context:
space:
mode:
author: Fredrik Lundh <fredrik@pythonware.com> 2001-01-21 22:41:08 (GMT)
committer: Fredrik Lundh <fredrik@pythonware.com> 2001-01-21 22:41:08 (GMT)
commit: 7b7dd107b3654926fb75215805d6c0c8a15bf89e (patch)
tree: 13b2631ae40737b8f28e2466e0a64e1278564220 /Modules/unicodedata.c
parent: f75c9d94b4cb9f11f0ed046aa775478e559b4081 (diff)
download: cpython-7b7dd107b3654926fb75215805d6c0c8a15bf89e.zip
cpython-7b7dd107b3654926fb75215805d6c0c8a15bf89e.tar.gz
cpython-7b7dd107b3654926fb75215805d6c0c8a15bf89e.tar.bz2
compress unicode decomposition tables (this saves another 55k)
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r-- Modules/unicodedata.c 173
1 files changed, 94 insertions, 79 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 3a48ba4..d5a1d17 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -14,11 +14,40 @@
#include "Python.h"
#include "unicodedatabase.h"
+typedef struct { /* property record for a character; getrecord() returns pointers into _PyUnicode_Database_Records */
+ const unsigned char category; /* index into
+ _PyUnicode_CategoryNames */
+ const unsigned char combining; /* combining class value 0 - 255 */
+ const unsigned char bidirectional; /* index into
+ _PyUnicode_BidirectionalNames */
+ const unsigned char mirrored; /* true if mirrored in bidir mode */
+} _PyUnicode_DatabaseRecord;
+
+/* data file generated by Tools/unicode/makeunicodedata.py */
+#include "unicodedata_db.h"
+
+static const _PyUnicode_DatabaseRecord*
+getrecord(PyUnicodeObject* v) /* return the database record for the first character stored in v */
+{
+ int code;
+ int index;
+
+ code = (int) *PyUnicode_AS_UNICODE(v); /* only the first element of v's buffer is examined */
+
+ if (code < 0 || code >= 65536)
+ index = 0; /* values outside 0..65535 all map to record 0 */
+ else {
+ index = index1[(code>>SHIFT)]; /* step 1: high bits of code select an entry in index1 */
+ index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; /* step 2: that entry plus the low SHIFT bits selects the record number in index2 */
+ }
+
+ return &_PyUnicode_Database_Records[index]; /* pointer into the generated table; nothing is allocated here */
+}
+
/* --- Module API --------------------------------------------------------- */
static PyObject *
-unicodedata_decimal(PyObject *self,
- PyObject *args)
+unicodedata_decimal(PyObject *self, PyObject *args)
{
PyUnicodeObject *v;
PyObject *defobj = NULL;
@@ -26,18 +55,18 @@ unicodedata_decimal(PyObject *self,
if (!PyArg_ParseTuple(args, "O!|O:decimal",
&PyUnicode_Type, &v, &defobj))
- goto onError;
+ return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
- goto onError;
+ return NULL;
}
rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError,
"not a decimal");
- goto onError;
+ return NULL;
}
else {
Py_INCREF(defobj);
@@ -45,14 +74,10 @@ unicodedata_decimal(PyObject *self,
}
}
return PyInt_FromLong(rc);
-
- onError:
- return NULL;
}
static PyObject *
-unicodedata_digit(PyObject *self,
- PyObject *args)
+unicodedata_digit(PyObject *self, PyObject *args)
{
PyUnicodeObject *v;
PyObject *defobj = NULL;
@@ -60,18 +85,18 @@ unicodedata_digit(PyObject *self,
if (!PyArg_ParseTuple(args, "O!|O:digit",
&PyUnicode_Type, &v, &defobj))
- goto onError;
+ return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
- goto onError;
+ return NULL;
}
rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError,
"not a digit");
- goto onError;
+ return NULL;
}
else {
Py_INCREF(defobj);
@@ -79,14 +104,10 @@ unicodedata_digit(PyObject *self,
}
}
return PyInt_FromLong(rc);
-
- onError:
- return NULL;
}
static PyObject *
-unicodedata_numeric(PyObject *self,
- PyObject *args)
+unicodedata_numeric(PyObject *self, PyObject *args)
{
PyUnicodeObject *v;
PyObject *defobj = NULL;
@@ -94,18 +115,18 @@ unicodedata_numeric(PyObject *self,
if (!PyArg_ParseTuple(args, "O!|O:numeric",
&PyUnicode_Type, &v, &defobj))
- goto onError;
+ return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
- goto onError;
+ return NULL;
}
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError,
"not a numeric character");
- goto onError;
+ return NULL;
}
else {
Py_INCREF(defobj);
@@ -113,129 +134,123 @@ unicodedata_numeric(PyObject *self,
}
}
return PyFloat_FromDouble(rc);
-
- onError:
- return NULL;
}
static PyObject *
-unicodedata_category(PyObject *self,
- PyObject *args)
+unicodedata_category(PyObject *self, PyObject *args) /* return the category name of a single Unicode character as a string object */
{
PyUnicodeObject *v;
int index;
if (!PyArg_ParseTuple(args, "O!:category",
&PyUnicode_Type, &v))
- goto onError;
+ return NULL; /* argument parsing failed */
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
- goto onError;
+ return NULL; /* TypeError set above: v does not hold exactly one character */
}
- index = (int) _PyUnicode_Database_GetRecord(
- (int) *PyUnicode_AS_UNICODE(v)
- )->category;
+ index = (int) getrecord(v)->category; /* record field is an index into _PyUnicode_CategoryNames */
return PyString_FromString(_PyUnicode_CategoryNames[index]);
-
- onError:
- return NULL;
}
static PyObject *
-unicodedata_bidirectional(PyObject *self,
- PyObject *args)
+unicodedata_bidirectional(PyObject *self, PyObject *args) /* return the bidirectional class name of a single Unicode character as a string object */
{
PyUnicodeObject *v;
int index;
if (!PyArg_ParseTuple(args, "O!:bidirectional",
&PyUnicode_Type, &v))
- goto onError;
+ return NULL; /* argument parsing failed */
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
- goto onError;
+ return NULL; /* TypeError set above: v does not hold exactly one character */
}
- index = (int) _PyUnicode_Database_GetRecord(
- (int) *PyUnicode_AS_UNICODE(v)
- )->bidirectional;
+ index = (int) getrecord(v)->bidirectional; /* record field is an index into _PyUnicode_BidirectionalNames */
return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
-
- onError:
- return NULL;
}
static PyObject *
-unicodedata_combining(PyObject *self,
- PyObject *args)
+unicodedata_combining(PyObject *self, PyObject *args) /* return the combining class of a single Unicode character as an integer object */
{
PyUnicodeObject *v;
- int value;
if (!PyArg_ParseTuple(args, "O!:combining",
&PyUnicode_Type, &v))
- goto onError;
+ return NULL; /* argument parsing failed */
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
- goto onError;
+ return NULL; /* TypeError set above: v does not hold exactly one character */
}
- value = (int) _PyUnicode_Database_GetRecord(
- (int) *PyUnicode_AS_UNICODE(v)
- )->combining;
- return PyInt_FromLong(value);
-
- onError:
- return NULL;
+ return PyInt_FromLong((int) getrecord(v)->combining); /* the record's combining field is returned as-is, no name table involved */
}
static PyObject *
-unicodedata_mirrored(PyObject *self,
- PyObject *args)
+unicodedata_mirrored(PyObject *self, PyObject *args) /* return the mirrored flag of a single Unicode character as an integer object */
{
PyUnicodeObject *v;
- int value;
if (!PyArg_ParseTuple(args, "O!:mirrored",
&PyUnicode_Type, &v))
- goto onError;
+ return NULL; /* argument parsing failed */
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
- goto onError;
+ return NULL; /* TypeError set above: v does not hold exactly one character */
}
- value = (int) _PyUnicode_Database_GetRecord(
- (int) *PyUnicode_AS_UNICODE(v)
- )->mirrored;
- return PyInt_FromLong(value);
-
- onError:
- return NULL;
+ return PyInt_FromLong((int) getrecord(v)->mirrored); /* the record's mirrored field is returned as-is */
}
static PyObject *
-unicodedata_decomposition(PyObject *self,
- PyObject *args)
+unicodedata_decomposition(PyObject *self, PyObject *args) /* return the decomposition of a single Unicode character as a string: optional prefix followed by space-separated 4-digit hex values */
{
PyUnicodeObject *v;
- const char *value;
+ char decomp[256]; /* fixed-size scratch buffer the result string is assembled in */
+ int code, index, count, i; /* i is the current write offset into decomp */
if (!PyArg_ParseTuple(args, "O!:decomposition",
&PyUnicode_Type, &v))
- goto onError;
+ return NULL; /* argument parsing failed */
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
- goto onError;
+ return NULL; /* TypeError set above: v does not hold exactly one character */
+ }
+
+ code = (int) *PyUnicode_AS_UNICODE(v);
+
+ if (code < 0 || code >= 65536)
+ index = 0; /* values outside 0..65535 all use entry 0 of decomp_data */
+ else {
+ index = decomp_index1[(code>>DECOMP_SHIFT)]; /* step 1: high bits of code select an entry in decomp_index1 */
+ index = decomp_index2[(index<<DECOMP_SHIFT)+
+ (code&((1<<DECOMP_SHIFT)-1))]; /* step 2: that entry plus the low DECOMP_SHIFT bits gives the offset into decomp_data */
+ }
+
+ /* high byte is the number of hex values that follow (usually one
+ or two), low byte is the prefix code (an index into decomp_prefix) */
+ count = decomp_data[index] >> 8;
+
+ /* XXX: could allocate the PyString up front instead
+ (strlen(prefix) + 5 * count + 1 bytes) */
+
+ /* copy prefix */
+ i = strlen(decomp_prefix[decomp_data[index] & 255]);
+ memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i); /* terminator is not copied; it is written after the loop */
+
+ while (count-- > 0) {
+ if (i)
+ decomp[i++] = ' '; /* separator, skipped only when nothing has been written yet */
+ sprintf(decomp + i, "%04X", decomp_data[++index]); /* NOTE(review): no bounds check against sizeof(decomp); presumably the generated tables keep count small - confirm */
+ i += strlen(decomp + i);
}
- value = _PyUnicode_Database_GetDecomposition(
- (int) *PyUnicode_AS_UNICODE(v)
- );
- return PyString_FromString(value);
- onError:
- return NULL;
+ decomp[i] = '\0'; /* also yields an empty string when there is no prefix and count is 0 */
+
+ return PyString_FromString(decomp);
}
/* XXX Add doc strings. */