summaryrefslogtreecommitdiffstats
path: root/Modules/unicodedata.c
diff options
context:
space:
mode:
authorMartin v. Löwis <martin@v.loewis.de>2006-03-09 23:38:20 (GMT)
committerMartin v. Löwis <martin@v.loewis.de>2006-03-09 23:38:20 (GMT)
commit480f1bb67ba8d2857d87921391df278c5569774c (patch)
tree16370e5215e51cb589a2f07b93a2105c851ce956 /Modules/unicodedata.c
parente2b4677253a809228b16a2c48b6169e1ae576f55 (diff)
downloadcpython-480f1bb67ba8d2857d87921391df278c5569774c.zip
cpython-480f1bb67ba8d2857d87921391df278c5569774c.tar.gz
cpython-480f1bb67ba8d2857d87921391df278c5569774c.tar.bz2
Update Unicode database to Unicode 4.1.
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r--Modules/unicodedata.c241
1 files changed, 213 insertions, 28 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 90f2ca5..a854837 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -14,6 +14,7 @@
#include "Python.h"
#include "ucnhash.h"
+#include "structmember.h"
/* character properties */
@@ -28,6 +29,14 @@ typedef struct {
_PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
+typedef struct change_record { /* per-code-point differences of an older database version */
+ /* sequence of fields should be the same as in merge_old_version
+    (NOTE(review): presumably the generator that emits unicodedata_db.h - confirm).
+    The accessors in this file treat 0xFF in an unsigned char field as
+    "no override, use the current database value". */
+ const unsigned char bidir_changed; /* replacement bidirectional index unless 0xFF */
+ const unsigned char category_changed; /* replacement category index unless 0xFF; 0 is handled as "unassigned" */
+ const unsigned char decimal_changed; /* replacement decimal value unless 0xFF */
+ const int numeric_changed; /* not read by any code visible in this diff */
+} change_record;
+
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
@@ -51,6 +60,85 @@ _getrecord(PyUnicodeObject* v)
return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
}
+/* ------------- Previous-version API ------------------------------------- */
+typedef struct previous_version { /* Python object giving access to an older database version */
+ PyObject_HEAD
+ const char *name; /* version string; exposed to Python through DB_members as "unidata_version" */
+ const change_record* (*getrecord)(Py_UCS4); /* callback returning the change record of a code point; invoked via get_old_record() */
+ Py_UCS4 (*normalization)(Py_UCS4); /* callback used by nfd_nfkd(): a non-zero result is used in place of the regular decomposition lookup */
+} PreviousDBVersion;
+
+#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v)) /* change record of code point v; self is cast to PreviousDBVersion* without checking, callers test it for NULL first */
+
+/* Forward declaration */
+static PyMethodDef unicodedata_functions[];
+
+static PyMemberDef DB_members[] = { /* attribute table referenced from the tp_members slot of Xxo_Type */
+ {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, /* read-only view of the C string stored in PreviousDBVersion.name */
+ {NULL} /* sentinel */
+};
+
+static PyTypeObject Xxo_Type = { /* type object of PreviousDBVersion instances, named "unicodedata.DB" */
+ /* The ob_type field must be initialized in the module init function
+ * to be portable to Windows without using C++.
+ * NOTE(review): no such assignment is visible in the initunicodedata
+ * hunk of this diff - confirm that it exists elsewhere in the file. */
+ PyObject_HEAD_INIT(NULL)
+ 0, /*ob_size*/
+ "unicodedata.DB", /*tp_name*/
+ sizeof(PreviousDBVersion), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ /* methods */
+ (destructor)PyObject_Del, /*tp_dealloc: the object itself is the only thing released */
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ 0, /*tp_hash*/
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ PyObject_GenericGetAttr,/*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT, /*tp_flags*/
+ 0, /*tp_doc*/
+ 0, /*tp_traverse*/
+ 0, /*tp_clear*/
+ 0, /*tp_richcompare*/
+ 0, /*tp_weaklistoffset*/
+ 0, /*tp_iter*/
+ 0, /*tp_iternext*/
+ unicodedata_functions, /*tp_methods: the same table that is forward-declared above, so
+    the functions receive the instance as a non-NULL self*/
+ DB_members, /*tp_members: provides the unidata_version attribute*/
+ 0, /*tp_getset*/
+ 0, /*tp_base*/
+ 0, /*tp_dict*/
+ 0, /*tp_descr_get*/
+ 0, /*tp_descr_set*/
+ 0, /*tp_dictoffset*/
+ 0, /*tp_init*/
+ 0, /*tp_alloc*/
+ 0, /*tp_new: left empty; instances are created in C by new_previous_version()*/
+ 0, /*tp_free*/
+ 0, /*tp_is_gc*/
+};
+
+static PyObject* /* new PreviousDBVersion object, or NULL if the allocation failed */
+new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
+ Py_UCS4 (*normalization)(Py_UCS4))
+{ /* Create a database object for an older Unicode version.
+     name          - version string reported as unidata_version
+     getrecord     - callback returning the change record of a code point
+     normalization - callback consulted by nfd_nfkd() before the regular
+                     decomposition lookup */
+ PreviousDBVersion *self;
+ self = PyObject_New(PreviousDBVersion, &Xxo_Type);
+ if (self == NULL)
+ return NULL;
+ self->name = name; /* the pointer is stored, not copied: the string must outlive the object */
+ self->getrecord = getrecord;
+ self->normalization = normalization;
+ return (PyObject*)self;
+}
+
/* --- Module API --------------------------------------------------------- */
PyDoc_STRVAR(unicodedata_decimal__doc__,
@@ -65,6 +153,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
{
PyUnicodeObject *v;
PyObject *defobj = NULL;
+ int have_old = 0;
long rc;
if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
@@ -74,7 +163,22 @@ unicodedata_decimal(PyObject *self, PyObject *args)
"need a single Unicode character as parameter");
return NULL;
}
- rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
+
+ if (self) {
+ const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+ if (old->category_changed == 0) {
+ /* unassigned */
+ have_old = 1;
+ rc = -1;
+ }
+ else if (old->decimal_changed != 0xFF) {
+ have_old = 1;
+ rc = old->decimal_changed;
+ }
+ }
+
+ if (!have_old)
+ rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError,
@@ -136,6 +240,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
{
PyUnicodeObject *v;
PyObject *defobj = NULL;
+ int have_old = 0;
double rc;
if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
@@ -145,7 +250,22 @@ unicodedata_numeric(PyObject *self, PyObject *args)
"need a single Unicode character as parameter");
return NULL;
}
- rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
+
+ if (self) {
+ const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+ if (old->category_changed == 0) {
+ /* unassigned */
+ have_old = 1;
+ rc = -1;
+ }
+ else if (old->decimal_changed != 0xFF) {
+ have_old = 1;
+ rc = old->decimal_changed;
+ }
+ }
+
+ if (!have_old)
+ rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError, "not a numeric character");
@@ -180,6 +300,11 @@ unicodedata_category(PyObject *self, PyObject *args)
return NULL;
}
index = (int) _getrecord(v)->category;
+ if (self) {
+ const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+ if (old->category_changed != 0xFF)
+ index = old->category_changed;
+ }
return PyString_FromString(_PyUnicode_CategoryNames[index]);
}
@@ -205,6 +330,13 @@ unicodedata_bidirectional(PyObject *self, PyObject *args)
return NULL;
}
index = (int) _getrecord(v)->bidirectional;
+ if (self) {
+ const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+ if (old->category_changed == 0)
+ index = 0; /* unassigned */
+ else if (old->bidir_changed != 0xFF)
+ index = old->bidir_changed;
+ }
return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}
@@ -219,6 +351,7 @@ static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
PyUnicodeObject *v;
+ int index;
if (!PyArg_ParseTuple(args, "O!:combining",
&PyUnicode_Type, &v))
@@ -228,7 +361,13 @@ unicodedata_combining(PyObject *self, PyObject *args)
"need a single Unicode character as parameter");
return NULL;
}
- return PyInt_FromLong((int) _getrecord(v)->combining);
+ index = (int) _getrecord(v)->combining;
+ if (self) {
+ const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+ if (old->category_changed == 0)
+ index = 0; /* unassigned */
+ }
+ return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_mirrored__doc__,
@@ -242,6 +381,7 @@ static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
PyUnicodeObject *v;
+ int index;
if (!PyArg_ParseTuple(args, "O!:mirrored",
&PyUnicode_Type, &v))
@@ -251,7 +391,13 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
"need a single Unicode character as parameter");
return NULL;
}
- return PyInt_FromLong((int) _getrecord(v)->mirrored);
+ index = (int) _getrecord(v)->mirrored;
+ if (self) {
+ const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+ if (old->category_changed == 0)
+ index = 0; /* unassigned */
+ }
+ return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
@@ -275,6 +421,11 @@ unicodedata_east_asian_width(PyObject *self, PyObject *args)
return NULL;
}
index = (int) _getrecord(v)->east_asian_width;
+ if (self) {
+ const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+ if (old->category_changed == 0)
+ index = 0; /* unassigned */
+ }
return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}
@@ -303,6 +454,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
code = (int) *PyUnicode_AS_UNICODE(v);
+ if (self) {
+ const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+ if (old->category_changed == 0)
+ return PyString_FromString(""); /* unassigned */
+ }
+
if (code < 0 || code >= 0x110000)
index = 0;
else {
@@ -337,11 +494,14 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
}
void
-get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
+get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
if (code >= 0x110000) {
*index = 0;
- }
+ } else if (self && get_old_record(self, code)->category_changed==0) {
+ /* unassigned in old version */
+ *index = 0;
+ }
else {
*index = decomp_index1[(code>>DECOMP_SHIFT)];
*index = decomp_index2[(*index<<DECOMP_SHIFT)+
@@ -367,7 +527,7 @@ get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
#define SCount (LCount*NCount)
static PyObject*
-nfd_nfkd(PyObject *input, int k)
+nfd_nfkd(PyObject *self, PyObject *input, int k)
{
PyObject *result;
Py_UNICODE *i, *end, *o;
@@ -416,8 +576,17 @@ nfd_nfkd(PyObject *input, int k)
}
continue;
}
- /* Other decompoistions. */
- get_decomp_record(code, &index, &prefix, &count);
+ /* normalization changes */
+ if (self) {
+ Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
+ if (value != 0) {
+ stack[stackptr++] = value;
+ continue;
+ }
+ }
+
+ /* Other decompositions. */
+ get_decomp_record(self, code, &index, &prefix, &count);
/* Copy character if it is not decomposable, or has a
compatibility decomposition, but we do NFD. */
@@ -467,7 +636,7 @@ nfd_nfkd(PyObject *input, int k)
}
static int
-find_nfc_index(struct reindex* nfc, Py_UNICODE code)
+find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
int index;
for (index = 0; nfc[index].start; index++) {
@@ -483,7 +652,7 @@ find_nfc_index(struct reindex* nfc, Py_UNICODE code)
}
static PyObject*
-nfc_nfkc(PyObject *input, int k)
+nfc_nfkc(PyObject *self, PyObject *input, int k)
{
PyObject *result;
Py_UNICODE *i, *i1, *o, *end;
@@ -492,7 +661,7 @@ nfc_nfkc(PyObject *input, int k)
Py_UNICODE *skipped[20];
int cskipped = 0;
- result = nfd_nfkd(input, k);
+ result = nfd_nfkd(self, input, k);
if (!result)
return NULL;
@@ -536,7 +705,7 @@ nfc_nfkc(PyObject *input, int k)
continue;
}
- f = find_nfc_index(nfc_first, *i);
+ f = find_nfc_index(self, nfc_first, *i);
if (f == -1) {
*o++ = *i++;
continue;
@@ -551,7 +720,7 @@ nfc_nfkc(PyObject *input, int k)
i1++;
continue;
}
- l = find_nfc_index(nfc_last, *i1);
+ l = find_nfc_index(self, nfc_last, *i1);
/* *i1 cannot be combined with *i. If *i1
is a starter, we don't need to look further.
Otherwise, record the combining class. */
@@ -575,7 +744,7 @@ nfc_nfkc(PyObject *input, int k)
/* Mark the second character unused. */
skipped[cskipped++] = i1;
i1++;
- f = find_nfc_index(nfc_first, *i);
+ f = find_nfc_index(self, nfc_first, *i);
if (f == -1)
break;
}
@@ -610,13 +779,13 @@ unicodedata_normalize(PyObject *self, PyObject *args)
}
if (strcmp(form, "NFC") == 0)
- return nfc_nfkc(input, 0);
+ return nfc_nfkc(self, input, 0);
if (strcmp(form, "NFKC") == 0)
- return nfc_nfkc(input, 1);
+ return nfc_nfkc(self, input, 1);
if (strcmp(form, "NFD") == 0)
- return nfd_nfkd(input, 0);
+ return nfd_nfkd(self, input, 0);
if (strcmp(form, "NFKD") == 0)
- return nfd_nfkd(input, 1);
+ return nfd_nfkd(self, input, 1);
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
return NULL;
}
@@ -686,7 +855,7 @@ is_unified_ideograph(Py_UCS4 code)
}
static int
-_getucname(Py_UCS4 code, char* buffer, int buflen)
+_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
int offset;
int i;
@@ -726,6 +895,15 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
if (code >= 0x110000)
return 0;
+ if (self) {
+ const change_record *old = get_old_record(self, code);
+ if (old->category_changed == 0) {
+ /* unassigned */
+ return 0;
+ }
+ }
+
+
/* get offset into phrasebook */
offset = phrasebook_offset1[(code>>phrasebook_shift)];
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
@@ -768,12 +946,12 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
}
static int
-_cmpname(int code, const char* name, int namelen)
+_cmpname(PyObject *self, int code, const char* name, int namelen)
{
/* check if code corresponds to the given name */
int i;
char buffer[NAME_MAXLEN];
- if (!_getucname(code, buffer, sizeof(buffer)))
+ if (!_getucname(self, code, buffer, sizeof(buffer)))
return 0;
for (i = 0; i < namelen; i++) {
if (toupper(name[i]) != buffer[i])
@@ -803,7 +981,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
}
static int
-_getcode(const char* name, int namelen, Py_UCS4* code)
+_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
unsigned int h, v;
unsigned int mask = code_size-1;
@@ -860,7 +1038,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
v = code_hash[i];
if (!v)
return 0;
- if (_cmpname(v, name, namelen)) {
+ if (_cmpname(self, v, name, namelen)) {
*code = v;
return 1;
}
@@ -872,7 +1050,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
v = code_hash[i];
if (!v)
return 0;
- if (_cmpname(v, name, namelen)) {
+ if (_cmpname(self, v, name, namelen)) {
*code = v;
return 1;
}
@@ -914,8 +1092,8 @@ unicodedata_name(PyObject* self, PyObject* args)
return NULL;
}
- if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
- name, sizeof(name))) {
+ if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
+ name, sizeof(name))) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError, "no such name");
return NULL;
@@ -947,7 +1125,7 @@ unicodedata_lookup(PyObject* self, PyObject* args)
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
return NULL;
- if (!_getcode(name, namelen, &code)) {
+ if (!_getcode(self, name, namelen, &code)) {
char fmt[] = "undefined character name '%s'";
char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
sprintf(buf, fmt, name);
@@ -985,6 +1163,8 @@ static PyMethodDef unicodedata_functions[] = {
{NULL, NULL} /* sentinel */
};
+
+
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
@@ -1007,6 +1187,11 @@ initunicodedata(void)
PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
+ /* Previous versions */
+ v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
+ if (v != NULL)
+ PyModule_AddObject(m, "db_3_2_0", v);
+
/* Export C API */
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
if (v != NULL)