diff options
Diffstat (limited to 'Modules/unicodedata.c')
| -rw-r--r-- | Modules/unicodedata.c | 102 |
1 file changed, 59 insertions, 43 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 95bdf3c..4db5087 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1,12 +1,13 @@ /* ------------------------------------------------------------------------ - unicodedata -- Provides access to the Unicode 5.2 data base. + unicodedata -- Provides access to the Unicode database. - Data was extracted from the Unicode 5.2 UnicodeData.txt file. + Data was extracted from the UnicodeData.txt file. + The current version number is reported in the unidata_version constant. Written by Marc-Andre Lemburg (mal@lemburg.com). Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) - Modified by Martin v. Löwis (martin@v.loewis.de) + Modified by Martin v. Löwis (martin@v.loewis.de) Copyright (c) Corporation for National Research Initiatives. @@ -73,6 +74,7 @@ static PyMemberDef DB_members[] = { /* forward declaration */ static PyTypeObject UCD_Type; +#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type) static PyObject* new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), @@ -130,7 +132,7 @@ unicodedata_decimal(PyObject *self, PyObject *args) if (c == (Py_UCS4)-1) return NULL; - if (self) { + if (self && UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) { /* unassigned */ @@ -156,7 +158,7 @@ unicodedata_decimal(PyObject *self, PyObject *args) return defobj; } } - return PyInt_FromLong(rc); + return PyLong_FromLong(rc); } PyDoc_STRVAR(unicodedata_digit__doc__, @@ -190,7 +192,7 @@ unicodedata_digit(PyObject *self, PyObject *args) return defobj; } } - return PyInt_FromLong(rc); + return PyLong_FromLong(rc); } PyDoc_STRVAR(unicodedata_numeric__doc__, @@ -215,7 +217,7 @@ unicodedata_numeric(PyObject *self, PyObject *args) if (c == (Py_UCS4)-1) return NULL; - if (self) { + if (self && UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) { /* unassigned */ @@ -263,12 +265,12 
@@ unicodedata_category(PyObject *self, PyObject *args) if (c == (Py_UCS4)-1) return NULL; index = (int) _getrecord_ex(c)->category; - if (self) { + if (self && UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed != 0xFF) index = old->category_changed; } - return PyString_FromString(_PyUnicode_CategoryNames[index]); + return PyUnicode_FromString(_PyUnicode_CategoryNames[index]); } PyDoc_STRVAR(unicodedata_bidirectional__doc__, @@ -292,14 +294,14 @@ unicodedata_bidirectional(PyObject *self, PyObject *args) if (c == (Py_UCS4)-1) return NULL; index = (int) _getrecord_ex(c)->bidirectional; - if (self) { + if (self && UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) index = 0; /* unassigned */ else if (old->bidir_changed != 0xFF) index = old->bidir_changed; } - return PyString_FromString(_PyUnicode_BidirectionalNames[index]); + return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]); } PyDoc_STRVAR(unicodedata_combining__doc__, @@ -323,12 +325,12 @@ unicodedata_combining(PyObject *self, PyObject *args) if (c == (Py_UCS4)-1) return NULL; index = (int) _getrecord_ex(c)->combining; - if (self) { + if (self && UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) index = 0; /* unassigned */ } - return PyInt_FromLong(index); + return PyLong_FromLong(index); } PyDoc_STRVAR(unicodedata_mirrored__doc__, @@ -352,14 +354,14 @@ unicodedata_mirrored(PyObject *self, PyObject *args) if (c == (Py_UCS4)-1) return NULL; index = (int) _getrecord_ex(c)->mirrored; - if (self) { + if (self && UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) index = 0; /* unassigned */ else if (old->mirrored_changed != 0xFF) index = old->mirrored_changed; } - return PyInt_FromLong(index); + return PyLong_FromLong(index); } PyDoc_STRVAR(unicodedata_east_asian_width__doc__, @@ -382,12 +384,12 @@ 
unicodedata_east_asian_width(PyObject *self, PyObject *args) if (c == (Py_UCS4)-1) return NULL; index = (int) _getrecord_ex(c)->east_asian_width; - if (self) { + if (self && UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) index = 0; /* unassigned */ } - return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]); + return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]); } PyDoc_STRVAR(unicodedata_decomposition__doc__, @@ -402,7 +404,8 @@ unicodedata_decomposition(PyObject *self, PyObject *args) { PyUnicodeObject *v; char decomp[256]; - int code, index, count, i; + int code, index, count; + size_t i; unsigned int prefix_index; Py_UCS4 c; @@ -415,10 +418,10 @@ unicodedata_decomposition(PyObject *self, PyObject *args) code = (int)c; - if (self) { + if (self && UCD_Check(self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) - return PyString_FromString(""); /* unassigned */ + return PyUnicode_FromString(""); /* unassigned */ } if (code < 0 || code >= 0x110000) @@ -449,15 +452,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args) while (count-- > 0) { if (i) decomp[i++] = ' '; - assert((size_t)i < sizeof(decomp)); + assert(i < sizeof(decomp)); PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", decomp_data[++index]); i += strlen(decomp + i); } - - decomp[i] = '\0'; - - return PyString_FromString(decomp); + return PyUnicode_FromStringAndSize(decomp, i); } static void @@ -465,7 +465,8 @@ get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *co { if (code >= 0x110000) { *index = 0; - } else if (self && get_old_record(self, code)->category_changed==0) { + } else if (self && UCD_Check(self) && + get_old_record(self, code)->category_changed==0) { /* unassigned in old version */ *index = 0; } @@ -522,7 +523,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) /* Hangul Decomposition adds three characters in a single step, so we 
need atleast that much room. */ if (space < 3) { - Py_ssize_t newsize = PyString_GET_SIZE(result) + 10; + Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10; space += 10; if (PyUnicode_Resize(&result, newsize) == -1) return NULL; @@ -544,7 +545,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) continue; } /* normalization changes */ - if (self) { + if (self && UCD_Check(self)) { Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); if (value != 0) { stack[stackptr++] = value; @@ -736,7 +737,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) /* An older version of the database is requested, quickchecks must be disabled. */ - if (self != NULL) + if (self && UCD_Check(self)) return 0; /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, @@ -869,14 +870,16 @@ static char *hangul_syllables[][3] = { { 0, 0, "H" } }; +/* These ranges need to match makeunicodedata.py:cjk_ranges. */ static int is_unified_ideograph(Py_UCS4 code) { - return ( - (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ - (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */ + return + (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ + (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */ (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */ - (0x2A700 <= code && code <= 0x2B734)); /* CJK Ideograph Extension C */ + (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */ + (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */ } static int @@ -890,7 +893,7 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen) if (code >= 0x110000) return 0; - if (self) { + if (self && UCD_Check(self)) { const change_record *old = get_old_record(self, code); if (old->category_changed == 0) { /* unassigned */ @@ -1126,7 +1129,7 @@ unicodedata_name(PyObject* self, PyObject* args) } } - return Py_BuildValue("s", name); + return 
PyUnicode_FromString(name); } PyDoc_STRVAR(unicodedata_lookup__doc__, @@ -1201,7 +1204,7 @@ static PyTypeObject UCD_Type = { 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ - 0, /*tp_compare*/ + 0, /*tp_reserved*/ 0, /*tp_repr*/ 0, /*tp_as_number*/ 0, /*tp_as_sequence*/ @@ -1239,23 +1242,35 @@ PyDoc_STRVAR(unicodedata_docstring, "This module provides access to the Unicode Character Database which\n\ defines character properties for all Unicode characters. The data in\n\ this database is based on the UnicodeData.txt file version\n\ -5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\ +6.0.0 which is publically available from ftp://ftp.unicode.org/.\n\ \n\ The module uses the same names and symbols as defined by the\n\ -UnicodeData File Format 5.2.0 (see\n\ -http://www.unicode.org/reports/tr44/tr44-4.html)."); +UnicodeData File Format 6.0.0 (see\n\ +http://www.unicode.org/reports/tr44/tr44-6.html)."); + + +static struct PyModuleDef unicodedatamodule = { + PyModuleDef_HEAD_INIT, + "unicodedata", + unicodedata_docstring, + -1, + unicodedata_functions, + NULL, + NULL, + NULL, + NULL +}; PyMODINIT_FUNC -initunicodedata(void) +PyInit_unicodedata(void) { PyObject *m, *v; Py_TYPE(&UCD_Type) = &PyType_Type; - m = Py_InitModule3( - "unicodedata", unicodedata_functions, unicodedata_docstring); + m = PyModule_Create(&unicodedatamodule); if (!m) - return; + return NULL; PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); Py_INCREF(&UCD_Type); @@ -1270,6 +1285,7 @@ initunicodedata(void) v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL); if (v != NULL) PyModule_AddObject(m, "ucnhash_CAPI", v); + return m; } /* |
