summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFredrik Lundh <fredrik@pythonware.com>2001-01-24 07:59:11 (GMT)
committerFredrik Lundh <fredrik@pythonware.com>2001-01-24 07:59:11 (GMT)
commit06d126803c66515f5cdb272dfd7807716f04d33f (patch)
tree1f1bb52eee3fbb613c83eae4bb0c8b82ea276257
parenteda28445c075102690710e3775b7f419669eb653 (diff)
downloadcpython-06d126803c66515f5cdb272dfd7807716f04d33f.zip
cpython-06d126803c66515f5cdb272dfd7807716f04d33f.tar.gz
cpython-06d126803c66515f5cdb272dfd7807716f04d33f.tar.bz2
Move ucnhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this). Also adds "name" and "lookup" functions to unicodedata.
-rw-r--r--Lib/test/test_ucn.py15
-rw-r--r--Modules/ucnhash.c196
-rw-r--r--Modules/unicodedata.c245
-rw-r--r--Objects/unicodeobject.c20
4 files changed, 248 insertions, 228 deletions
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
index 0797f2c..f7d3ce4 100644
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -1,6 +1,7 @@
""" Test script for the Unicode implementation.
Written by Bill Tutt.
+Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
@@ -46,23 +47,24 @@ except UnicodeError, v:
print v
print "done."
-import ucnhash
+import unicodedata
print "Testing name to code mapping....",
for char in "SPAM":
name = "LATIN SMALL LETTER %s" % char
- code = ucnhash.getcode(name)
- verify(ucnhash.getname(code) == name)
+ code = unicodedata.lookup(name)
+ verify(unicodedata.name(code) == name)
print "done."
print "Testing code to name mapping for all characters....",
count = 0
for code in range(65536):
try:
- name = ucnhash.getname(code)
- verify(ucnhash.getcode(name) == code)
+ char = unichr(code)
+ name = unicodedata.name(char)
+ verify(unicodedata.lookup(name) == char)
count += 1
- except ValueError:
+ except (KeyError, ValueError):
pass
print "done."
@@ -78,7 +80,6 @@ verify(u"\N{FULLWIDTH LATIN SMALL LETTER A}" == u"\uFF41")
"""
print "done."
-
# strict error testing:
print "Testing unicode character name expansion strict error handling....",
try:
diff --git a/Modules/ucnhash.c b/Modules/ucnhash.c
index bdcdab1..424b6c5 100644
--- a/Modules/ucnhash.c
+++ b/Modules/ucnhash.c
@@ -1,212 +1,22 @@
-/* unicode character name tables */
-/* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
+/* obsolete -- remove this file! */
#include "Python.h"
-#include "ucnhash.h"
-
-/* data file generated by Tools/unicode/makeunicodedata.py */
-#include "unicodename_db.h"
-
-/* -------------------------------------------------------------------- */
-/* database code (cut and pasted from the unidb package) */
-
-static unsigned long
-gethash(const char *s, int len, int scale)
-{
- int i;
- unsigned long h = 0;
- unsigned long ix;
- for (i = 0; i < len; i++) {
- h = (h * scale) + (unsigned char) toupper(s[i]);
- ix = h & 0xff000000;
- if (ix)
- h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
- }
- return h;
-}
-
-static int
-getname(Py_UCS4 code, char* buffer, int buflen)
-{
- int offset;
- int i;
- int word;
- unsigned char* w;
-
- if (code < 0 || code >= 65536)
- return 0;
-
- /* get offset into phrasebook */
- offset = phrasebook_offset1[(code>>phrasebook_shift)];
- offset = phrasebook_offset2[(offset<<phrasebook_shift) +
- (code&((1<<phrasebook_shift)-1))];
- if (!offset)
- return 0;
-
- i = 0;
-
- for (;;) {
- /* get word index */
- word = phrasebook[offset] - phrasebook_short;
- if (word >= 0) {
- word = (word << 8) + phrasebook[offset+1];
- offset += 2;
- } else
- word = phrasebook[offset++];
- if (i) {
- if (i > buflen)
- return 0; /* buffer overflow */
- buffer[i++] = ' ';
- }
- /* copy word string from lexicon. the last character in the
- word has bit 7 set. the last word in a string ends with
- 0x80 */
- w = lexicon + lexicon_offset[word];
- while (*w < 128) {
- if (i >= buflen)
- return 0; /* buffer overflow */
- buffer[i++] = *w++;
- }
- if (i >= buflen)
- return 0; /* buffer overflow */
- buffer[i++] = *w & 127;
- if (*w == 128)
- break; /* end of word */
- }
-
- return 1;
-}
-
-static int
-cmpname(int code, const char* name, int namelen)
-{
- /* check if code corresponds to the given name */
- int i;
- char buffer[NAME_MAXLEN];
- if (!getname(code, buffer, sizeof(buffer)))
- return 0;
- for (i = 0; i < namelen; i++) {
- if (toupper(name[i]) != buffer[i])
- return 0;
- }
- return buffer[namelen] == '\0';
-}
-
-static int
-getcode(const char* name, int namelen, Py_UCS4* code)
-{
- unsigned int h, v;
- unsigned int mask = code_size-1;
- unsigned int i, incr;
-
- /* the following is the same as python's dictionary lookup, with
- only minor changes. see the makeunicodedata script for more
- details */
-
- h = (unsigned int) gethash(name, namelen, code_magic);
- i = (~h) & mask;
- v = code_hash[i];
- if (!v)
- return 0;
- if (cmpname(v, name, namelen)) {
- *code = v;
- return 1;
- }
- incr = (h ^ (h >> 3)) & mask;
- if (!incr)
- incr = mask;
- for (;;) {
- i = (i + incr) & mask;
- v = code_hash[i];
- if (!v)
- return -1;
- if (cmpname(v, name, namelen)) {
- *code = v;
- return 1;
- }
- incr = incr << 1;
- if (incr > mask)
- incr = incr ^ code_poly;
- }
-}
-
-static const _PyUnicode_Name_CAPI hashAPI =
-{
- sizeof(_PyUnicode_Name_CAPI),
- getname,
- getcode
-};
-
-/* -------------------------------------------------------------------- */
-/* Python bindings */
-
-static PyObject *
-ucnhash_getname(PyObject* self, PyObject* args)
-{
- char name[NAME_MAXLEN];
-
- int code;
- if (!PyArg_ParseTuple(args, "i", &code))
- return NULL;
-
- if (!getname((Py_UCS4) code, name, sizeof(name))) {
- PyErr_SetString(PyExc_ValueError, "undefined character code");
- return NULL;
- }
-
- return Py_BuildValue("s", name);
-}
-
-static PyObject *
-ucnhash_getcode(PyObject* self, PyObject* args)
-{
- Py_UCS4 code;
-
- char* name;
- int namelen;
- if (!PyArg_ParseTuple(args, "s#", &name, &namelen))
- return NULL;
-
- if (!getcode(name, namelen, &code)) {
- PyErr_SetString(PyExc_ValueError, "undefined character name");
- return NULL;
- }
-
- return Py_BuildValue("i", code);
-}
static
PyMethodDef ucnhash_methods[] =
{
- {"getname", ucnhash_getname, 1},
- {"getcode", ucnhash_getcode, 1},
{NULL, NULL},
};
-static char *ucnhash_docstring = "ucnhash hash function module";
-
+static char *ucnhash_docstring = "ucnhash hash function module (obsolete)";
-/* Create PyMethodObjects and register them in the module's dict */
DL_EXPORT(void)
initucnhash(void)
{
- PyObject *m, *d, *v;
-
- m = Py_InitModule4(
+ Py_InitModule4(
"ucnhash", /* Module name */
ucnhash_methods, /* Method list */
ucnhash_docstring, /* Module doc-string */
(PyObject *)NULL, /* always pass this as *self */
PYTHON_API_VERSION); /* API Version */
- if (!m)
- return;
-
- d = PyModule_GetDict(m);
- if (!d)
- return;
-
- /* Export C API */
- v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
- PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
- Py_XDECREF(v);
}
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 06e5f04..dfe2f7b 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -12,6 +12,9 @@
------------------------------------------------------------------------ */
#include "Python.h"
+#include "ucnhash.h"
+
+/* character properties */
typedef struct {
const unsigned char category; /* index into
@@ -52,8 +55,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
PyObject *defobj = NULL;
long rc;
- if (!PyArg_ParseTuple(args, "O!|O:decimal",
- &PyUnicode_Type, &v, &defobj))
+ if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
@@ -82,8 +84,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
PyObject *defobj = NULL;
long rc;
- if (!PyArg_ParseTuple(args, "O!|O:digit",
- &PyUnicode_Type, &v, &defobj))
+ if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
@@ -93,8 +94,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
- PyErr_SetString(PyExc_ValueError,
- "not a digit");
+ PyErr_SetString(PyExc_ValueError, "not a digit");
return NULL;
}
else {
@@ -112,8 +112,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
PyObject *defobj = NULL;
double rc;
- if (!PyArg_ParseTuple(args, "O!|O:numeric",
- &PyUnicode_Type, &v, &defobj))
+ if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
@@ -123,8 +122,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
- PyErr_SetString(PyExc_ValueError,
- "not a numeric character");
+ PyErr_SetString(PyExc_ValueError, "not a numeric character");
return NULL;
}
else {
@@ -252,22 +250,231 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
return PyString_FromString(decomp);
}
+/* -------------------------------------------------------------------- */
+/* unicode character name tables */
+
+/* data file generated by Tools/unicode/makeunicodedata.py */
+#include "unicodename_db.h"
+
+/* -------------------------------------------------------------------- */
+/* database code (cut and pasted from the unidb package) */
+
+static unsigned long
+gethash(const char *s, int len, int scale)
+{
+ int i;
+ unsigned long h = 0;
+ unsigned long ix;
+ for (i = 0; i < len; i++) {
+ h = (h * scale) + (unsigned char) toupper(s[i]);
+ ix = h & 0xff000000;
+ if (ix)
+ h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
+ }
+ return h;
+}
+
+static int
+getname(Py_UCS4 code, char* buffer, int buflen)
+{
+ int offset;
+ int i;
+ int word;
+ unsigned char* w;
+
+ if (code < 0 || code >= 65536)
+ return 0;
+
+ /* get offset into phrasebook */
+ offset = phrasebook_offset1[(code>>phrasebook_shift)];
+ offset = phrasebook_offset2[(offset<<phrasebook_shift) +
+ (code&((1<<phrasebook_shift)-1))];
+ if (!offset)
+ return 0;
+
+ i = 0;
+
+ for (;;) {
+ /* get word index */
+ word = phrasebook[offset] - phrasebook_short;
+ if (word >= 0) {
+ word = (word << 8) + phrasebook[offset+1];
+ offset += 2;
+ } else
+ word = phrasebook[offset++];
+ if (i) {
+ if (i > buflen)
+ return 0; /* buffer overflow */
+ buffer[i++] = ' ';
+ }
+ /* copy word string from lexicon. the last character in the
+ word has bit 7 set. the last word in a string ends with
+ 0x80 */
+ w = lexicon + lexicon_offset[word];
+ while (*w < 128) {
+ if (i >= buflen)
+ return 0; /* buffer overflow */
+ buffer[i++] = *w++;
+ }
+ if (i >= buflen)
+ return 0; /* buffer overflow */
+ buffer[i++] = *w & 127;
+ if (*w == 128)
+ break; /* end of word */
+ }
+
+ return 1;
+}
+
+static int
+cmpname(int code, const char* name, int namelen)
+{
+ /* check if code corresponds to the given name */
+ int i;
+ char buffer[NAME_MAXLEN];
+ if (!getname(code, buffer, sizeof(buffer)))
+ return 0;
+ for (i = 0; i < namelen; i++) {
+ if (toupper(name[i]) != buffer[i])
+ return 0;
+ }
+ return buffer[namelen] == '\0';
+}
+
+static int
+getcode(const char* name, int namelen, Py_UCS4* code)
+{
+ unsigned int h, v;
+ unsigned int mask = code_size-1;
+ unsigned int i, incr;
+
+ /* the following is the same as python's dictionary lookup, with
+ only minor changes. see the makeunicodedata script for more
+ details */
+
+ h = (unsigned int) gethash(name, namelen, code_magic);
+ i = (~h) & mask;
+ v = code_hash[i];
+ if (!v)
+ return 0;
+ if (cmpname(v, name, namelen)) {
+ *code = v;
+ return 1;
+ }
+ incr = (h ^ (h >> 3)) & mask;
+ if (!incr)
+ incr = mask;
+ for (;;) {
+ i = (i + incr) & mask;
+ v = code_hash[i];
+ if (!v)
+ return -1;
+ if (cmpname(v, name, namelen)) {
+ *code = v;
+ return 1;
+ }
+ incr = incr << 1;
+ if (incr > mask)
+ incr = incr ^ code_poly;
+ }
+}
+
+static const _PyUnicode_Name_CAPI hashAPI =
+{
+ sizeof(_PyUnicode_Name_CAPI),
+ getname,
+ getcode
+};
+
+/* -------------------------------------------------------------------- */
+/* Python bindings */
+
+static PyObject *
+unicodedata_name(PyObject* self, PyObject* args)
+{
+ char name[NAME_MAXLEN];
+
+ PyUnicodeObject* v;
+ PyObject* defobj = NULL;
+ if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
+ return NULL;
+
+ if (PyUnicode_GET_SIZE(v) != 1) {
+ PyErr_SetString(PyExc_TypeError,
+ "need a single Unicode character as parameter");
+ return NULL;
+ }
+
+ if (!getname((Py_UCS4) *PyUnicode_AS_UNICODE(v), name, sizeof(name))) {
+ if (defobj == NULL) {
+ PyErr_SetString(PyExc_ValueError, "no such name");
+ return NULL;
+ }
+ else {
+ Py_INCREF(defobj);
+ return defobj;
+ }
+ }
+
+ return Py_BuildValue("s", name);
+}
+
+static PyObject *
+unicodedata_lookup(PyObject* self, PyObject* args)
+{
+ Py_UCS4 code;
+ Py_UNICODE str[1];
+
+ char* name;
+ int namelen;
+ if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
+ return NULL;
+
+ if (!getcode(name, namelen, &code)) {
+ PyErr_SetString(PyExc_KeyError, "undefined character name");
+ return NULL;
+ }
+
+ str[0] = (Py_UNICODE) code;
+ return PyUnicode_FromUnicode(str, 1);
+}
+
/* XXX Add doc strings. */
static PyMethodDef unicodedata_functions[] = {
- {"decimal", unicodedata_decimal, 1},
- {"digit", unicodedata_digit, 1},
- {"numeric", unicodedata_numeric, 1},
- {"category", unicodedata_category, 1},
- {"bidirectional", unicodedata_bidirectional, 1},
- {"combining", unicodedata_combining, 1},
- {"mirrored", unicodedata_mirrored, 1},
- {"decomposition", unicodedata_decomposition, 1},
+ {"decimal", unicodedata_decimal, METH_VARARGS},
+ {"digit", unicodedata_digit, METH_VARARGS},
+ {"numeric", unicodedata_numeric, METH_VARARGS},
+ {"category", unicodedata_category, METH_VARARGS},
+ {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
+ {"combining", unicodedata_combining, METH_VARARGS},
+ {"mirrored", unicodedata_mirrored, METH_VARARGS},
+ {"decomposition",unicodedata_decomposition, METH_VARARGS},
+ {"name", unicodedata_name, METH_VARARGS},
+ {"lookup", unicodedata_lookup, METH_VARARGS},
{NULL, NULL} /* sentinel */
};
+static char *unicodedata_docstring = "unicode character database";
+
DL_EXPORT(void)
initunicodedata(void)
{
- Py_InitModule("unicodedata", unicodedata_functions);
+ PyObject *m, *d, *v;
+
+ m = Py_InitModule4(
+ "unicodedata", unicodedata_functions,
+ unicodedata_docstring, NULL, PYTHON_API_VERSION);
+ if (!m)
+ return;
+
+ d = PyModule_GetDict(m);
+ if (!d)
+ return;
+
+ /* Export C API */
+ v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
+ PyDict_SetItemString(d, "ucnhash_CAPI", v);
+ Py_XDECREF(v);
+
}
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 585afe6..39ea071 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1103,7 +1103,7 @@ int unicodeescape_decoding_error(const char **source,
}
}
-static _PyUnicode_Name_CAPI *unicode_names = NULL;
+static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
int size,
@@ -1236,18 +1236,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
/* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data...
*/
- if (unicode_names == NULL) {
+ if (ucnhash_CAPI == NULL) {
PyObject *mod = 0, *v = 0;
- mod = PyImport_ImportModule("ucnhash");
+ mod = PyImport_ImportModule("unicodedata");
if (mod == NULL)
goto ucnhashError;
- v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
+ v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
Py_DECREF(mod);
if (v == NULL)
goto ucnhashError;
- unicode_names = PyCObject_AsVoidPtr(v);
+ ucnhash_CAPI = PyCObject_AsVoidPtr(v);
Py_DECREF(v);
- if (unicode_names == NULL)
+ if (ucnhash_CAPI == NULL)
goto ucnhashError;
}
@@ -1259,7 +1259,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
while (*endBrace != '}' && endBrace < end)
endBrace++;
if (endBrace != end && *endBrace == '}') {
- if (!unicode_names->getcode(start, endBrace-start, &chr)) {
+ if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Invalid Unicode Character Name")
@@ -1312,8 +1312,10 @@ store:
return (PyObject *)v;
ucnhashError:
- PyErr_SetString(PyExc_UnicodeError,
- "\\N escapes not supported (can't load ucnhash module)");
+ PyErr_SetString(
+ PyExc_UnicodeError,
+ "\\N escapes not supported (can't load unicodedata module)"
+ );
return NULL;
onError: