summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Include/ucnhash.h 45
-rw-r--r-- Lib/test/test_ucn.py 14
-rw-r--r-- Modules/ucnhash.c 155
-rw-r--r-- Objects/unicodeobject.c 142
4 files changed, 146 insertions, 210 deletions
diff --git a/Include/ucnhash.h b/Include/ucnhash.h
index 7bf3f5d..a664336 100644
--- a/Include/ucnhash.h
+++ b/Include/ucnhash.h
@@ -1,20 +1,29 @@
+/* Unicode name database interface */
-#include "Python.h"
-#include <stdlib.h>
-
-/* --- C API ----------------------------------------------------*/
-/* C API for usage by other Python modules */
-typedef struct _Py_UCNHashAPI
-{
- unsigned long cKeys;
- unsigned long cchMax;
- unsigned long (*hash)(const char *key, unsigned int cch);
- const void *(*getValue)(unsigned long iKey);
-} _Py_UCNHashAPI;
-
-typedef struct
-{
- const char *pszUCN;
- Py_UCS4 value;
-} _Py_UnicodeCharacterName;
+#ifndef Py_UCNHASH_H
+#define Py_UCNHASH_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* revised ucnhash CAPI interface (exported through a PyCObject) */
+
+/* Table of function pointers through which the Unicode name database
+   is reached. */
+typedef struct {
+
+ /* Size of this struct (the provider stores
+    sizeof(_PyUnicode_Name_CAPI) here) */
+ int size;
+
+ /* Get name for a given character code. Returns non-zero if
+ success, zero if not. Does not set Python exceptions.
+    NOTE(review): buffer/buflen are presumably the output buffer
+    and its capacity -- confirm against an implementation. */
+ int (*getname)(Py_UCS4 code, char* buffer, int buflen);
+
+ /* Get character code for a given name. Same error handling
+ as for getname.
+    name is passed together with an explicit length (namelen);
+    on success the result is stored through code. */
+ int (*getcode)(const char* name, int namelen, Py_UCS4* code);
+
+} _PyUnicode_Name_CAPI;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_UCNHASH_H */
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
index 92155be..a33d111 100644
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -50,22 +50,20 @@ print "done."
# strict error testing:
print "Testing unicode character name expansion strict error handling....",
-k_cchMaxUnicodeName = 83
-
-s = "\N{" + "1" * (k_cchMaxUnicodeName + 2) + "}"
try:
- unicode(s, 'unicode-escape', 'strict')
+ unicode("\N{blah}", 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
- raise AssertionError, "failed to raise an exception when presented " \
- "with a UCN > k_cchMaxUnicodeName"
+ raise AssertionError, "failed to raise an exception when given a bogus character name"
+
try:
- unicode("\N{blah}", 'unicode-escape', 'strict')
+ unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
- raise AssertionError, "failed to raise an exception when given a bogus character name"
+ raise AssertionError, "failed to raise an exception when given a very " \
+ "long bogus character name"
try:
unicode("\N{SPACE", 'unicode-escape', 'strict')
diff --git a/Modules/ucnhash.c b/Modules/ucnhash.c
index e5a9bad..67a8895 100644
--- a/Modules/ucnhash.c
+++ b/Modules/ucnhash.c
@@ -1,5 +1,13 @@
+#include "Python.h"
#include "ucnhash.h"
+/* Modified for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
+
+/* One entry of the name table: a character name and its code. */
+typedef struct {
+ /* character name; lookups compare it case-insensitively */
+ const char* pszUCN;
+ /* character code reported for that name */
+ Py_UCS4 value;
+}_Py_UnicodeCharacterName;
+
/*
* The hash is produced using the algorithm described in
* "Optimal algorithms for minimal perfect hashing",
@@ -14,11 +22,11 @@
* Generated on: Fri Jul 14 08:00:58 2000
*/
+#define cKeys 10538
#define k_cHashElements 18836
#define k_cchMaxKey 83
#define k_cKeys 10538
-
staticforward const unsigned short G[k_cHashElements];
staticforward const _Py_UnicodeCharacterName aucn[k_cKeys];
@@ -34,8 +42,7 @@ static long f1(const char *key, unsigned int cch)
while (--len >= 0)
{
/* (1000003 * x) ^ toupper(*(p++))
- * translated to handle > 32 bit longs
- */
+ * translated to handle > 32 bit longs */
x = (0xf4243 * x);
x = x & 0xFFFFFFFF;
x = x ^ toupper(*(p++));
@@ -98,110 +105,96 @@ static long f2(const char *key, unsigned int cch)
}
-static unsigned long hash(const char *key, unsigned int cch)
+/* Map the cch characters at key to a table index: look up G at f1(key)
+   and at f2(key), add the two values and reduce the sum modulo
+   k_cHashElements.  The result can therefore be as large as
+   k_cHashElements - 1, which is more than the number of names
+   (k_cKeys), so callers have to range-check it. */
+static unsigned long
+hash(const char *key, unsigned int cch)
{
return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) % k_cHashElements;
}
-const void *getValue(unsigned long iKey)
+/* Return a pointer to entry iKey of the name table aucn.  The index is
+   not range-checked here; aucn holds k_cKeys entries, so the caller
+   must pass a value below that. */
+const _Py_UnicodeCharacterName *
+getValue(unsigned long iKey)
{
- return &aucn[iKey];
+ return (_Py_UnicodeCharacterName *) &aucn[iKey];
}
-/* Helper for adding objects to dictionaries. Check for errors with
- PyErr_Occurred() */
-static
-void insobj(PyObject *dict,
- char *name,
- PyObject *v)
+/* Case-insensitive comparison of the first count characters of s1 and
+   s2.  Returns 0 if they match (or if count is 0), otherwise the
+   difference between the first pair of lowercased characters that
+   differ.  A NUL is treated like any other character: the scan only
+   stops at a mismatch or after count characters. */
+static int
+mystrnicmp(const char *s1, const char *s2, size_t count)
{
- PyDict_SetItemString(dict, name, v);
- Py_XDECREF(v);
+ int c1, c2;
+
+ if (count) {
+ do {
+ /* tolower() is only defined for unsigned char values and
+    EOF; plain char may be negative for bytes >= 0x80, so
+    convert before calling it */
+ c1 = tolower((unsigned char) *(s1++));
+ c2 = tolower((unsigned char) *(s2++));
+ } while (--count && c1 == c2);
+ return c1 - c2;
+ }
+
+ return 0;
}
-static const _Py_UCNHashAPI hashAPI =
+/* bindings for the new API */
+
+/* getname slot of the name API.  Lookup by character code is not
+   implemented here: the function ignores its arguments and always
+   returns 0, the API's failure value. */
+static int
+ucnhash_getname(Py_UCS4 code, char* buffer, int buflen)
{
- k_cKeys,
- k_cchMaxKey,
- &hash,
- &getValue,
+ return 0;
+}
+
+/* getcode slot of the name API: look up the character whose name is
+   the namelen characters at name (compared case-insensitively; name
+   is not required to be NUL-terminated).  Returns 1 and stores the
+   character code in *code on success; returns 0 and leaves *code
+   untouched if there is no such name.  Does not set Python
+   exceptions. */
+static int
+ucnhash_getcode(const char* name, int namelen, Py_UCS4* code)
+{
+ const _Py_UnicodeCharacterName *entry;
+ unsigned long j;
+
+ j = hash(name, namelen);
+
+ /* hash() ranges over k_cHashElements slots but the name table only
+    has cKeys entries, so the valid indexes are 0 .. cKeys-1 */
+ if (j >= cKeys)
+ return 0;
+
+ /* the hash only nominates a candidate entry; accept it only if it
+    is this name, and the whole name rather than a prefix of it
+    (this also rejects an empty name) */
+ entry = getValue(j);
+ if (mystrnicmp(name, entry->pszUCN, namelen) != 0 ||
+     entry->pszUCN[namelen] != '\0')
+ return 0;
+
+ *code = entry->value;
+
+ return 1;
+}
+
+/* The single instance of the name API; initucnhash() exports its
+   address wrapped in a PyCObject. */
+static const _PyUnicode_Name_CAPI hashAPI =
+{
+ sizeof(_PyUnicode_Name_CAPI),
+ ucnhash_getname,
+ ucnhash_getcode
};
+/* Method table handed to Py_InitModule4.  It lists no functions, only
+   the {NULL, NULL} entry. */
static
-PyMethodDef Module_methods[] =
+PyMethodDef ucnhash_methods[] =
{
{NULL, NULL},
};
-static char *Module_docstring = "ucnhash hash function module";
-
-/* Error reporting for module init functions */
-
-#define Py_ReportModuleInitError(modname) { \
- PyObject *exc_type, *exc_value, *exc_tb; \
- PyObject *str_type, *str_value; \
- \
- /* Fetch error objects and convert them to strings */ \
- PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \
- if (exc_type && exc_value) { \
- str_type = PyObject_Str(exc_type); \
- str_value = PyObject_Str(exc_value); \
- } \
- else { \
- str_type = NULL; \
- str_value = NULL; \
- } \
- /* Try to format a more informative error message using the \
- original error */ \
- if (str_type && str_value && \
- PyString_Check(str_type) && PyString_Check(str_value)) \
- PyErr_Format( \
- PyExc_ImportError, \
- "initialization of module "modname" failed " \
- "(%s:%s)", \
- PyString_AS_STRING(str_type), \
- PyString_AS_STRING(str_value)); \
- else \
- PyErr_SetString( \
- PyExc_ImportError, \
- "initialization of module "modname" failed"); \
- Py_XDECREF(str_type); \
- Py_XDECREF(str_value); \
- Py_XDECREF(exc_type); \
- Py_XDECREF(exc_value); \
- Py_XDECREF(exc_tb); \
-}
+static char *ucnhash_docstring = "ucnhash hash function module";
/* Create PyMethodObjects and register them in the module's dict */
+/* Module initialisation: create the "ucnhash" module and store the
+   address of hashAPI, wrapped in a PyCObject, in the module dict under
+   the key "Unicode_Names_CAPI".  Returns early, without exporting
+   anything, if the module or its dict cannot be obtained. */
DL_EXPORT(void)
initucnhash(void)
{
- PyObject *module, *moddict;
- /* Create module */
- module = Py_InitModule4("ucnhash", /* Module name */
- Module_methods, /* Method list */
- Module_docstring, /* Module doc-string */
- (PyObject *)NULL, /* always pass this as *self */
- PYTHON_API_VERSION); /* API Version */
- if (module == NULL)
- goto onError;
- /* Add some constants to the module's dict */
- moddict = PyModule_GetDict(module);
- if (moddict == NULL)
- goto onError;
+ PyObject *m, *d, *v;
+
+ m = Py_InitModule4(
+ "ucnhash", /* Module name */
+ ucnhash_methods, /* Method list */
+ ucnhash_docstring, /* Module doc-string */
+ (PyObject *)NULL, /* always pass this as *self */
+ PYTHON_API_VERSION); /* API Version */
+ if (!m)
+ return;
+
+ d = PyModule_GetDict(m);
+ if (!d)
+ return;
/* Export C API */
- insobj(
- moddict,
- "ucnhashAPI",
- PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
-
-onError:
- /* Check for errors and report them */
- if (PyErr_Occurred())
- Py_ReportModuleInitError("ucnhash");
- return;
+ v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
+ /* creating the wrapper can fail and yield NULL, which must not be
+    passed on as a dictionary value */
+ if (v) {
+ PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
+ Py_DECREF(v);
+ }
}
static const unsigned short G[] =
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c1f3d54..a06c40b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6,61 +6,35 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
Copyright (c) Corporation for National Research Initiatives.
+--------------------------------------------------------------------
+The original string type implementation is:
+
+ Copyright (c) 1999 by Secret Labs AB
+ Copyright (c) 1999 by Fredrik Lundh
+
+By obtaining, using, and/or copying this software and/or its
+associated documentation, you agree that you have read, understood,
+and will comply with the following terms and conditions:
+
+Permission to use, copy, modify, and distribute this software and its
+associated documentation for any purpose and without fee is hereby
+granted, provided that the above copyright notice appears in all
+copies, and that both that copyright notice and this permission notice
+appear in supporting documentation, and that the name of Secret Labs
+AB or the author not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
+THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+--------------------------------------------------------------------
- Original header:
- --------------------------------------------------------------------
-
- * Yet another Unicode string type for Python. This type supports the
- * 16-bit Basic Multilingual Plane (BMP) only.
- *
- * Note that this string class supports embedded NULL characters. End
- * of string is given by the length attribute. However, the internal
- * representation always stores a trailing NULL to make it easier to
- * use unicode strings with standard APIs.
- *
- * History:
- * 1999-01-23 fl Created
- * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
- * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
- * 1999-03-06 fl Moved declarations to separate file, etc.
- * 1999-06-13 fl Changed join method semantics according to Tim's proposal
- * 1999-08-10 fl Some minor tweaks
- *
- * Written by Fredrik Lundh, January 1999.
- *
- * Copyright (c) 1999 by Secret Labs AB.
- * Copyright (c) 1999 by Fredrik Lundh.
- *
- * fredrik@pythonware.com
- * http://www.pythonware.com
- *
- * --------------------------------------------------------------------
- * This Unicode String Type is
- *
- * Copyright (c) 1999 by Secret Labs AB
- * Copyright (c) 1999 by Fredrik Lundh
- *
- * By obtaining, using, and/or copying this software and/or its
- * associated documentation, you agree that you have read, understood,
- * and will comply with the following terms and conditions:
- *
- * Permission to use, copy, modify, and distribute this software and its
- * associated documentation for any purpose and without fee is hereby
- * granted, provided that the above copyright notice appears in all
- * copies, and that both that copyright notice and this permission notice
- * appear in supporting documentation, and that the name of Secret Labs
- * AB or the author not be used in advertising or publicity pertaining to
- * distribution of the software without specific, written prior
- * permission.
- *
- * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
- * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
- * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- * -------------------------------------------------------------------- */
+*/
#include "Python.h"
@@ -1129,27 +1103,7 @@ int unicodeescape_decoding_error(const char **source,
}
}
-static _Py_UCNHashAPI *pucnHash = NULL;
-
-static
-int mystrnicmp(const char *s1, const char *s2, size_t count)
-{
- char c1, c2;
-
- if (count)
- {
- do
- {
- c1 = tolower(*(s1++));
- c2 = tolower(*(s2++));
- }
- while(--count && c1 == c2);
-
- return c1 - c2;
- }
-
- return 0;
-}
+static _PyUnicode_Name_CAPI *unicode_names = NULL;
PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
int size,
@@ -1282,55 +1236,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
/* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data...
*/
- if (pucnHash == NULL) {
+ if (unicode_names == NULL) {
PyObject *mod = 0, *v = 0;
mod = PyImport_ImportModule("ucnhash");
if (mod == NULL)
goto onError;
- v = PyObject_GetAttrString(mod,"ucnhashAPI");
+ v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
Py_DECREF(mod);
if (v == NULL)
goto onError;
- pucnHash = PyCObject_AsVoidPtr(v);
+ unicode_names = PyCObject_AsVoidPtr(v);
Py_DECREF(v);
- if (pucnHash == NULL)
+ if (unicode_names == NULL)
goto onError;
}
if (*s == '{') {
const char *start = s + 1;
const char *endBrace = start;
- unsigned long j;
-
- /* look for either the closing brace, or we
- * exceed the maximum length of the unicode character names
- */
- while (*endBrace != '}' &&
- (unsigned int)(endBrace - start) <=
- pucnHash->cchMax &&
- endBrace < end)
- {
+
+ /* look for the closing brace */
+ while (*endBrace != '}' && endBrace < end)
endBrace++;
- }
if (endBrace != end && *endBrace == '}') {
- j = pucnHash->hash(start, endBrace - start);
- if (j > pucnHash->cKeys ||
- mystrnicmp(
- start,
- ((_Py_UnicodeCharacterName *)
- (pucnHash->getValue(j)))->pszUCN,
- (int)(endBrace - start)) != 0)
- {
+ if (!unicode_names->getcode(start, endBrace-start, &chr)) {
if (unicodeescape_decoding_error(
&s, &x, errors,
- "Invalid Unicode Character Name"))
- {
+ "Invalid Unicode Character Name")
+ )
goto onError;
- }
goto ucnFallthrough;
}
- chr = ((_Py_UnicodeCharacterName *)
- (pucnHash->getValue(j)))->value;
s = endBrace + 1;
goto store;
} else {