From 0fdb90cafe596a03a5c3005a21e8fa2a230803e5 Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Fri, 19 Jan 2001 09:45:02 +0000 Subject: refactored the unicodeobject/ucnhash interface, to hide the implementation details inside the ucnhash module. also cleaned up the unicode copyright blurb a little; Secret Labs' internal revision history isn't that interesting... --- Include/ucnhash.h | 45 ++++++++------ Lib/test/test_ucn.py | 14 ++--- Modules/ucnhash.c | 155 +++++++++++++++++++++++------------------------- Objects/unicodeobject.c | 142 ++++++++++++-------------------------------- 4 files changed, 146 insertions(+), 210 deletions(-) diff --git a/Include/ucnhash.h b/Include/ucnhash.h index 7bf3f5d..a664336 100644 --- a/Include/ucnhash.h +++ b/Include/ucnhash.h @@ -1,20 +1,29 @@ +/* Unicode name database interface */ -#include "Python.h" -#include - -/* --- C API ----------------------------------------------------*/ -/* C API for usage by other Python modules */ -typedef struct _Py_UCNHashAPI -{ - unsigned long cKeys; - unsigned long cchMax; - unsigned long (*hash)(const char *key, unsigned int cch); - const void *(*getValue)(unsigned long iKey); -} _Py_UCNHashAPI; - -typedef struct -{ - const char *pszUCN; - Py_UCS4 value; -} _Py_UnicodeCharacterName; +#ifndef Py_UCNHASH_H +#define Py_UCNHASH_H +#ifdef __cplusplus +extern "C" { +#endif +/* revised ucnhash CAPI interface (exported through a PyCObject) */ + +typedef struct { + + /* Size of this struct */ + int size; + + /* Get name for a given character code. Returns non-zero if + success, zero if not. Does not set Python exceptions. */ + int (*getname)(Py_UCS4 code, char* buffer, int buflen); + + /* Get character code for a given name. Same error handling + as for getname. */ + int (*getcode)(const char* name, int namelen, Py_UCS4* code); + +} _PyUnicode_Name_CAPI; + +#ifdef __cplusplus +} +#endif +#endif /* !Py_UCNHASH_H */ diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py index 92155be..a33d111 100644 --- a/Lib/test/test_ucn.py +++ b/Lib/test/test_ucn.py @@ -50,22 +50,20 @@ print "done." # strict error testing: print "Testing unicode character name expansion strict error handling....", -k_cchMaxUnicodeName = 83 - -s = "\N{" + "1" * (k_cchMaxUnicodeName + 2) + "}" try: - unicode(s, 'unicode-escape', 'strict') + unicode("\N{blah}", 'unicode-escape', 'strict') except UnicodeError: pass else: - raise AssertionError, "failed to raise an exception when presented " \ - "with a UCN > k_cchMaxUnicodeName" + raise AssertionError, "failed to raise an exception when given a bogus character name" + try: - unicode("\N{blah}", 'unicode-escape', 'strict') + unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict') except UnicodeError: pass else: - raise AssertionError, "failed to raise an exception when given a bogus character name" + raise AssertionError, "failed to raise an exception when given a very " \ + "long bogus character name" try: unicode("\N{SPACE", 'unicode-escape', 'strict') diff --git a/Modules/ucnhash.c b/Modules/ucnhash.c index e5a9bad..67a8895 100644 --- a/Modules/ucnhash.c +++ b/Modules/ucnhash.c @@ -1,5 +1,13 @@ +#include "Python.h" #include "ucnhash.h" +/* Modified for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */ + +typedef struct { + const char* pszUCN; + Py_UCS4 value; +}_Py_UnicodeCharacterName; + /* * The hash is produced using the algorithm described in * "Optimal algorithms for minimal perfect hashing", @@ -14,11 +22,11 @@ * Generated on: Fri Jul 14 08:00:58 2000 */ +#define cKeys 10538 #define k_cHashElements 18836 #define k_cchMaxKey 83 #define k_cKeys 10538 - staticforward const unsigned short G[k_cHashElements]; staticforward const _Py_UnicodeCharacterName aucn[k_cKeys]; @@ -34,8 +42,7 @@ static long f1(const char *key, unsigned int cch) while (--len >= 0) { /* (1000003 * x) ^ toupper(*(p++)) - * translated to handle > 32 bit longs - */ + * translated to handle > 32 bit longs */ x = (0xf4243 * x); x = x & 0xFFFFFFFF; x = x ^ toupper(*(p++)); @@ -98,110 +105,96 @@ static long f2(const char *key, unsigned int cch) } -static unsigned long hash(const char *key, unsigned int cch) +static unsigned long +hash(const char *key, unsigned int cch) { return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) % k_cHashElements; } -const void *getValue(unsigned long iKey) +const _Py_UnicodeCharacterName * +getValue(unsigned long iKey) { - return &aucn[iKey]; + return (_Py_UnicodeCharacterName *) &aucn[iKey]; } -/* Helper for adding objects to dictionaries. Check for errors with - PyErr_Occurred() */ -static -void insobj(PyObject *dict, - char *name, - PyObject *v) +static int +mystrnicmp(const char *s1, const char *s2, size_t count) { - PyDict_SetItemString(dict, name, v); - Py_XDECREF(v); + char c1, c2; + + if (count) { + do { + c1 = tolower(*(s1++)); + c2 = tolower(*(s2++)); + } while (--count && c1 == c2); + return c1 - c2; + } + + return 0; } -static const _Py_UCNHashAPI hashAPI = +/* bindings for the new API */ + +static int +ucnhash_getname(Py_UCS4 code, char* buffer, int buflen) { - k_cKeys, - k_cchMaxKey, - &hash, - &getValue, + return 0; +} + +static int +ucnhash_getcode(const char* name, int namelen, Py_UCS4* code) +{ + unsigned long j; + + j = hash(name, namelen); + + if (j > cKeys || mystrnicmp(name, getValue(j)->pszUCN, namelen) != 0) + return 0; + + *code = getValue(j)->value; + + return 1; +} + +static const _PyUnicode_Name_CAPI hashAPI = +{ + sizeof(_PyUnicode_Name_CAPI), + ucnhash_getname, + ucnhash_getcode }; static -PyMethodDef Module_methods[] = +PyMethodDef ucnhash_methods[] = { {NULL, NULL}, }; -static char *Module_docstring = "ucnhash hash function module"; - -/* Error reporting for module init functions */ - -#define Py_ReportModuleInitError(modname) { \ - PyObject *exc_type, *exc_value, *exc_tb; \ - PyObject *str_type, *str_value; \ - \ - /* Fetch error objects and convert them to strings */ \ - PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \ - if (exc_type && exc_value) { \ - str_type = PyObject_Str(exc_type); \ - str_value = PyObject_Str(exc_value); \ - } \ - else { \ - str_type = NULL; \ - str_value = NULL; \ - } \ - /* Try to format a more informative error message using the \ - original error */ \ - if (str_type && str_value && \ - PyString_Check(str_type) && PyString_Check(str_value)) \ - PyErr_Format( \ - PyExc_ImportError, \ - "initialization of module "modname" failed " \ - "(%s:%s)", \ - PyString_AS_STRING(str_type), \ - PyString_AS_STRING(str_value)); \ - else \ - PyErr_SetString( \ - PyExc_ImportError, \ - "initialization of module "modname" failed"); \ - Py_XDECREF(str_type); \ - Py_XDECREF(str_value); \ - Py_XDECREF(exc_type); \ - Py_XDECREF(exc_value); \ - Py_XDECREF(exc_tb); \ -} +static char *ucnhash_docstring = "ucnhash hash function module"; /* Create PyMethodObjects and register them in the module's dict */ DL_EXPORT(void) initucnhash(void) { - PyObject *module, *moddict; - /* Create module */ - module = Py_InitModule4("ucnhash", /* Module name */ - Module_methods, /* Method list */ - Module_docstring, /* Module doc-string */ - (PyObject *)NULL, /* always pass this as *self */ - PYTHON_API_VERSION); /* API Version */ - if (module == NULL) - goto onError; - /* Add some constants to the module's dict */ - moddict = PyModule_GetDict(module); - if (moddict == NULL) - goto onError; + PyObject *m, *d, *v; + + m = Py_InitModule4( + "ucnhash", /* Module name */ + ucnhash_methods, /* Method list */ + ucnhash_docstring, /* Module doc-string */ + (PyObject *)NULL, /* always pass this as *self */ + PYTHON_API_VERSION); /* API Version */ + if (!m) + return; + + d = PyModule_GetDict(m); + if (!d) + return; /* Export C API */ - insobj( - moddict, - "ucnhashAPI", - PyCObject_FromVoidPtr((void *)&hashAPI, NULL)); - -onError: - /* Check for errors and report them */ - if (PyErr_Occurred()) - Py_ReportModuleInitError("ucnhash"); - return; + v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); + PyDict_SetItemString(d, "Unicode_Names_CAPI", v); + Py_XDECREF(v); } static const unsigned short G[] = diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c1f3d54..a06c40b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6,61 +6,35 @@ Unicode Integration Proposal (see file Misc/unicode.txt). Copyright (c) Corporation for National Research Initiatives. +-------------------------------------------------------------------- +The original string type implementation is: + + Copyright (c) 1999 by Secret Labs AB + Copyright (c) 1999 by Fredrik Lundh + +By obtaining, using, and/or copying this software and/or its +associated documentation, you agree that you have read, understood, +and will comply with the following terms and conditions: + +Permission to use, copy, modify, and distribute this software and its +associated documentation for any purpose and without fee is hereby +granted, provided that the above copyright notice appears in all +copies, and that both that copyright notice and this permission notice +appear in supporting documentation, and that the name of Secret Labs +AB or the author not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR +ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +-------------------------------------------------------------------- - Original header: - -------------------------------------------------------------------- - - * Yet another Unicode string type for Python. This type supports the - * 16-bit Basic Multilingual Plane (BMP) only. - * - * Note that this string class supports embedded NULL characters. End - * of string is given by the length attribute. However, the internal - * representation always stores a trailing NULL to make it easier to - * use unicode strings with standard APIs. - * - * History: - * 1999-01-23 fl Created - * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support - * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc. - * 1999-03-06 fl Moved declarations to separate file, etc. - * 1999-06-13 fl Changed join method semantics according to Tim's proposal - * 1999-08-10 fl Some minor tweaks - * - * Written by Fredrik Lundh, January 1999. - * - * Copyright (c) 1999 by Secret Labs AB. - * Copyright (c) 1999 by Fredrik Lundh. - * - * fredrik@pythonware.com - * http://www.pythonware.com - * - * -------------------------------------------------------------------- - * This Unicode String Type is - * - * Copyright (c) 1999 by Secret Labs AB - * Copyright (c) 1999 by Fredrik Lundh - * - * By obtaining, using, and/or copying this software and/or its - * associated documentation, you agree that you have read, understood, - * and will comply with the following terms and conditions: - * - * Permission to use, copy, modify, and distribute this software and its - * associated documentation for any purpose and without fee is hereby - * granted, provided that the above copyright notice appears in all - * copies, and that both that copyright notice and this permission notice - * appear in supporting documentation, and that the name of Secret Labs - * AB or the author not be used in advertising or publicity pertaining to - * distribution of the software without specific, written prior - * permission. - * - * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO - * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT - * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * -------------------------------------------------------------------- */ +*/ #include "Python.h" @@ -1129,27 +1103,7 @@ int unicodeescape_decoding_error(const char **source, } } -static _Py_UCNHashAPI *pucnHash = NULL; - -static -int mystrnicmp(const char *s1, const char *s2, size_t count) -{ - char c1, c2; - - if (count) - { - do - { - c1 = tolower(*(s1++)); - c2 = tolower(*(s2++)); - } - while(--count && c1 == c2); - - return c1 - c2; - } - - return 0; -} +static _PyUnicode_Name_CAPI *unicode_names = NULL; PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, int size, @@ -1282,55 +1236,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, /* Ok, we need to deal with Unicode Character Names now, * make sure we've imported the hash table data... */ - if (pucnHash == NULL) { + if (unicode_names == NULL) { PyObject *mod = 0, *v = 0; mod = PyImport_ImportModule("ucnhash"); if (mod == NULL) goto onError; - v = PyObject_GetAttrString(mod,"ucnhashAPI"); + v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI"); Py_DECREF(mod); if (v == NULL) goto onError; - pucnHash = PyCObject_AsVoidPtr(v); + unicode_names = PyCObject_AsVoidPtr(v); Py_DECREF(v); - if (pucnHash == NULL) + if (unicode_names == NULL) goto onError; } if (*s == '{') { const char *start = s + 1; const char *endBrace = start; - unsigned long j; - - /* look for either the closing brace, or we - * exceed the maximum length of the unicode character names - */ - while (*endBrace != '}' && - (unsigned int)(endBrace - start) <= - pucnHash->cchMax && - endBrace < end) - { + + /* look for the closing brace */ + while (*endBrace != '}' && endBrace < end) endBrace++; - } if (endBrace != end && *endBrace == '}') { - j = pucnHash->hash(start, endBrace - start); - if (j > pucnHash->cKeys || - mystrnicmp( - start, - ((_Py_UnicodeCharacterName *) - (pucnHash->getValue(j)))->pszUCN, - (int)(endBrace - start)) != 0) - { + if (!unicode_names->getcode(start, endBrace-start, &chr)) { if (unicodeescape_decoding_error( &s, &x, errors, - "Invalid Unicode Character Name")) - { + "Invalid Unicode Character Name") + ) goto onError; - } goto ucnFallthrough; } - chr = ((_Py_UnicodeCharacterName *) - (pucnHash->getValue(j)))->value; s = endBrace + 1; goto store; } else { -- cgit v0.12