diff options
author | Walter Dörwald <walter@livinglogic.de> | 2005-10-06 20:29:57 (GMT) |
---|---|---|
committer | Walter Dörwald <walter@livinglogic.de> | 2005-10-06 20:29:57 (GMT) |
commit | d1c1e10f70212464415fdf2ab0bed4b5d32fdf32 (patch) | |
tree | 82db5a596a24d75e3caf0cd8054e2a68fe837332 /Objects | |
parent | 331649acc7479f6e10cf6f6d01118d90f58ae600 (diff) | |
download | cpython-d1c1e10f70212464415fdf2ab0bed4b5d32fdf32.zip cpython-d1c1e10f70212464415fdf2ab0bed4b5d32fdf32.tar.gz cpython-d1c1e10f70212464415fdf2ab0bed4b5d32fdf32.tar.bz2 |
Part of SF patch #1313939: Speedup charmap decoding by extending
PyUnicode_DecodeCharmap() the accept a unicode string as the mapping
argument which is used as a mapping table.
This code isn't used by any of the codecs yet.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 182 |
1 files changed, 107 insertions, 75 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5d096ed..7ab4d0c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2833,6 +2833,8 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, int extrachars = 0; PyObject *errorHandler = NULL; PyObject *exc = NULL; + Py_UNICODE *mapstring = NULL; + int maplen = 0; /* Default to Latin-1 */ if (mapping == NULL) @@ -2845,91 +2847,121 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); e = s + size; - while (s < e) { - unsigned char ch = *s; - PyObject *w, *x; - - /* Get mapping (char ordinal -> integer, Unicode char or None) */ - w = PyInt_FromLong((long)ch); - if (w == NULL) - goto onError; - x = PyObject_GetItem(mapping, w); - Py_DECREF(w); - if (x == NULL) { - if (PyErr_ExceptionMatches(PyExc_LookupError)) { - /* No mapping found means: mapping is undefined. */ - PyErr_Clear(); - x = Py_None; - Py_INCREF(x); - } else - goto onError; + if (PyUnicode_CheckExact(mapping)) { + mapstring = PyUnicode_AS_UNICODE(mapping); + maplen = PyUnicode_GET_SIZE(mapping); + while (s < e) { + unsigned char ch = *s; + Py_UNICODE x = 0xfffe; /* illegal value */ + + if (ch < maplen) + x = mapstring[ch]; + + if (x == 0xfffe) { + /* undefined mapping */ + outpos = p-PyUnicode_AS_UNICODE(v); + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "charmap", "character maps to <undefined>", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) { + goto onError; + } + continue; + } + *p++ = x; + ++s; } + } + else { + while (s < e) { + unsigned char ch = *s; + PyObject *w, *x; - /* Apply mapping */ - if (PyInt_Check(x)) { - long value = PyInt_AS_LONG(x); - if (value < 0 || value > 65535) { - PyErr_SetString(PyExc_TypeError, - "character mapping must be in range(65536)"); - Py_DECREF(x); + /* Get mapping (char ordinal -> integer, Unicode char or None) */ + w = PyInt_FromLong((long)ch); + if (w == NULL) goto onError; + x = PyObject_GetItem(mapping, w); + Py_DECREF(w); + if (x == NULL) { + if (PyErr_ExceptionMatches(PyExc_LookupError)) { + /* No mapping found means: mapping is undefined. */ + PyErr_Clear(); + x = Py_None; + Py_INCREF(x); + } else + goto onError; } - *p++ = (Py_UNICODE)value; - } - else if (x == Py_None) { - /* undefined mapping */ - outpos = p-PyUnicode_AS_UNICODE(v); - startinpos = s-starts; - endinpos = startinpos+1; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "charmap", "character maps to <undefined>", - starts, size, &startinpos, &endinpos, &exc, &s, - (PyObject **)&v, &outpos, &p)) { - Py_DECREF(x); - goto onError; + + /* Apply mapping */ + if (PyInt_Check(x)) { + long value = PyInt_AS_LONG(x); + if (value < 0 || value > 65535) { + PyErr_SetString(PyExc_TypeError, + "character mapping must be in range(65536)"); + Py_DECREF(x); + goto onError; + } + *p++ = (Py_UNICODE)value; } - continue; - } - else if (PyUnicode_Check(x)) { - int targetsize = PyUnicode_GET_SIZE(x); - - if (targetsize == 1) - /* 1-1 mapping */ - *p++ = *PyUnicode_AS_UNICODE(x); - - else if (targetsize > 1) { - /* 1-n mapping */ - if (targetsize > extrachars) { - /* resize first */ - int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); - int needed = (targetsize - extrachars) + \ - (targetsize << 2); - extrachars += needed; - if (_PyUnicode_Resize(&v, - PyUnicode_GET_SIZE(v) + needed) < 0) { - Py_DECREF(x); - goto onError; + else if (x == Py_None) { + /* undefined mapping */ + outpos = p-PyUnicode_AS_UNICODE(v); + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "charmap", "character maps to <undefined>", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) { + Py_DECREF(x); + goto onError; + } + continue; + } + else if (PyUnicode_Check(x)) { + int targetsize = PyUnicode_GET_SIZE(x); + + if (targetsize == 1) + /* 1-1 mapping */ + *p++ = *PyUnicode_AS_UNICODE(x); + + else if (targetsize > 1) { + /* 1-n mapping */ + if (targetsize > extrachars) { + /* resize first */ + int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); + int needed = (targetsize - extrachars) + \ + (targetsize << 2); + extrachars += needed; + if (_PyUnicode_Resize(&v, + PyUnicode_GET_SIZE(v) + needed) < 0) { + Py_DECREF(x); + goto onError; + } + p = PyUnicode_AS_UNICODE(v) + oldpos; } - p = PyUnicode_AS_UNICODE(v) + oldpos; + Py_UNICODE_COPY(p, + PyUnicode_AS_UNICODE(x), + targetsize); + p += targetsize; + extrachars -= targetsize; } - Py_UNICODE_COPY(p, - PyUnicode_AS_UNICODE(x), - targetsize); - p += targetsize; - extrachars -= targetsize; + /* 1-0 mapping: skip the character */ + } + else { + /* wrong return value */ + PyErr_SetString(PyExc_TypeError, + "character mapping must return integer, None or unicode"); + Py_DECREF(x); + goto onError; } - /* 1-0 mapping: skip the character */ - } - else { - /* wrong return value */ - PyErr_SetString(PyExc_TypeError, - "character mapping must return integer, None or unicode"); Py_DECREF(x); - goto onError; + ++s; } - Py_DECREF(x); - ++s; } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0) |