From d1c1e10f70212464415fdf2ab0bed4b5d32fdf32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Walter=20D=C3=B6rwald?= Date: Thu, 6 Oct 2005 20:29:57 +0000 Subject: Part of SF patch #1313939: Speed up charmap decoding by extending PyUnicode_DecodeCharmap() to accept a unicode string as the mapping argument which is used as a mapping table. This code isn't used by any of the codecs yet. --- Doc/api/concrete.tex | 7 +- Lib/test/test_codecs.py | 37 +++++++++- Misc/NEWS | 5 ++ Objects/unicodeobject.c | 182 ++++++++++++++++++++++++++++-------------------- 4 files changed, 154 insertions(+), 77 deletions(-) diff --git a/Doc/api/concrete.tex b/Doc/api/concrete.tex index b6dbc5d..53c3b67 100644 --- a/Doc/api/concrete.tex +++ b/Doc/api/concrete.tex @@ -1322,7 +1322,12 @@ points. const char *errors} Create a Unicode object by decoding \var{size} bytes of the encoded string \var{s} using the given \var{mapping} object. Return - \NULL{} if an exception was raised by the codec. + \NULL{} if an exception was raised by the codec. If \var{mapping} is \NULL{} + latin-1 decoding will be done. Else it can be a dictionary mapping byte or a + unicode string, which is treated as a lookup table. Byte values greater + than the length of the string and U+FFFE "characters" are treated as + "undefined mapping". 
+ \versionchanged[Allowed unicode string as mapping argument]{2.4} \end{cfuncdesc} \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeCharmap}{const Py_UNICODE *s, diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index a4d58c6..74ad83b 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -924,6 +924,40 @@ class BasicStrTest(unittest.TestCase): (chars, size) = codecs.getdecoder(encoding)(bytes) self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding)) +class CharmapTest(unittest.TestCase): + def test_decode_with_string_map(self): + self.assertEquals( + codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"), + (u"abc", 3) + ) + + self.assertEquals( + codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"), + (u"ab\ufffd", 3) + ) + + self.assertEquals( + codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"), + (u"ab\ufffd", 3) + ) + + self.assertEquals( + codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"), + (u"ab", 3) + ) + + self.assertEquals( + codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"), + (u"ab", 3) + ) + + allbytes = "".join(chr(i) for i in xrange(256)) + self.assertEquals( + codecs.charmap_decode(allbytes, "ignore", u""), + (u"", len(allbytes)) + ) + + def test_main(): test_support.run_unittest( UTF16Test, @@ -940,7 +974,8 @@ def test_main(): StreamReaderTest, Str2StrTest, BasicUnicodeTest, - BasicStrTest + BasicStrTest, + CharmapTest ) diff --git a/Misc/NEWS b/Misc/NEWS index 11dd40c..4d35774 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -563,6 +563,11 @@ C API - Removed PyRange_New(). +- Patch #1313939: PyUnicode_DecodeCharmap() accepts a unicode string as the + mapping argument now. This string is used as a mapping table. Byte values + greater than the length of the string and 0xFFFE are treated as undefined + mappings. 
+ Tests ----- diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5d096ed..7ab4d0c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2833,6 +2833,8 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, int extrachars = 0; PyObject *errorHandler = NULL; PyObject *exc = NULL; + Py_UNICODE *mapstring = NULL; + int maplen = 0; /* Default to Latin-1 */ if (mapping == NULL) @@ -2845,91 +2847,121 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); e = s + size; - while (s < e) { - unsigned char ch = *s; - PyObject *w, *x; - - /* Get mapping (char ordinal -> integer, Unicode char or None) */ - w = PyInt_FromLong((long)ch); - if (w == NULL) - goto onError; - x = PyObject_GetItem(mapping, w); - Py_DECREF(w); - if (x == NULL) { - if (PyErr_ExceptionMatches(PyExc_LookupError)) { - /* No mapping found means: mapping is undefined. */ - PyErr_Clear(); - x = Py_None; - Py_INCREF(x); - } else - goto onError; + if (PyUnicode_CheckExact(mapping)) { + mapstring = PyUnicode_AS_UNICODE(mapping); + maplen = PyUnicode_GET_SIZE(mapping); + while (s < e) { + unsigned char ch = *s; + Py_UNICODE x = 0xfffe; /* illegal value */ + + if (ch < maplen) + x = mapstring[ch]; + + if (x == 0xfffe) { + /* undefined mapping */ + outpos = p-PyUnicode_AS_UNICODE(v); + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "charmap", "character maps to ", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) { + goto onError; + } + continue; + } + *p++ = x; + ++s; } + } + else { + while (s < e) { + unsigned char ch = *s; + PyObject *w, *x; - /* Apply mapping */ - if (PyInt_Check(x)) { - long value = PyInt_AS_LONG(x); - if (value < 0 || value > 65535) { - PyErr_SetString(PyExc_TypeError, - "character mapping must be in range(65536)"); - Py_DECREF(x); + /* Get mapping (char ordinal -> integer, Unicode char or None) */ + w = 
PyInt_FromLong((long)ch); + if (w == NULL) goto onError; + x = PyObject_GetItem(mapping, w); + Py_DECREF(w); + if (x == NULL) { + if (PyErr_ExceptionMatches(PyExc_LookupError)) { + /* No mapping found means: mapping is undefined. */ + PyErr_Clear(); + x = Py_None; + Py_INCREF(x); + } else + goto onError; } - *p++ = (Py_UNICODE)value; - } - else if (x == Py_None) { - /* undefined mapping */ - outpos = p-PyUnicode_AS_UNICODE(v); - startinpos = s-starts; - endinpos = startinpos+1; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "charmap", "character maps to ", - starts, size, &startinpos, &endinpos, &exc, &s, - (PyObject **)&v, &outpos, &p)) { - Py_DECREF(x); - goto onError; + + /* Apply mapping */ + if (PyInt_Check(x)) { + long value = PyInt_AS_LONG(x); + if (value < 0 || value > 65535) { + PyErr_SetString(PyExc_TypeError, + "character mapping must be in range(65536)"); + Py_DECREF(x); + goto onError; + } + *p++ = (Py_UNICODE)value; } - continue; - } - else if (PyUnicode_Check(x)) { - int targetsize = PyUnicode_GET_SIZE(x); - - if (targetsize == 1) - /* 1-1 mapping */ - *p++ = *PyUnicode_AS_UNICODE(x); - - else if (targetsize > 1) { - /* 1-n mapping */ - if (targetsize > extrachars) { - /* resize first */ - int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); - int needed = (targetsize - extrachars) + \ - (targetsize << 2); - extrachars += needed; - if (_PyUnicode_Resize(&v, - PyUnicode_GET_SIZE(v) + needed) < 0) { - Py_DECREF(x); - goto onError; + else if (x == Py_None) { + /* undefined mapping */ + outpos = p-PyUnicode_AS_UNICODE(v); + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "charmap", "character maps to ", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) { + Py_DECREF(x); + goto onError; + } + continue; + } + else if (PyUnicode_Check(x)) { + int targetsize = PyUnicode_GET_SIZE(x); + + if (targetsize == 1) + /* 1-1 mapping */ + *p++ = 
*PyUnicode_AS_UNICODE(x); + + else if (targetsize > 1) { + /* 1-n mapping */ + if (targetsize > extrachars) { + /* resize first */ + int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); + int needed = (targetsize - extrachars) + \ + (targetsize << 2); + extrachars += needed; + if (_PyUnicode_Resize(&v, + PyUnicode_GET_SIZE(v) + needed) < 0) { + Py_DECREF(x); + goto onError; + } + p = PyUnicode_AS_UNICODE(v) + oldpos; } - p = PyUnicode_AS_UNICODE(v) + oldpos; + Py_UNICODE_COPY(p, + PyUnicode_AS_UNICODE(x), + targetsize); + p += targetsize; + extrachars -= targetsize; } - Py_UNICODE_COPY(p, - PyUnicode_AS_UNICODE(x), - targetsize); - p += targetsize; - extrachars -= targetsize; + /* 1-0 mapping: skip the character */ + } + else { + /* wrong return value */ + PyErr_SetString(PyExc_TypeError, + "character mapping must return integer, None or unicode"); + Py_DECREF(x); + goto onError; } - /* 1-0 mapping: skip the character */ - } - else { - /* wrong return value */ - PyErr_SetString(PyExc_TypeError, - "character mapping must return integer, None or unicode"); Py_DECREF(x); - goto onError; + ++s; } - Py_DECREF(x); - ++s; } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0) -- cgit v0.12