diff options
-rw-r--r-- | Doc/lib/libcodecs.tex | 21 | ||||
-rw-r--r-- | Doc/lib/libfuncs.tex | 2 | ||||
-rw-r--r-- | Lib/test/test_codeccallbacks.py | 96 | ||||
-rw-r--r-- | Modules/_iconv_codec.c | 18 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 26 |
5 files changed, 122 insertions, 41 deletions
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex index 355ac5d..caaaaf4 100644 --- a/Doc/lib/libcodecs.tex +++ b/Doc/lib/libcodecs.tex @@ -103,11 +103,22 @@ Raises a \exception{LookupError} in case the encoding cannot be found. Register the error handling function \var{error_handler} under the name \var{name}. \var{error_handler} will be called during encoding and decoding in case of an error, when \var{name} is specified as the -errors parameter. \var{error_handler} will be called with an -\exception{UnicodeEncodeError}, \exception{UnicodeDecodeError} or -\exception{UnicodeTranslateError} instance and must return a tuple -with a replacement for the unencodable/undecodable part of the input -and a position where encoding/decoding should continue. +errors parameter. + +For encoding \var{error_handler} will be called with a +\exception{UnicodeEncodeError} instance, which contains information about +the location of the error. The error handler must either raise this or +a different exception or return a tuple with a replacement for the +unencodable part of the input and a position where encoding should +continue. The encoder will encode the replacement and continue encoding +the original input at the specified position. Negative position values +will be treated as being relative to the end of the input string. If the +resulting position is out of bounds, an \exception{IndexError} will be raised. + +Decoding and translating work similarly, except that \exception{UnicodeDecodeError} +or \exception{UnicodeTranslateError} will be passed to the handler and +that the replacement from the error handler will be put into the output +directly. \end{funcdesc} \begin{funcdesc}{lookup_error}{name} diff --git a/Doc/lib/libfuncs.tex b/Doc/lib/libfuncs.tex index d5b565f..323a516 100644 --- a/Doc/lib/libfuncs.tex +++ b/Doc/lib/libfuncs.tex @@ -572,7 +572,7 @@ class C: \var{classinfo} argument, or of a (direct or indirect) subclass thereof. 
Also return true if \var{classinfo} is a type object and \var{object} is an object of that type. If \var{object} is not a - class instance or a object of the given type, the function always + class instance or an object of the given type, the function always returns false. If \var{classinfo} is neither a class object nor a type object, it may be a tuple of class or type objects, or may recursively contain other such tuples (other sequence types are not diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index b51b489..bf583c2 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -1,6 +1,23 @@ import test.test_support, unittest import sys, codecs, htmlentitydefs, unicodedata +class PosReturn: + # this can be used for configurable callbacks + + def __init__(self): + self.pos = 0 + + def handle(self, exc): + oldpos = self.pos + realpos = oldpos + if realpos<0: + realpos = len(exc.object) + realpos + # if we don't advance this time, terminate on the next call + # otherwise we'd get an endless loop + if realpos <= exc.start: + self.pos = len(exc.object) + return (u"<?>", oldpos) + class CodecCallbackTest(unittest.TestCase): def test_xmlcharrefreplace(self): @@ -543,18 +560,36 @@ class CodecCallbackTest(unittest.TestCase): codecs.register_error("test.baddecodereturn2", baddecodereturn2) self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2") - pos = [-42] - def negposreturn(exc): - pos[0] += 1 # use list to work around scoping problem - return (u"?", pos[0]) - codecs.register_error("test.negposreturn", negposreturn) - "\xff".decode("ascii", "test.negposreturn") + handler = PosReturn() + codecs.register_error("test.posreturn", handler.handle) + + # Valid negative position + handler.pos = -1 + self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0") + + # Valid negative position + handler.pos = -2 + self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?><?>") + + # 
Negative position out of bounds + handler.pos = -3 + self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn") + + # Valid positive position + handler.pos = 1 + self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0") + + # Largest valid positive position (one beyond end of input) + handler.pos = 2 + self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>") + + # Invalid positive position + handler.pos = 3 + self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn") - def hugeposreturn(exc): - return (u"?", 424242) - codecs.register_error("test.hugeposreturn", hugeposreturn) - "\xff".decode("ascii", "test.hugeposreturn") - "\\uyyyy".decode("raw-unicode-escape", "test.hugeposreturn") + # Restart at the "0" + handler.pos = 6 + self.assertEquals("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0") class D(dict): def __getitem__(self, key): @@ -579,22 +614,39 @@ class CodecCallbackTest(unittest.TestCase): codecs.register_error("test.badencodereturn2", badencodereturn2) self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2") - pos = [-42] - def negposreturn(exc): - pos[0] += 1 # use list to work around scoping problem - return (u"?", pos[0]) - codecs.register_error("test.negposreturn", negposreturn) - u"\xff".encode("ascii", "test.negposreturn") + handler = PosReturn() + codecs.register_error("test.posreturn", handler.handle) + + # Valid negative position + handler.pos = -1 + self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0") + + # Valid negative position + handler.pos = -2 + self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>") + + # Negative position out of bounds + handler.pos = -3 + self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn") + + # Valid positive position + handler.pos = 1 + self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0") + + # Largest valid positive position (one beyond end of input) + 
handler.pos = 2 + self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>") + + # Invalid positive position + handler.pos = 3 + self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn") - def hugeposreturn(exc): - return (u"?", 424242) - codecs.register_error("test.hugeposreturn", hugeposreturn) - u"\xff".encode("ascii", "test.hugeposreturn") + handler.pos = 0 class D(dict): def __getitem__(self, key): raise ValueError - for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.hugeposreturn"): + for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"): self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None}) self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D()) self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300}) diff --git a/Modules/_iconv_codec.c b/Modules/_iconv_codec.c index d61adbc..3f2a72a 100644 --- a/Modules/_iconv_codec.c +++ b/Modules/_iconv_codec.c @@ -247,8 +247,13 @@ errorexit_cbpad: Py_XDECREF(retobj); Py_DECREF(retobj); if (newpos < 0) - newpos = inputlen - newpos; - if (newpos < 0 || newpos >= inputlen) + newpos = inputlen + newpos; + if (newpos < 0 || newpos > inputlen) { + PyErr_Format(PyExc_IndexError, "position %ld from error handler" + " out of bounds", newpos); + goto errorexit; + } + if (newpos == inputlen) break; inp = inp_top + Py_UNICODE_SIZE * newpos; inplen = inplen_total - Py_UNICODE_SIZE * newpos; @@ -471,8 +476,13 @@ errorexit_cbpad: Py_DECREF(retobj); Py_DECREF(retobj); if (newpos < 0) - newpos = inplen_total - newpos; - if (newpos < 0 || newpos >= inplen_total) + newpos = inplen_total + newpos; + if (newpos < 0 || newpos > inplen_total) { + PyErr_Format(PyExc_IndexError, "position %ld from error handler" + " out of bounds", newpos); + goto errorexit; + } + if (newpos == inplen_total) break; inp = inp_top + newpos; inplen = inplen_total - newpos; diff --git 
a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1abef89..dfeabf5 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -728,9 +728,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) goto onError; if (newpos<0) - newpos = 0; - else if (newpos>insize) - newpos = insize; + newpos = insize+newpos; + if (newpos<0 || newpos>insize) { + PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos); + goto onError; + } /* need more space? (at least enough for what we have+the replacement+the rest of the string (starting @@ -2246,9 +2248,12 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors, return NULL; } if (*newpos<0) - *newpos = 0; - else if (*newpos>size) - *newpos = size; + *newpos = size+*newpos; + if (*newpos<0 || *newpos>size) { + PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos); + Py_DECREF(restuple); + return NULL; + } Py_INCREF(resunicode); Py_DECREF(restuple); return resunicode; @@ -3084,9 +3089,12 @@ static PyObject *unicode_translate_call_errorhandler(const char *errors, return NULL; } if (*newpos<0) - *newpos = 0; - else if (*newpos>size) - *newpos = size; + *newpos = size+*newpos; + if (*newpos<0 || *newpos>size) { + PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos); + Py_DECREF(restuple); + return NULL; + } Py_INCREF(resunicode); Py_DECREF(restuple); return resunicode; |