summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/lib/libcodecs.tex21
-rw-r--r--Doc/lib/libfuncs.tex2
-rw-r--r--Lib/test/test_codeccallbacks.py96
-rw-r--r--Modules/_iconv_codec.c18
-rw-r--r--Objects/unicodeobject.c26
5 files changed, 122 insertions, 41 deletions
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex
index 355ac5d..caaaaf4 100644
--- a/Doc/lib/libcodecs.tex
+++ b/Doc/lib/libcodecs.tex
@@ -103,11 +103,22 @@ Raises a \exception{LookupError} in case the encoding cannot be found.
Register the error handling function \var{error_handler} under the
name \var{name}. \var{error_handler} will be called during encoding
and decoding in case of an error, when \var{name} is specified as the
-errors parameter. \var{error_handler} will be called with an
-\exception{UnicodeEncodeError}, \exception{UnicodeDecodeError} or
-\exception{UnicodeTranslateError} instance and must return a tuple
-with a replacement for the unencodable/undecodable part of the input
-and a position where encoding/decoding should continue.
+errors parameter.
+
+For encoding \var{error_handler} will be called with a
+\exception{UnicodeEncodeError} instance, which contains information about
+the location of the error. The error handler must either raise this or
+a different exception or return a tuple with a replacement for the
+unencodable part of the input and a position where encoding should
+continue. The encoder will encode the replacement and continue encoding
+the original input at the specified position. Negative position values
+will be treated as being relative to the end of the input string. If the
+resulting position is out of bounds, an \exception{IndexError} will be raised.
+
+Decoding and translating work similarly, except that
+\exception{UnicodeDecodeError} or \exception{UnicodeTranslateError} will
+be passed to the handler and the replacement from the error handler will
+be put into the output directly.
\end{funcdesc}
\begin{funcdesc}{lookup_error}{name}
diff --git a/Doc/lib/libfuncs.tex b/Doc/lib/libfuncs.tex
index d5b565f..323a516 100644
--- a/Doc/lib/libfuncs.tex
+++ b/Doc/lib/libfuncs.tex
@@ -572,7 +572,7 @@ class C:
\var{classinfo} argument, or of a (direct or indirect) subclass
thereof. Also return true if \var{classinfo} is a type object and
\var{object} is an object of that type. If \var{object} is not a
- class instance or a object of the given type, the function always
+ class instance or an object of the given type, the function always
returns false. If \var{classinfo} is neither a class object nor a
type object, it may be a tuple of class or type objects, or may
recursively contain other such tuples (other sequence types are not
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index b51b489..bf583c2 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -1,6 +1,23 @@
import test.test_support, unittest
import sys, codecs, htmlentitydefs, unicodedata
+class PosReturn:
+ # this can be used for configurable callbacks
+
+ def __init__(self):
+ self.pos = 0
+
+ def handle(self, exc):
+ oldpos = self.pos
+ realpos = oldpos
+ if realpos<0:
+ realpos = len(exc.object) + realpos
+ # if we don't advance this time, terminate on the next call
+ # otherwise we'd get an endless loop
+ if realpos <= exc.start:
+ self.pos = len(exc.object)
+ return (u"<?>", oldpos)
+
class CodecCallbackTest(unittest.TestCase):
def test_xmlcharrefreplace(self):
@@ -543,18 +560,36 @@ class CodecCallbackTest(unittest.TestCase):
codecs.register_error("test.baddecodereturn2", baddecodereturn2)
self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2")
- pos = [-42]
- def negposreturn(exc):
- pos[0] += 1 # use list to work around scoping problem
- return (u"?", pos[0])
- codecs.register_error("test.negposreturn", negposreturn)
- "\xff".decode("ascii", "test.negposreturn")
+ handler = PosReturn()
+ codecs.register_error("test.posreturn", handler.handle)
+
+ # Valid negative position
+ handler.pos = -1
+ self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
+
+ # Valid negative position
+ handler.pos = -2
+ self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?><?>")
+
+ # Negative position out of bounds
+ handler.pos = -3
+ self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
+
+ # Valid positive position
+ handler.pos = 1
+ self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
+
+ # Largest valid positive position (one beyond end of input)
+ handler.pos = 2
+ self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>")
+
+ # Invalid positive position
+ handler.pos = 3
+ self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
- def hugeposreturn(exc):
- return (u"?", 424242)
- codecs.register_error("test.hugeposreturn", hugeposreturn)
- "\xff".decode("ascii", "test.hugeposreturn")
- "\\uyyyy".decode("raw-unicode-escape", "test.hugeposreturn")
+ # Restart at the "0"
+ handler.pos = 6
+ self.assertEquals("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0")
class D(dict):
def __getitem__(self, key):
@@ -579,22 +614,39 @@ class CodecCallbackTest(unittest.TestCase):
codecs.register_error("test.badencodereturn2", badencodereturn2)
self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2")
- pos = [-42]
- def negposreturn(exc):
- pos[0] += 1 # use list to work around scoping problem
- return (u"?", pos[0])
- codecs.register_error("test.negposreturn", negposreturn)
- u"\xff".encode("ascii", "test.negposreturn")
+ handler = PosReturn()
+ codecs.register_error("test.posreturn", handler.handle)
+
+ # Valid negative position
+ handler.pos = -1
+ self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
+
+ # Valid negative position
+ handler.pos = -2
+ self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")
+
+ # Negative position out of bounds
+ handler.pos = -3
+ self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
+
+ # Valid positive position
+ handler.pos = 1
+ self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
+
+ # Largest valid positive position (one beyond end of input)
+ handler.pos = 2
+ self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>")
+
+ # Invalid positive position
+ handler.pos = 3
+ self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
- def hugeposreturn(exc):
- return (u"?", 424242)
- codecs.register_error("test.hugeposreturn", hugeposreturn)
- u"\xff".encode("ascii", "test.hugeposreturn")
+ handler.pos = 0
class D(dict):
def __getitem__(self, key):
raise ValueError
- for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.hugeposreturn"):
+ for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})
diff --git a/Modules/_iconv_codec.c b/Modules/_iconv_codec.c
index d61adbc..3f2a72a 100644
--- a/Modules/_iconv_codec.c
+++ b/Modules/_iconv_codec.c
@@ -247,8 +247,13 @@ errorexit_cbpad: Py_XDECREF(retobj);
Py_DECREF(retobj);
if (newpos < 0)
- newpos = inputlen - newpos;
- if (newpos < 0 || newpos >= inputlen)
+ newpos = inputlen + newpos;
+ if (newpos < 0 || newpos > inputlen) {
+ PyErr_Format(PyExc_IndexError, "position %ld from error handler"
+ " out of bounds", newpos);
+ goto errorexit;
+ }
+ if (newpos == inputlen)
break;
inp = inp_top + Py_UNICODE_SIZE * newpos;
inplen = inplen_total - Py_UNICODE_SIZE * newpos;
@@ -471,8 +476,13 @@ errorexit_cbpad: Py_DECREF(retobj);
Py_DECREF(retobj);
if (newpos < 0)
- newpos = inplen_total - newpos;
- if (newpos < 0 || newpos >= inplen_total)
+ newpos = inplen_total + newpos;
+ if (newpos < 0 || newpos > inplen_total) {
+ PyErr_Format(PyExc_IndexError, "position %ld from error handler"
+ " out of bounds", newpos);
+ goto errorexit;
+ }
+ if (newpos == inplen_total)
break;
inp = inp_top + newpos;
inplen = inplen_total - newpos;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 1abef89..dfeabf5 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -728,9 +728,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
goto onError;
if (newpos<0)
- newpos = 0;
- else if (newpos>insize)
- newpos = insize;
+ newpos = insize+newpos;
+ if (newpos<0 || newpos>insize) {
+ PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
+ goto onError;
+ }
/* need more space? (at least enough for what we
have+the replacement+the rest of the string (starting
@@ -2246,9 +2248,12 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
return NULL;
}
if (*newpos<0)
- *newpos = 0;
- else if (*newpos>size)
- *newpos = size;
+ *newpos = size+*newpos;
+ if (*newpos<0 || *newpos>size) {
+ PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
+ Py_DECREF(restuple);
+ return NULL;
+ }
Py_INCREF(resunicode);
Py_DECREF(restuple);
return resunicode;
@@ -3084,9 +3089,12 @@ static PyObject *unicode_translate_call_errorhandler(const char *errors,
return NULL;
}
if (*newpos<0)
- *newpos = 0;
- else if (*newpos>size)
- *newpos = size;
+ *newpos = size+*newpos;
+ if (*newpos<0 || *newpos>size) {
+ PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
+ Py_DECREF(restuple);
+ return NULL;
+ }
Py_INCREF(resunicode);
Py_DECREF(restuple);
return resunicode;