summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- Doc/lib/libcodecs.tex 39
-rw-r--r-- Doc/lib/libexcs.tex 21
-rw-r--r-- Include/codecs.h 30
-rw-r--r-- Include/pyerrors.h 66
-rw-r--r-- Lib/codecs.py 13
-rw-r--r-- Lib/test/test_codeccallbacks.py 483
-rw-r--r-- Misc/NEWS 3
-rw-r--r-- Modules/_codecsmodule.c 28
-rw-r--r-- Objects/stringobject.c 8
-rw-r--r-- Objects/unicodeobject.c 1792
-rw-r--r-- Python/codecs.c 399
-rw-r--r-- Python/exceptions.c 603
12 files changed, 2929 insertions, 556 deletions
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex
index 136c528..85ca7a5 100644
--- a/Doc/lib/libcodecs.tex
+++ b/Doc/lib/libcodecs.tex
@@ -17,7 +17,7 @@
This module defines base classes for standard Python codecs (encoders
and decoders) and provides access to the internal Python codec
-registry which manages the codec lookup process.
+registry which manages the codec and error handling lookup process.
It defines the following functions:
@@ -98,6 +98,43 @@ Raises a \exception{LookupError} in case the encoding cannot be found.
To simplify working with encoded files or stream, the module
also defines these utility functions:
+\begin{funcdesc}{register_error}{name, error_handler}
+Register the error handling function \var{error_handler} under the
+name \var{name}. \var{error_handler} will be called during encoding
+and decoding in case of an error, when \var{name} is specified as the
+errors parameter. \var{error_handler} will be called with an
+\exception{UnicodeEncodeError}, \exception{UnicodeDecodeError} or
+\exception{UnicodeTranslateError} instance and must return a tuple
+with a replacement for the unencodable/undecodable part of the input
+and a position where encoding/decoding should continue.
+\end{funcdesc}
+
+\begin{funcdesc}{lookup_error}{name}
+Return the error handler previously registered under the name \var{name}.
+
+Raises a \exception{LookupError} in case the handler cannot be found.
+\end{funcdesc}
+
+\begin{funcdesc}{strict_errors}{exception}
+Implements the \code{strict} error handling.
+\end{funcdesc}
+
+\begin{funcdesc}{replace_errors}{exception}
+Implements the \code{replace} error handling.
+\end{funcdesc}
+
+\begin{funcdesc}{ignore_errors}{exception}
+Implements the \code{ignore} error handling.
+\end{funcdesc}
+
+\begin{funcdesc}{xmlcharrefreplace_errors}{exception}
+Implements the \code{xmlcharrefreplace} error handling.
+\end{funcdesc}
+
+\begin{funcdesc}{backslashreplace_errors}{exception}
+Implements the \code{backslashreplace} error handling.
+\end{funcdesc}
+
\begin{funcdesc}{open}{filename, mode\optional{, encoding\optional{,
errors\optional{, buffering}}}}
Open an encoded file using the given \var{mode} and return
diff --git a/Doc/lib/libexcs.tex b/Doc/lib/libexcs.tex
index 078fe3c..54b141a 100644
--- a/Doc/lib/libexcs.tex
+++ b/Doc/lib/libexcs.tex
@@ -335,6 +335,24 @@ Raised when an \keyword{assert} statement fails.
\versionadded{2.0}
\end{excdesc}
+\begin{excdesc}{UnicodeEncodeError}
+ Raised when a Unicode-related error occurs during encoding. It
+ is a subclass of \exception{UnicodeError}.
+\versionadded{2.3}
+\end{excdesc}
+
+\begin{excdesc}{UnicodeDecodeError}
+ Raised when a Unicode-related error occurs during decoding. It
+ is a subclass of \exception{UnicodeError}.
+\versionadded{2.3}
+\end{excdesc}
+
+\begin{excdesc}{UnicodeTranslateError}
+ Raised when a Unicode-related error occurs during translating. It
+ is a subclass of \exception{UnicodeError}.
+\versionadded{2.3}
+\end{excdesc}
+
\begin{excdesc}{ValueError}
Raised when a built-in operation or function receives an argument
that has the right type but an inappropriate value, and the
@@ -426,6 +444,9 @@ The class hierarchy for built-in exceptions is:
| | +-- FloatingPointError
| +-- ValueError
| | +-- UnicodeError
+ | | +-- UnicodeEncodeError
+ | | +-- UnicodeDecodeError
+ | | +-- UnicodeTranslateError
| +-- ReferenceError
| +-- SystemError
| +-- MemoryError
diff --git a/Include/codecs.h b/Include/codecs.h
index 2cc4d7d..82f18cd 100644
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -117,6 +117,36 @@ PyAPI_FUNC(PyObject *) PyCodec_StreamWriter(
const char *errors
);
+/* Unicode encoding error handling callback registry API */
+
+/* Register the error handling callback function error under the name
+ name. This function will be called by the codec when it encounters
+ unencodable characters/undecodable bytes and doesn't know the
+ callback name, when name is specified as the error parameter
+ in the call to the encode/decode function.
+ Return 0 on success, -1 on error */
+PyAPI_FUNC(int) PyCodec_RegisterError(const char *name, PyObject *error);
+
+/* Lookup the error handling callback function registered under the
+ given name. As a special case NULL can be passed, in which case
+ the error handling callback for "strict" will be returned. */
+PyAPI_FUNC(PyObject *) PyCodec_LookupError(const char *name);
+
+/* raise exc as an exception */
+PyAPI_FUNC(PyObject *) PyCodec_StrictErrors(PyObject *exc);
+
+/* ignore the unicode error, skipping the faulty input */
+PyAPI_FUNC(PyObject *) PyCodec_IgnoreErrors(PyObject *exc);
+
+/* replace the unicode error with ? or U+FFFD */
+PyAPI_FUNC(PyObject *) PyCodec_ReplaceErrors(PyObject *exc);
+
+/* replace the unicode encode error with XML character references */
+PyAPI_FUNC(PyObject *) PyCodec_XMLCharRefReplaceErrors(PyObject *exc);
+
+/* replace the unicode encode error with backslash escapes (\x, \u and \U) */
+PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc);
+
#ifdef __cplusplus
}
#endif
diff --git a/Include/pyerrors.h b/Include/pyerrors.h
index b783b7b..756c4b2 100644
--- a/Include/pyerrors.h
+++ b/Include/pyerrors.h
@@ -54,6 +54,9 @@ PyAPI_DATA(PyObject *) PyExc_SystemExit;
PyAPI_DATA(PyObject *) PyExc_TypeError;
PyAPI_DATA(PyObject *) PyExc_UnboundLocalError;
PyAPI_DATA(PyObject *) PyExc_UnicodeError;
+PyAPI_DATA(PyObject *) PyExc_UnicodeEncodeError;
+PyAPI_DATA(PyObject *) PyExc_UnicodeDecodeError;
+PyAPI_DATA(PyObject *) PyExc_UnicodeTranslateError;
PyAPI_DATA(PyObject *) PyExc_ValueError;
PyAPI_DATA(PyObject *) PyExc_ZeroDivisionError;
#ifdef MS_WINDOWS
@@ -114,6 +117,69 @@ PyAPI_FUNC(void) PyErr_SetInterrupt(void);
PyAPI_FUNC(void) PyErr_SyntaxLocation(char *, int);
PyAPI_FUNC(PyObject *) PyErr_ProgramText(char *, int);
+/* The following functions are used to create and modify unicode
+ exceptions from C */
+/* create a UnicodeDecodeError object */
+PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_Create(
+ const char *, const char *, int, int, int, const char *);
+
+/* create a UnicodeEncodeError object */
+PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_Create(
+ const char *, const Py_UNICODE *, int, int, int, const char *);
+
+/* create a UnicodeTranslateError object */
+PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_Create(
+ const Py_UNICODE *, int, int, int, const char *);
+
+/* get the encoding attribute */
+PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetEncoding(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetEncoding(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetEncoding(PyObject *);
+
+/* get the object attribute */
+PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetObject(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetObject(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetObject(PyObject *);
+
+/* get the value of the start attribute (the int * may not be NULL)
+ return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_GetStart(PyObject *, int *);
+PyAPI_FUNC(int) PyUnicodeDecodeError_GetStart(PyObject *, int *);
+PyAPI_FUNC(int) PyUnicodeTranslateError_GetStart(PyObject *, int *);
+
+/* assign a new value to the start attribute
+ return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_SetStart(PyObject *, int);
+PyAPI_FUNC(int) PyUnicodeDecodeError_SetStart(PyObject *, int);
+PyAPI_FUNC(int) PyUnicodeTranslateError_SetStart(PyObject *, int);
+
+/* get the value of the end attribute (the int * may not be NULL)
+ return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_GetEnd(PyObject *, int *);
+PyAPI_FUNC(int) PyUnicodeDecodeError_GetEnd(PyObject *, int *);
+PyAPI_FUNC(int) PyUnicodeTranslateError_GetEnd(PyObject *, int *);
+
+/* assign a new value to the end attribute
+ return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_SetEnd(PyObject *, int);
+PyAPI_FUNC(int) PyUnicodeDecodeError_SetEnd(PyObject *, int);
+PyAPI_FUNC(int) PyUnicodeTranslateError_SetEnd(PyObject *, int);
+
+/* get the value of the reason attribute */
+PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetReason(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetReason(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetReason(PyObject *);
+
+/* assign a new value to the reason attribute
+ return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_SetReason(
+ PyObject *, const char *);
+PyAPI_FUNC(int) PyUnicodeDecodeError_SetReason(
+ PyObject *, const char *);
+PyAPI_FUNC(int) PyUnicodeTranslateError_SetReason(
+ PyObject *, const char *);
+
+
/* These APIs aren't really part of the error implementation, but
often needed to format error messages; the native C lib APIs are
not available on all platforms, which is why we provide emulations
diff --git a/Lib/codecs.py b/Lib/codecs.py
index b089e90..40f0a2e 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -20,7 +20,10 @@ except ImportError, why:
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
"BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
"BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
- "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE"]
+ "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
+ # error handler callables bound further down in this module,
+ # plus the registry functions used to look them up
+ "strict_errors", "ignore_errors", "replace_errors",
+ "xmlcharrefreplace_errors", "backslashreplace_errors",
+ "register_error", "lookup_error"]
### Constants
@@ -632,6 +635,14 @@ def make_encoding_map(decoding_map):
m[v] = None
return m
+### error handlers
+
+strict_errors = lookup_error("strict")
+ignore_errors = lookup_error("ignore")
+replace_errors = lookup_error("replace")
+xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
+backslashreplace_errors = lookup_error("backslashreplace")
+
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
new file mode 100644
index 0000000..1650965
--- /dev/null
+++ b/Lib/test/test_codeccallbacks.py
@@ -0,0 +1,483 @@
+import test.test_support, unittest
+import sys, codecs, htmlentitydefs, unicodedata
+
+class CodecCallbackTest(unittest.TestCase):
+
+    def test_xmlcharrefreplace(self):
+        # Replace unencodable characters with numeric character references.
+        # For ascii, latin-1 and charmaps this is completely implemented
+        # in C and should be reasonably fast.
+        s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
+        # ascii cannot encode any of the four non-ASCII characters, so each
+        # one is expected as a decimal &#NNN; reference.
+        self.assertEqual(
+            s.encode("ascii", "xmlcharrefreplace"),
+            "&#12473;&#12497;&#12514; &#228;nd eggs"
+        )
+        # latin-1 can encode \xe4 directly; only the katakana are replaced.
+        self.assertEqual(
+            s.encode("latin-1", "xmlcharrefreplace"),
+            "&#12473;&#12497;&#12514; \xe4nd eggs"
+        )
+ )
+
+    def test_xmlcharnamereplace(self):
+        # This time use a named character entity for unencodable
+        # characters, if one is available.
+        # Build a reverse map: character -> entity name.
+        names = {}
+        for (key, value) in htmlentitydefs.entitydefs.items():
+            if len(value)==1:
+                names[unicode(value, "latin-1")] = unicode(key, "latin-1")
+            else:
+                # value has the form "&#NNN;": strip "&#" and ";"
+                names[unichr(int(value[2:-1]))] = unicode(key, "latin-1")
+
+        def xmlcharnamereplace(exc):
+            if not isinstance(exc, UnicodeEncodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            l = []
+            for c in exc.object[exc.start:exc.end]:
+                try:
+                    l.append(u"&%s;" % names[c])
+                except KeyError:
+                    # no named entity: fall back to a numeric reference
+                    l.append(u"&#%d;" % ord(c))
+            return (u"".join(l), exc.end)
+
+        codecs.register_error(
+            "test.xmlcharnamereplace", xmlcharnamereplace)
+
+        sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
+        sout = "&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
+        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
+        sout = "\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
+        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
+        sout = "\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
+        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
+
+ def test_uninamereplace(self):
+ # We're using the names from the unicode database this time,
+ # and we're doing "syntax highlighting" here, i.e. we include
+ # the replaced text in ANSI escape sequences. For this it is
+ # useful that the error handler is not called for every single
+ # unencodable character, but for a complete sequence of
+ # unencodable characters, otherwise we would output many
+ # unnecessary escape sequences.
+
+ def uninamereplace(exc):
+ if not isinstance(exc, UnicodeEncodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ l = []
+ for c in exc.object[exc.start:exc.end]:
+ l.append(unicodedata.name(c, u"0x%x" % ord(c)))
+ return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end)
+
+ codecs.register_error(
+ "test.uninamereplace", uninamereplace)
+
+ sin = u"\xac\u1234\u20ac\u8000"
+ sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m"
+ self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
+
+ sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m"
+ self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
+
+ sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1m0x8000\033[0m"
+ self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
+
+ def test_backslashescape(self):
+ # Does the same as the "unicode-escape" encoding, but with different
+ # base encodings.
+ sin = u"a\xac\u1234\u20ac\u8000"
+ if sys.maxunicode > 0xffff:
+ sin += unichr(sys.maxunicode)
+ sout = "a\\xac\\u1234\\u20ac\\u8000"
+ if sys.maxunicode > 0xffff:
+ sout += "\\U%08x" % sys.maxunicode
+ self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
+
+ sout = "a\xac\\u1234\\u20ac\\u8000"
+ if sys.maxunicode > 0xffff:
+ sout += "\\U%08x" % sys.maxunicode
+ self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
+
+ sout = "a\xac\\u1234\xa4\\u8000"
+ if sys.maxunicode > 0xffff:
+ sout += "\\U%08x" % sys.maxunicode
+ self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
+
+ def test_relaxedutf8(self):
+ # This is the test for a decoding callback handler,
+ # that relaxes the UTF-8 minimal encoding restriction.
+ # A null byte that is encoded as "\xc0\x80" will be
+ # decoded as a null byte. All other illegal sequences
+ # will be handled strictly.
+ def relaxedutf8(exc):
+ if not isinstance(exc, UnicodeDecodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ if exc.object[exc.start:exc.end].startswith("\xc0\x80"):
+ return (u"\x00", exc.start+2) # retry after two bytes
+ else:
+ raise exc
+
+ codecs.register_error(
+ "test.relaxedutf8", relaxedutf8)
+
+ sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
+ sout = u"a\x00b\x00c\xfc\x00\x00"
+ self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
+ sin = "\xc0\x80\xc0\x81"
+ self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8")
+
+    def test_charmapencode(self):
+        # For charmap encodings the replacement string will be
+        # mapped through the encoding again. This means, that
+        # to be able to use e.g. the "replace" handler, the
+        # charmap has to have a mapping for "?".
+        charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
+        sin = u"abc"
+        sout = "AABBCC"
+        self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
+
+        # "A" has no entry in the charmap, so strict encoding must fail
+        sin = u"abcA"
+        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
+
+        # with a str mapping for "?" the "replace" handler output is mapped too
+        charmap[ord("?")] = "XYZ"
+        sin = u"abcDEF"
+        sout = "AABBCCXYZXYZXYZ"
+        self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
+
+        # a unicode mapping for "?" is expected to be rejected with TypeError
+        charmap[ord("?")] = u"XYZ"
+        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
+
+ def test_callbacks(self):
+ def handler1(exc):
+ if not isinstance(exc, UnicodeEncodeError) \
+ and not isinstance(exc, UnicodeDecodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
+ return (u"[%s]" % u"".join(l), exc.end)
+
+ codecs.register_error("test.handler1", handler1)
+
+ def handler2(exc):
+ if not isinstance(exc, UnicodeDecodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
+ return (u"[%s]" % u"".join(l), exc.end+1) # skip one character
+
+ codecs.register_error("test.handler2", handler2)
+
+ s = "\x00\x81\x7f\x80\xff"
+
+ self.assertEqual(
+ s.decode("ascii", "test.handler1"),
+ u"\x00[<129>]\x7f[<128>][<255>]"
+ )
+ self.assertEqual(
+ s.decode("ascii", "test.handler2"),
+ u"\x00[<129>][<128>]"
+ )
+
+ self.assertEqual(
+ "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
+ u"\u3042[<92><117><51><120>]xx"
+ )
+
+ self.assertEqual(
+ "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
+ u"\u3042[<92><117><51><120><120>]"
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
+ u"z[<98>][<99>]"
+ )
+
+ self.assertEqual(
+ u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
+ u"g[<252><223>]rk"
+ )
+
+ self.assertEqual(
+ u"g\xfc\xdf".encode("ascii", "test.handler1"),
+ u"g[<252><223>]"
+ )
+
+ def test_longstrings(self):
+ # test long strings to check for memory overflow problems
+ errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
+ # register the handlers under different names,
+ # to prevent the codec from recognizing the name
+ for err in errors:
+ codecs.register_error("test." + err, codecs.lookup_error(err))
+ l = 1000
+ errors += [ "test." + err for err in errors ]
+ for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
+ for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
+ for err in errors:
+ try:
+ uni.encode(enc, err)
+ except UnicodeError:
+ pass
+
+    def check_exceptionobjectargs(self, exctype, args, msg):
+        # Test UnicodeError subclasses: construction and __str__ conversion.
+        # exctype is the exception class, args a list of valid constructor
+        # arguments and msg the expected str() of the resulting exception.
+        # check with one missing argument
+        self.assertRaises(TypeError, exctype, *args[:-1])
+        # check with one superfluous argument
+        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
+        # check with one argument of the wrong type
+        wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
+        for i in xrange(len(args)):
+            for wrongarg in wrongargs:
+                if type(wrongarg) is type(args[i]):
+                    continue
+                # build argument array: a copy of args in which only
+                # position i is replaced by the wrongly typed value
+                callargs = []
+                for j in xrange(len(args)):
+                    if i==j:
+                        callargs.append(wrongarg)
+                    else:
+                        callargs.append(args[j])
+                self.assertRaises(TypeError, exctype, *callargs)
+        exc = exctype(*args)
+        self.assertEquals(str(exc), msg)
+
+ def test_unicodeencodeerror(self):
+ self.check_exceptionobjectargs(
+ UnicodeEncodeError,
+ ["ascii", u"g\xfcrk", 1, 2, "ouch"],
+ "'ascii' codec can't encode character '\ufc' in position 1: ouch"
+ )
+ self.check_exceptionobjectargs(
+ UnicodeEncodeError,
+ ["ascii", u"g\xfcrk", 1, 4, "ouch"],
+ "'ascii' codec can't encode characters in position 1-3: ouch"
+ )
+ self.check_exceptionobjectargs(
+ UnicodeEncodeError,
+ ["ascii", u"\xfcx", 0, 1, "ouch"],
+ "'ascii' codec can't encode character '\ufc' in position 0: ouch"
+ )
+
+ def test_unicodedecodeerror(self):
+ self.check_exceptionobjectargs(
+ UnicodeDecodeError,
+ ["ascii", "g\xfcrk", 1, 2, "ouch"],
+ "'ascii' codec can't decode byte 0xfc in position 1: ouch"
+ )
+ self.check_exceptionobjectargs(
+ UnicodeDecodeError,
+ ["ascii", "g\xfcrk", 1, 3, "ouch"],
+ "'ascii' codec can't decode bytes in position 1-2: ouch"
+ )
+
+ def test_unicodetranslateerror(self):
+ self.check_exceptionobjectargs(
+ UnicodeTranslateError,
+ [u"g\xfcrk", 1, 2, "ouch"],
+ "can't translate character '\\ufc' in position 1: ouch"
+ )
+ self.check_exceptionobjectargs(
+ UnicodeTranslateError,
+ [u"g\xfcrk", 1, 3, "ouch"],
+ "can't translate characters in position 1-2: ouch"
+ )
+
+ def test_badandgoodstrictexceptions(self):
+ self.assertRaises(
+ TypeError,
+ codecs.strict_errors,
+ 42
+ )
+ self.assertRaises(
+ Exception,
+ codecs.strict_errors,
+ Exception("ouch")
+ )
+
+ self.assertRaises(
+ UnicodeEncodeError,
+ codecs.strict_errors,
+ UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
+ )
+
+ def test_badandgoodignoreexceptions(self):
+ self.assertRaises(
+ TypeError,
+ codecs.ignore_errors,
+ 42
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.ignore_errors,
+ UnicodeError("ouch")
+ )
+ self.assertEquals(
+ codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+ (u"", 1)
+ )
+ self.assertEquals(
+ codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
+ (u"", 1)
+ )
+ self.assertEquals(
+ codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
+ (u"", 1)
+ )
+
+ def test_badandgoodreplaceexceptions(self):
+ self.assertRaises(
+ TypeError,
+ codecs.replace_errors,
+ 42
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.replace_errors,
+ UnicodeError("ouch")
+ )
+ self.assertEquals(
+ codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+ (u"?", 1)
+ )
+ self.assertEquals(
+ codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
+ (u"\ufffd", 1)
+ )
+ self.assertEquals(
+ codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
+ (u"\ufffd", 1)
+ )
+
+ def test_badandgoodxmlcharrefreplaceexceptions(self):
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ 42
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ UnicodeError("ouch")
+ )
+ self.assertEquals(
+ codecs.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+ (u"&#%d;" % 0x3042, 1)
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ UnicodeError("ouch")
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.xmlcharrefreplace_errors,
+ UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
+ )
+
+ def test_badandgoodbackslashreplaceexceptions(self):
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ 42
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ UnicodeError("ouch")
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
+ (u"\\u3042", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")),
+ (u"\\x00", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")),
+ (u"\\xff", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")),
+ (u"\\u0100", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")),
+ (u"\\uffff", 1)
+ )
+ if sys.maxunicode>0xffff:
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")),
+ (u"\\U00010000", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")),
+ (u"\\U0010ffff", 1)
+ )
+
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ UnicodeError("ouch")
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.backslashreplace_errors,
+ UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
+ )
+
+    def test_badhandlerresults(self):
+        # Each entry is a handler return value that is not a valid
+        # (unicode, int) tuple; using it must raise TypeError.
+        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
+        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
+
+        for res in results:
+            # The handler has to accept the exception argument; otherwise the
+            # TypeError would come from calling it rather than from the bad
+            # result this test is about.
+            codecs.register_error("test.badhandler", lambda x: res)
+            for enc in encs:
+                self.assertRaises(
+                    TypeError,
+                    u"\u3042".encode,
+                    enc,
+                    "test.badhandler"
+                )
+            for (enc, bytes) in (
+                ("ascii", "\xff"),
+                ("utf-8", "\xff"),
+                ("utf-7", "+x-")
+            ):
+                self.assertRaises(
+                    TypeError,
+                    bytes.decode,
+                    enc,
+                    "test.badhandler"
+                )
+
+    def test_lookup(self):
+        # Each module-level handler must be the object the registry returns
+        # for the corresponding name.
+        self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
+        self.assertEquals(codecs.ignore_errors, codecs.lookup_error("ignore"))
+        self.assertEquals(codecs.replace_errors, codecs.lookup_error("replace"))
+        self.assertEquals(
+            codecs.xmlcharrefreplace_errors,
+            codecs.lookup_error("xmlcharrefreplace")
+        )
+        self.assertEquals(
+            codecs.backslashreplace_errors,
+            codecs.lookup_error("backslashreplace")
+        )
+
+def test_main():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(CodecCallbackTest))
+ test.test_support.run_suite(suite)
+
+if __name__ == "__main__":
+ test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index 7034d72..ad87762 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -57,6 +57,9 @@ Type/class unification and new-style classes
Core and builtins
+- Codec error handling callbacks (PEP 293) are implemented.
+ Error handling in unicode.encode or str.decode can now be customized.
+
- A subtle change to the semantics of the built-in function intern():
interned strings are no longer immortal. You must keep a reference
to the return value intern() around to get the benefit.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index 1e3fc5d..24fa1d5 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -706,6 +706,32 @@ mbcs_encode(PyObject *self,
#endif /* MS_WINDOWS */
#endif /* Py_USING_UNICODE */
+/* --- Error handler registry --------------------------------------------- */
+
+/* Module-level register_error(name, handler).
+ Parses a string name and an arbitrary handler object from args and
+ passes them to PyCodec_RegisterError().
+ Returns None on success, NULL if argument parsing or the
+ registration fails. */
+static PyObject *register_error(PyObject *self, PyObject *args)
+{
+ const char *name;
+ PyObject *handler;
+
+ if (!PyArg_ParseTuple(args, "sO:register_error",
+ &name, &handler))
+ return NULL;
+ /* PyCodec_RegisterError() returns non-zero on error */
+ if (PyCodec_RegisterError(name, handler))
+ return NULL;
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+/* Module-level lookup_error(name).
+ Parses a string name from args and returns the result of
+ PyCodec_LookupError(name) unchanged; returns NULL if argument
+ parsing fails. */
+static PyObject *lookup_error(PyObject *self, PyObject *args)
+{
+ const char *name;
+
+ if (!PyArg_ParseTuple(args, "s:lookup_error",
+ &name))
+ return NULL;
+ return PyCodec_LookupError(name);
+}
+
/* --- Module API --------------------------------------------------------- */
static PyMethodDef _codecs_functions[] = {
@@ -744,6 +770,8 @@ static PyMethodDef _codecs_functions[] = {
{"mbcs_decode", mbcs_decode, METH_VARARGS},
#endif
#endif /* Py_USING_UNICODE */
+ {"register_error", register_error, METH_VARARGS},
+ {"lookup_error", lookup_error, METH_VARARGS},
{NULL, NULL} /* sentinel */
};
diff --git a/Objects/stringobject.c b/Objects/stringobject.c
index 8ae9407..31f188a 100644
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -2468,7 +2468,9 @@ PyDoc_STRVAR(encode__doc__,
Encodes S using the codec registered for encoding. encoding defaults\n\
to the default encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
-a ValueError. Other possible values are 'ignore' and 'replace'.");
+a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
+'xmlcharrefreplace' as well as any other name registered with\n\
+codecs.register_error that is able to handle UnicodeEncodeErrors.");
static PyObject *
string_encode(PyStringObject *self, PyObject *args)
@@ -2487,7 +2489,9 @@ PyDoc_STRVAR(decode__doc__,
Decodes S using the codec registered for encoding. encoding defaults\n\
to the default encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
-a ValueError. Other possible values are 'ignore' and 'replace'.");
+a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
+as well as any other name registered with codecs.register_error that is\n\
+able to handle UnicodeDecodeErrors.");
static PyObject *
string_decode(PyStringObject *self, PyObject *args)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 920f9ea..2108d94 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -528,8 +528,8 @@ PyObject *PyUnicode_Decode(const char *s,
const char *errors)
{
PyObject *buffer = NULL, *unicode;
-
- if (encoding == NULL)
+
+ if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
/* Shortcuts for common default encodings */
@@ -680,6 +680,92 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
return -1;
}
+/* error handling callback helper:
+ build arguments, call the callback and check its result,
+ if no exception occurred, copy the replacement to the output
+ and adjust various state variables.
+
+ errors/errorHandler: *errorHandler is looked up from errors via
+ PyCodec_LookupError() when it is still NULL and is kept for later calls.
+ exceptionObject: *exceptionObject is created on first use and afterwards
+ only has its start, end and reason updated.
+ input/insize: the complete input buffer and its length.
+ startinpos/endinpos: bounds of the faulty input; *endinpos is set to
+ the position returned by the callback.
+ inptr: set to input + the position returned by the callback.
+ output/outpos/outptr: output object, current output index and write
+ pointer; *output may be resized, *outpos and *outptr are advanced past
+ the copied replacement.
+
+ return 0 on success, -1 on error
+*/
+
+static
+int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
+ const char *encoding, const char *reason,
+ const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
+ PyObject **output, int *outpos, Py_UNICODE **outptr)
+{
+ /* format for parsing the callback result; the text after ';' is the
+ error message */
+ static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
+
+ PyObject *restuple = NULL;
+ PyObject *repunicode = NULL;
+ int outsize = PyUnicode_GET_SIZE(*output);
+ int requiredsize;
+ int newpos;
+ Py_UNICODE *repptr;
+ int repsize;
+ int res = -1;
+
+ /* look up the callback only once; later calls reuse *errorHandler */
+ if (*errorHandler == NULL) {
+ *errorHandler = PyCodec_LookupError(errors);
+ if (*errorHandler == NULL)
+ goto onError;
+ }
+
+ /* create the exception object on first use, otherwise just update it */
+ if (*exceptionObject == NULL) {
+ *exceptionObject = PyUnicodeDecodeError_Create(
+ encoding, input, insize, *startinpos, *endinpos, reason);
+ if (*exceptionObject == NULL)
+ goto onError;
+ }
+ else {
+ if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
+ goto onError;
+ if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
+ goto onError;
+ if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
+ goto onError;
+ }
+
+ restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
+ if (restuple == NULL)
+ goto onError;
+ if (!PyTuple_Check(restuple)) {
+ /* &argparse[4] skips the "O!i;" prefix, leaving only the message */
+ PyErr_Format(PyExc_TypeError, &argparse[4]);
+ goto onError;
+ }
+ if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
+ goto onError;
+ /* clamp the position returned by the callback to [0, insize] */
+ if (newpos<0)
+ newpos = 0;
+ else if (newpos>insize)
+ newpos = insize;
+
+ /* need more space? (at least enough for what we
+ have+the replacement+the rest of the string (starting
+ at the new input position), so we won't have to check space
+ when there are no errors in the rest of the string) */
+ repptr = PyUnicode_AS_UNICODE(repunicode);
+ repsize = PyUnicode_GET_SIZE(repunicode);
+ requiredsize = *outpos + repsize + insize-newpos;
+ if (requiredsize > outsize) {
+ /* grow to at least twice the current size */
+ if (requiredsize<2*outsize)
+ requiredsize = 2*outsize;
+ if (PyUnicode_Resize(output, requiredsize))
+ goto onError;
+ /* recompute the write pointer from the (possibly new) buffer */
+ *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
+ }
+ *endinpos = newpos;
+ *inptr = input + newpos;
+ Py_UNICODE_COPY(*outptr, repptr, repsize);
+ *outptr += repsize;
+ *outpos += repsize;
+ /* we made it! */
+ res = 0;
+
+ /* reached on success as well as on failure; only restuple is released
+ here (NOTE(review): repunicode is presumably a borrowed reference
+ owned by restuple - confirm) */
+ onError:
+ Py_XDECREF(restuple);
+ return res;
+}
+
/* --- UTF-7 Codec -------------------------------------------------------- */
/* see RFC2152 for details */
@@ -738,40 +824,14 @@ char utf7_special[128] = {
} \
} \
-static
-int utf7_decoding_error(Py_UNICODE **dest,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "UTF-7 decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- if (dest != NULL) {
- **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
- (*dest)++;
- }
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "UTF-7 decoding error; unknown error handling code: %.400s",
- errors);
- return -1;
- }
-}
-
PyObject *PyUnicode_DecodeUTF7(const char *s,
int size,
const char *errors)
{
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
const char *e;
PyUnicodeObject *unicode;
Py_UNICODE *p;
@@ -779,7 +839,9 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
int inShift = 0;
unsigned int bitsleft = 0;
unsigned long charsleft = 0;
- int surrogate = 0;
+ int surrogate = 0;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
unicode = _PyUnicode_New(size);
if (!unicode)
@@ -791,7 +853,9 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
e = s + size;
while (s < e) {
- Py_UNICODE ch = *s;
+ Py_UNICODE ch;
+ restart:
+ ch = *s;
if (inShift) {
if ((ch == '-') || !B64CHAR(ch)) {
@@ -836,6 +900,7 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
}
}
else if ( ch == '+' ) {
+ startinpos = s-starts;
s++;
if (s < e && *s == '-') {
s++;
@@ -857,21 +922,39 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
}
continue;
utf7Error:
- if (utf7_decoding_error(&p, errors, errmsg))
- goto onError;
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ endinpos = s-starts;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf7", errmsg,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&unicode, &outpos, &p))
+ goto onError;
}
if (inShift) {
- if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ endinpos = size;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf7", "unterminated shift sequence",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&unicode, &outpos, &p))
goto onError;
+ if (s < e)
+ goto restart;
}
- if (_PyUnicode_Resize(&unicode, p - unicode->str))
+ if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)unicode;
onError:
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
Py_DECREF(unicode);
return NULL;
}
@@ -1001,46 +1084,21 @@ char utf8_code_length[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
-static
-int utf8_decoding_error(const char **source,
- Py_UNICODE **dest,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "UTF-8 decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- (*source)++;
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- (*source)++;
- **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
- (*dest)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "UTF-8 decoding error; unknown error handling code: %.400s",
- errors);
- return -1;
- }
-}
-
PyObject *PyUnicode_DecodeUTF8(const char *s,
int size,
const char *errors)
{
+ const char *starts = s;
int n;
+ int startinpos;
+ int endinpos;
+ int outpos;
const char *e;
PyUnicodeObject *unicode;
Py_UNICODE *p;
const char *errmsg = "";
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Note: size will always be longer than the resulting Unicode
character count */
@@ -1067,6 +1125,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
if (s + n > e) {
errmsg = "unexpected end of data";
+ startinpos = s-starts;
+ endinpos = size;
goto utf8Error;
}
@@ -1074,19 +1134,27 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
case 0:
errmsg = "unexpected code byte";
+ startinpos = s-starts;
+ endinpos = startinpos+1;
goto utf8Error;
case 1:
errmsg = "internal error";
+ startinpos = s-starts;
+ endinpos = startinpos+1;
goto utf8Error;
case 2:
if ((s[1] & 0xc0) != 0x80) {
errmsg = "invalid data";
+ startinpos = s-starts;
+ endinpos = startinpos+2;
goto utf8Error;
}
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
if (ch < 0x80) {
+ startinpos = s-starts;
+ endinpos = startinpos+2;
errmsg = "illegal encoding";
goto utf8Error;
}
@@ -1098,6 +1166,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80) {
errmsg = "invalid data";
+ startinpos = s-starts;
+ endinpos = startinpos+3;
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
@@ -1110,6 +1180,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
unit.
*/
errmsg = "illegal encoding";
+ startinpos = s-starts;
+ endinpos = startinpos+3;
goto utf8Error;
}
else
@@ -1121,6 +1193,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80) {
errmsg = "invalid data";
+ startinpos = s-starts;
+ endinpos = startinpos+4;
goto utf8Error;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
@@ -1132,6 +1206,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
UTF-16 */
{
errmsg = "illegal encoding";
+ startinpos = s-starts;
+ endinpos = startinpos+4;
goto utf8Error;
}
#ifdef Py_UNICODE_WIDE
@@ -1153,23 +1229,34 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
default:
/* Other sizes are only needed for UCS-4 */
errmsg = "unsupported Unicode code range";
+ startinpos = s-starts;
+ endinpos = startinpos+n;
goto utf8Error;
}
s += n;
continue;
utf8Error:
- if (utf8_decoding_error(&s, &p, errors, errmsg))
- goto onError;
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf8", errmsg,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&unicode, &outpos, &p))
+ goto onError;
}
/* Adjust length */
if (_PyUnicode_Resize(&unicode, p - unicode->str))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)unicode;
onError:
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
Py_DECREF(unicode);
return NULL;
}
@@ -1287,43 +1374,16 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
/* --- UTF-16 Codec ------------------------------------------------------- */
-static
-int utf16_decoding_error(Py_UNICODE **dest,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "UTF-16 decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- if (dest) {
- **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
- (*dest)++;
- }
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "UTF-16 decoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
-}
-
PyObject *
PyUnicode_DecodeUTF16(const char *s,
int size,
const char *errors,
int *byteorder)
{
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
PyUnicodeObject *unicode;
Py_UNICODE *p;
const unsigned char *q, *e;
@@ -1335,13 +1395,8 @@ PyUnicode_DecodeUTF16(const char *s,
#else
int ihi = 0, ilo = 1;
#endif
-
- /* size should be an even number */
- if (size & 1) {
- if (utf16_decoding_error(NULL, errors, "truncated data"))
- return NULL;
- --size; /* else ignore the oddball byte */
- }
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Note: size will always be longer than the resulting Unicode
character count */
@@ -1398,7 +1453,18 @@ PyUnicode_DecodeUTF16(const char *s,
}
while (q < e) {
- Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
+ Py_UNICODE ch;
+ /* remaining bytes at the end? (size should be even) */
+ if (e-q<2) {
+ errmsg = "truncated data";
+ startinpos = ((const char *)q)-starts;
+ endinpos = ((const char *)e)-starts;
+ goto utf16Error;
+ /* The remaining input chars are ignored if the callback
+ chooses to skip the input */
+ }
+ ch = (q[ihi] << 8) | q[ilo];
+
q += 2;
if (ch < 0xD800 || ch > 0xDFFF) {
@@ -1409,6 +1475,8 @@ PyUnicode_DecodeUTF16(const char *s,
/* UTF-16 code pair: */
if (q >= e) {
errmsg = "unexpected end of data";
+ startinpos = (((const char *)q)-2)-starts;
+ endinpos = ((const char *)e)-starts;
goto utf16Error;
}
if (0xD800 <= ch && ch <= 0xDBFF) {
@@ -1425,15 +1493,24 @@ PyUnicode_DecodeUTF16(const char *s,
}
else {
errmsg = "illegal UTF-16 surrogate";
+ startinpos = (((const char *)q)-4)-starts;
+ endinpos = startinpos+2;
goto utf16Error;
}
}
errmsg = "illegal encoding";
+ startinpos = (((const char *)q)-2)-starts;
+ endinpos = startinpos+2;
/* Fall through to report the error */
utf16Error:
- if (utf16_decoding_error(&p, errors, errmsg))
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf16", errmsg,
+ starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
+ (PyObject **)&unicode, &outpos, &p))
goto onError;
}
@@ -1444,10 +1521,14 @@ PyUnicode_DecodeUTF16(const char *s,
if (_PyUnicode_Resize(&unicode, p - unicode->str))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)unicode;
onError:
Py_DECREF(unicode);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
}
@@ -1528,63 +1609,43 @@ PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
/* --- Unicode Escape Codec ----------------------------------------------- */
-static
-int unicodeescape_decoding_error(Py_UNICODE **x,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "Unicode-Escape decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- **x = Py_UNICODE_REPLACEMENT_CHARACTER;
- (*x)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "Unicode-Escape decoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
-}
-
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
int size,
const char *errors)
{
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
+ int i;
PyUnicodeObject *v;
- Py_UNICODE *p, *buf;
+ Py_UNICODE *p;
const char *end;
char* message;
Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the
- length after conversion to the true value. */
+ length after conversion to the true value.
+ (but if the error callback returns a long replacement string
+ we'll have to allocate more space) */
v = _PyUnicode_New(size);
if (v == NULL)
goto onError;
if (size == 0)
return (PyObject *)v;
- p = buf = PyUnicode_AS_UNICODE(v);
+ p = PyUnicode_AS_UNICODE(v);
end = s + size;
while (s < end) {
unsigned char c;
Py_UNICODE x;
- int i, digits;
+ int digits;
/* Non-escape characters are interpreted as Unicode ordinals */
if (*s != '\\') {
@@ -1592,6 +1653,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
continue;
}
+ startinpos = s-starts;
/* \ - Escapes */
s++;
switch (*s++) {
@@ -1640,14 +1702,28 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
message = "truncated \\UXXXXXXXX escape";
hexescape:
chr = 0;
- for (i = 0; i < digits; i++) {
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (s+digits>end) {
+ endinpos = size;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", "end of string in escape sequence",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
+ goto onError;
+ goto nextByte;
+ }
+ for (i = 0; i < digits; ++i) {
c = (unsigned char) s[i];
if (!isxdigit(c)) {
- if (unicodeescape_decoding_error(&p, errors, message))
+ endinpos = (s+i+1)-starts;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", message,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
- chr = 0xffffffff;
- i++;
- break;
+ goto nextByte;
}
chr = (chr<<4) & ~0xF;
if (c >= '0' && c <= '9')
@@ -1659,9 +1735,9 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
}
s += i;
if (chr == 0xffffffff)
- /* _decoding_error will have already written into the
- target buffer. */
- break;
+ /* _decoding_error will have already written into the
+ target buffer. */
+ break;
store:
/* when we get here, chr is a 32-bit unicode character */
if (chr <= 0xffff)
@@ -1678,10 +1754,13 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
#endif
} else {
- if (unicodeescape_decoding_error(
- &p, errors,
- "illegal Unicode character")
- )
+ endinpos = s-starts;
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", "illegal Unicode character",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
}
break;
@@ -1717,13 +1796,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
goto store;
}
}
- if (unicodeescape_decoding_error(&p, errors, message))
+ endinpos = s-starts;
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", message,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
break;
default:
if (s > end) {
- if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
+ message = "\\ at end of string";
+ s--;
+ endinpos = s-starts;
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "unicodeescape", message,
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
}
else {
@@ -1732,9 +1825,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
}
break;
}
+ nextByte:
+ ;
}
- if (_PyUnicode_Resize(&v, (int)(p - buf)))
- goto onError;
+ if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
+ goto onError;
return (PyObject *)v;
ucnhashError:
@@ -1742,10 +1837,14 @@ ucnhashError:
PyExc_UnicodeError,
"\\N escapes not supported (can't load unicodedata module)"
);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
onError:
Py_XDECREF(v);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
}
@@ -1909,20 +2008,27 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
int size,
const char *errors)
{
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
PyUnicodeObject *v;
- Py_UNICODE *p, *buf;
+ Py_UNICODE *p;
const char *end;
const char *bs;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the
- length after conversion to the true value. */
+ length after conversion to the true value. (But the decoding error
+ handler might have to resize the string.) */
v = _PyUnicode_New(size);
if (v == NULL)
goto onError;
if (size == 0)
return (PyObject *)v;
- p = buf = PyUnicode_AS_UNICODE(v);
+ p = PyUnicode_AS_UNICODE(v);
end = s + size;
while (s < end) {
unsigned char c;
@@ -1934,6 +2040,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
*p++ = (unsigned char)*s++;
continue;
}
+ startinpos = s-starts;
/* \u-escapes are only interpreted iff the number of leading
backslashes if odd */
@@ -1952,15 +2059,18 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
s++;
/* \uXXXX with 4 hex digits */
- for (x = 0, i = 0; i < 4; i++) {
- c = (unsigned char)s[i];
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ for (x = 0, i = 0; i < 4; ++i, ++s) {
+ c = (unsigned char)*s;
if (!isxdigit(c)) {
- if (unicodeescape_decoding_error(&p, errors,
- "truncated \\uXXXX"))
+ endinpos = s-starts;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "rawunicodeescape", "truncated \\uXXXX",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
- x = 0xffffffff;
- i++;
- break;
+ goto nextByte;
}
x = (x<<4) & ~0xF;
if (c >= '0' && c <= '9')
@@ -1970,16 +2080,20 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
else
x += 10 + c - 'A';
}
- s += i;
- if (x != 0xffffffff)
- *p++ = x;
+ *p++ = x;
+ nextByte:
+ ;
}
- if (_PyUnicode_Resize(&v, (int)(p - buf)))
+ if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)v;
onError:
Py_XDECREF(v);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
}
@@ -2059,71 +2173,271 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
return NULL;
}
-static
-int latin1_encoding_error(const Py_UNICODE **source,
- char **dest,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "Latin-1 encoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- **dest = '?';
- (*dest)++;
- return 0;
+/* create or adjust a UnicodeEncodeError */
+static void make_encode_exception(PyObject **exceptionObject,
+ const char *encoding,
+ const Py_UNICODE *unicode, int size,
+ int startpos, int endpos,
+ const char *reason)
+{
+ if (*exceptionObject == NULL) {
+ *exceptionObject = PyUnicodeEncodeError_Create(
+ encoding, unicode, size, startpos, endpos, reason);
}
else {
- PyErr_Format(PyExc_ValueError,
- "Latin-1 encoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
+ if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
+ goto onError;
+ if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
+ goto onError;
+ if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
+ goto onError;
+ return;
+ onError:
+ Py_DECREF(*exceptionObject);
+ *exceptionObject = NULL;
}
}
-PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
- int size,
- const char *errors)
+/* raises a UnicodeEncodeError */
+static void raise_encode_exception(PyObject **exceptionObject,
+ const char *encoding,
+ const Py_UNICODE *unicode, int size,
+ int startpos, int endpos,
+ const char *reason)
{
- PyObject *repr;
- char *s, *start;
+ make_encode_exception(exceptionObject,
+ encoding, unicode, size, startpos, endpos, reason);
+ if (*exceptionObject != NULL)
+ PyCodec_StrictErrors(*exceptionObject);
+}
- repr = PyString_FromStringAndSize(NULL, size);
- if (repr == NULL)
- return NULL;
- if (size == 0)
- return repr;
+/* error handling callback helper:
+ build arguments, call the callback and check the arguments,
+ put the result into newpos and return the replacement string, which
+ has to be freed by the caller */
+static PyObject *unicode_encode_call_errorhandler(const char *errors,
+ PyObject **errorHandler,
+ const char *encoding, const char *reason,
+ const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
+ int startpos, int endpos,
+ int *newpos)
+{
+ static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
- s = PyString_AS_STRING(repr);
- start = s;
- while (size-- > 0) {
- Py_UNICODE ch = *p++;
- if (ch >= 256) {
- if (latin1_encoding_error(&p, &s, errors,
- "ordinal not in range(256)"))
- goto onError;
+ PyObject *restuple;
+ PyObject *resunicode;
+
+ if (*errorHandler == NULL) {
+ *errorHandler = PyCodec_LookupError(errors);
+ if (*errorHandler == NULL)
+ return NULL;
+ }
+
+ make_encode_exception(exceptionObject,
+ encoding, unicode, size, startpos, endpos, reason);
+ if (*exceptionObject == NULL)
+ return NULL;
+
+ restuple = PyObject_CallFunctionObjArgs(
+ *errorHandler, *exceptionObject, NULL);
+ if (restuple == NULL)
+ return NULL;
+ if (!PyTuple_Check(restuple)) {
+ PyErr_Format(PyExc_TypeError, &argparse[4]);
+ Py_DECREF(restuple);
+ return NULL;
+ }
+ if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+ &resunicode, newpos)) {
+ Py_DECREF(restuple);
+ return NULL;
+ }
+ if (*newpos<0)
+ *newpos = 0;
+ else if (*newpos>size)
+ *newpos = size;
+ Py_INCREF(resunicode);
+ Py_DECREF(restuple);
+ return resunicode;
+}
+
+static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
+ int size,
+ const char *errors,
+ int limit)
+{
+ /* output object */
+ PyObject *res;
+ /* pointers to the beginning and end+1 of input */
+ const Py_UNICODE *startp = p;
+ const Py_UNICODE *endp = p + size;
+ /* pointer to the beginning of the unencodable characters */
+ /* const Py_UNICODE *badp = NULL; */
+ /* pointer into the output */
+ char *str;
+ /* current output position */
+ int respos = 0;
+ int ressize;
+ char *encoding = (limit == 256) ? "latin-1" : "ascii";
+ char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ /* the following variable is used for caching string comparisons
+ * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
+ int known_errorHandler = -1;
+
+ /* allocate enough for a simple encoding without
+ replacements, if we need more, we'll resize */
+ res = PyString_FromStringAndSize(NULL, size);
+ if (res == NULL)
+ goto onError;
+ if (size == 0)
+ return res;
+ str = PyString_AS_STRING(res);
+ ressize = size;
+
+ while (p<endp) {
+ Py_UNICODE c = *p;
+
+ /* can we encode this? */
+ if (c<limit) {
+ /* no overflow check, because we know that the space is enough */
+ *str++ = (char)c;
+ ++p;
+ }
+ else {
+ int unicodepos = p-startp;
+ int requiredsize;
+ PyObject *repunicode;
+ int repsize;
+ int newpos;
+ int respos;
+ Py_UNICODE *uni2;
+ /* startpos for collecting unencodable chars */
+ const Py_UNICODE *collstart = p;
+ const Py_UNICODE *collend = p;
+ /* find all unencodable characters */
+ while ((collend < endp) && ((*collend)>=limit))
+ ++collend;
+ /* cache callback name lookup (if not done yet, i.e. it's the first error) */
+ if (known_errorHandler==-1) {
+ if ((errors==NULL) || (!strcmp(errors, "strict")))
+ known_errorHandler = 1;
+ else if (!strcmp(errors, "replace"))
+ known_errorHandler = 2;
+ else if (!strcmp(errors, "ignore"))
+ known_errorHandler = 3;
+ else if (!strcmp(errors, "xmlcharrefreplace"))
+ known_errorHandler = 4;
+ else
+ known_errorHandler = 0;
+ }
+ switch (known_errorHandler) {
+ case 1: /* strict */
+ raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
+ goto onError;
+ case 2: /* replace */
+ while (collstart++<collend)
+ *str++ = '?'; /* fall through */
+ case 3: /* ignore */
+ p = collend;
+ break;
+ case 4: /* xmlcharrefreplace */
+ respos = str-PyString_AS_STRING(res);
+ /* determine replacement size (temporarily (mis)uses p) */
+ for (p = collstart, repsize = 0; p < collend; ++p) {
+ if (*p<10)
+ repsize += 2+1+1;
+ else if (*p<100)
+ repsize += 2+2+1;
+ else if (*p<1000)
+ repsize += 2+3+1;
+ else if (*p<10000)
+ repsize += 2+4+1;
+ else if (*p<100000)
+ repsize += 2+5+1;
+ else if (*p<1000000)
+ repsize += 2+6+1;
+ else
+ repsize += 2+7+1;
+ }
+ requiredsize = respos+repsize+(endp-collend);
+ if (requiredsize > ressize) {
+ if (requiredsize<2*ressize)
+ requiredsize = 2*ressize;
+ if (_PyString_Resize(&res, requiredsize))
+ goto onError;
+ str = PyString_AS_STRING(res) + respos;
+ ressize = requiredsize;
+ }
+ /* generate replacement (temporarily (mis)uses p) */
+ for (p = collstart; p < collend; ++p) {
+ str += sprintf(str, "&#%d;", (int)*p);
+ }
+ p = collend;
+ break;
+ default:
+ repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
+ encoding, reason, startp, size, &exc,
+ collstart-startp, collend-startp, &newpos);
+ if (repunicode == NULL)
+ goto onError;
+ /* need more space? (at least enough for what we
+ have+the replacement+the rest of the string, so
+ we won't have to check space for encodable characters) */
+ respos = str-PyString_AS_STRING(res);
+ repsize = PyUnicode_GET_SIZE(repunicode);
+ requiredsize = respos+repsize+(endp-collend);
+ if (requiredsize > ressize) {
+ if (requiredsize<2*ressize)
+ requiredsize = 2*ressize;
+ if (_PyString_Resize(&res, requiredsize)) {
+ Py_DECREF(repunicode);
+ goto onError;
+ }
+ str = PyString_AS_STRING(res) + respos;
+ ressize = requiredsize;
+ }
+ /* check if there is anything unencodable in the replacement
+ and copy it to the output */
+ for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
+ c = *uni2;
+ if (c >= limit) {
+ raise_encode_exception(&exc, encoding, startp, size,
+ unicodepos, unicodepos+1, reason);
+ Py_DECREF(repunicode);
+ goto onError;
+ }
+ *str = (char)c;
+ }
+ p = startp + newpos;
+ Py_DECREF(repunicode);
+ }
}
- else
- *s++ = (char)ch;
}
- /* Resize if error handling skipped some characters */
- if (s - start < PyString_GET_SIZE(repr))
- _PyString_Resize(&repr, s - start);
- return repr;
+ /* Resize if we allocated too much */
+ respos = str-PyString_AS_STRING(res);
+ if (respos<ressize)
+ /* If this fails res will be NULL */
+ _PyString_Resize(&res, respos);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+ return res;
- onError:
- Py_DECREF(repr);
+ onError:
+ Py_XDECREF(res);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
}
+PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
+ int size,
+ const char *errors)
+{
+ return unicode_encode_ucs1(p, size, errors, 256);
+}
+
PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
{
if (!PyUnicode_Check(unicode)) {
@@ -2137,42 +2451,19 @@ PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
/* --- 7-bit ASCII Codec -------------------------------------------------- */
-static
-int ascii_decoding_error(const char **source,
- Py_UNICODE **dest,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "ASCII decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
- (*dest)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "ASCII decoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
-}
-
PyObject *PyUnicode_DecodeASCII(const char *s,
int size,
const char *errors)
{
+ const char *starts = s;
PyUnicodeObject *v;
Py_UNICODE *p;
+ int startinpos;
+ int endinpos;
+ int outpos;
+ const char *e;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && *(unsigned char*)s < 128) {
@@ -2186,89 +2477,44 @@ PyObject *PyUnicode_DecodeASCII(const char *s,
if (size == 0)
return (PyObject *)v;
p = PyUnicode_AS_UNICODE(v);
- while (size-- > 0) {
- register unsigned char c;
-
- c = (unsigned char)*s++;
- if (c < 128)
+ e = s + size;
+ while (s < e) {
+ register unsigned char c = (unsigned char)*s;
+ if (c < 128) {
*p++ = c;
- else if (ascii_decoding_error(&s, &p, errors,
- "ordinal not in range(128)"))
+ ++s;
+ }
+ else {
+ startinpos = s-starts;
+ endinpos = startinpos + 1;
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "ascii", "ordinal not in range(128)",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p))
goto onError;
+ }
}
if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)v;
onError:
Py_XDECREF(v);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return NULL;
}
-static
-int ascii_encoding_error(const Py_UNICODE **source,
- char **dest,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "ASCII encoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- **dest = '?';
- (*dest)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "ASCII encoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
-}
-
PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
int size,
const char *errors)
{
- PyObject *repr;
- char *s, *start;
-
- repr = PyString_FromStringAndSize(NULL, size);
- if (repr == NULL)
- return NULL;
- if (size == 0)
- return repr;
-
- s = PyString_AS_STRING(repr);
- start = s;
- while (size-- > 0) {
- Py_UNICODE ch = *p++;
- if (ch >= 128) {
- if (ascii_encoding_error(&p, &s, errors,
- "ordinal not in range(128)"))
- goto onError;
- }
- else
- *s++ = (char)ch;
- }
- /* Resize if error handling skipped some characters */
- if (s - start < PyString_GET_SIZE(repr))
- _PyString_Resize(&repr, s - start);
- return repr;
-
- onError:
- Py_DECREF(repr);
- return NULL;
+ return unicode_encode_ucs1(p, size, errors, 128);
}
PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
@@ -2348,44 +2594,21 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
/* --- Character Mapping Codec -------------------------------------------- */
-static
-int charmap_decoding_error(const char **source,
- Py_UNICODE **dest,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "charmap decoding error: %.400s",
- details);
- return -1;
- }
- else if (strcmp(errors,"ignore") == 0) {
- return 0;
- }
- else if (strcmp(errors,"replace") == 0) {
- **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
- (*dest)++;
- return 0;
- }
- else {
- PyErr_Format(PyExc_ValueError,
- "charmap decoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
- }
-}
-
PyObject *PyUnicode_DecodeCharmap(const char *s,
int size,
PyObject *mapping,
const char *errors)
{
+ const char *starts = s;
+ int startinpos;
+ int endinpos;
+ int outpos;
+ const char *e;
PyUnicodeObject *v;
Py_UNICODE *p;
int extrachars = 0;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
/* Default to Latin-1 */
if (mapping == NULL)
@@ -2397,8 +2620,9 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
if (size == 0)
return (PyObject *)v;
p = PyUnicode_AS_UNICODE(v);
- while (size-- > 0) {
- unsigned char ch = *s++;
+ e = s + size;
+ while (s < e) {
+ unsigned char ch = *s;
PyObject *w, *x;
/* Get mapping (char ordinal -> integer, Unicode char or None) */
@@ -2430,11 +2654,18 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
}
else if (x == Py_None) {
/* undefined mapping */
- if (charmap_decoding_error(&s, &p, errors,
- "character maps to <undefined>")) {
+ outpos = p-PyUnicode_AS_UNICODE(v);
+ startinpos = s-starts;
+ endinpos = startinpos+1;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "charmap", "character maps to <undefined>",
+ starts, size, &startinpos, &endinpos, &exc, &s,
+ (PyObject **)&v, &outpos, &p)) {
Py_DECREF(x);
goto onError;
}
+ continue;
}
else if (PyUnicode_Check(x)) {
int targetsize = PyUnicode_GET_SIZE(x);
@@ -2474,45 +2705,233 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
goto onError;
}
Py_DECREF(x);
+ ++s;
}
if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
goto onError;
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
return (PyObject *)v;
onError:
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
Py_XDECREF(v);
return NULL;
}
-static
-int charmap_encoding_error(const Py_UNICODE **source,
- char **dest,
- const char *errors,
- const char *details)
-{
- if ((errors == NULL) ||
- (strcmp(errors,"strict") == 0)) {
- PyErr_Format(PyExc_UnicodeError,
- "charmap encoding error: %.400s",
- details);
- return -1;
+/* Look up the character c in the mapping (keyed by its integer ordinal).
+   Returns a new reference to the mapped value, which is an int in
+   range(256), a str, or Py_None. Py_None means "no encoding defined" and
+   is returned both when the lookup raises a LookupError and when the
+   mapping explicitly maps the character to None.
+   Returns NULL with an exception set on any other failure: the ordinal
+   could not be boxed, the lookup raised something other than a
+   LookupError, an int result is outside range(256), or the result has an
+   unsupported type. */
+static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
+{
+    PyObject *w = PyInt_FromLong((long)c);
+    PyObject *x;
+
+    if (w == NULL)
+        return NULL;
+    x = PyObject_GetItem(mapping, w);
+    Py_DECREF(w);
+    if (x == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
+            /* No mapping found means: mapping is undefined. */
+            PyErr_Clear();
+            x = Py_None;
+            Py_INCREF(x);
+            return x;
+        } else
+            return NULL;
     }
-    else if (strcmp(errors,"ignore") == 0) {
-        return 0;
+    else if (x == Py_None) {
+        /* An explicit None entry also means "undefined"; hand it back so
+           that the callers' Py_None checks trigger the error handling
+           instead of raising the TypeError below. */
+        return x;
+    }
+    else if (PyInt_Check(x)) {
+        long value = PyInt_AS_LONG(x);
+        if (value < 0 || value > 255) {
+            PyErr_SetString(PyExc_TypeError,
+                            "character mapping must be in range(256)");
+            Py_DECREF(x);
+            return NULL;
+        }
+        return x;
     }
-    else if (strcmp(errors,"replace") == 0) {
-        **dest = '?';
-        (*dest)++;
-        return 0;
+    else if (PyString_Check(x))
+        return x;
     else {
+        /* wrong return value */
+        PyErr_SetString(PyExc_TypeError,
+                        "character mapping must return integer, None or str");
+        Py_DECREF(x);
+        return NULL;
     }
+}
+
+/* Look up the character c in mapping, append the result to the output
+ string *outobj at offset *outpos and advance *outpos. The output string
+ is resized (over-allocating to at least twice its old size) when the
+ result does not fit. Returns a new reference to the object that
+ was put in the output buffer, or Py_None, if the mapping was undefined
+ (in which case no character was written), or NULL, if the lookup
+ failed or a reallocation error occurred. The caller must decref the
+ result. */
+static
+PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
+ PyObject **outobj, int *outpos)
+{
+ PyObject *rep = charmapencode_lookup(c, mapping);
+
+ if (rep==NULL)
+ return NULL;
+ else if (rep==Py_None)
+ return rep;
else {
- PyErr_Format(PyExc_ValueError,
- "charmap encoding error; "
- "unknown error handling code: %.400s",
- errors);
- return -1;
+ char *outstart = PyString_AS_STRING(*outobj);
+ int outsize = PyString_GET_SIZE(*outobj);
+ if (PyInt_Check(rep)) { /* integer result: exactly one output byte */
+ int requiredsize = *outpos+1;
+ if (outsize<requiredsize) {
+ /* exponentially overallocate to minimize reallocations */
+ if (requiredsize < 2*outsize)
+ requiredsize = 2*outsize;
+ if (_PyString_Resize(outobj, requiredsize)) {
+ Py_DECREF(rep);
+ return NULL;
+ }
+ outstart = PyString_AS_STRING(*outobj); /* re-fetch the data pointer after the resize */
+ }
+ outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
+ }
+ else { /* string result: copy all of its bytes (possibly none) */
+ const char *repchars = PyString_AS_STRING(rep);
+ int repsize = PyString_GET_SIZE(rep);
+ int requiredsize = *outpos+repsize;
+ if (outsize<requiredsize) {
+ /* exponentially overallocate to minimize reallocations */
+ if (requiredsize < 2*outsize)
+ requiredsize = 2*outsize;
+ if (_PyString_Resize(outobj, requiredsize)) {
+ Py_DECREF(rep);
+ return NULL;
+ }
+ outstart = PyString_AS_STRING(*outobj); /* re-fetch the data pointer after the resize */
+ }
+ memcpy(outstart + *outpos, repchars, repsize);
+ *outpos += repsize;
+ }
+ }
+ return rep;
+}
+
+/* Handle an encoding error in PyUnicode_EncodeCharmap.
+
+   On entry *inpos is the index (into p, which holds size characters) of a
+   character for which the mapping is undefined. The run of consecutive
+   undefined characters starting there is collected and then handled
+   according to errors:
+     strict            - raise the error via raise_encode_exception()
+     replace           - output one '?' per character
+     ignore            - output nothing
+     xmlcharrefreplace - output "&#N;" for each character
+     any other name    - call the registered error handler and output the
+                         replacement string it returns
+   Every replacement character is itself passed through the mapping; if
+   one of them is undefined too, the original error is raised.
+   *known_errorHandler caches which case errors selects (-1 = not yet
+   determined). Output is appended to *res at *respos (both may be
+   updated) and *inpos is set to the position where encoding continues.
+   Return 0 on success, -1 on error */
+static
+int charmap_encoding_error(
+    const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
+    PyObject **exceptionObject,
+    int *known_errorHandler, PyObject *errorHandler, const char *errors,
+    PyObject **res, int *respos)
+{
+    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+    /* errorHandler is received by value, so a handler looked up by
+       unicode_encode_call_errorhandler() cannot be handed back to the
+       caller. It is tracked in this local and released before returning.
+       NOTE(review): assumes the helper stores a new reference in the
+       pointer when it is NULL on entry, as its translate counterpart
+       does - confirm. */
+    PyObject *handler = errorHandler;
+    int repsize;
+    int newpos;
+    Py_UNICODE *uni2;
+    /* startpos for collecting unencodable chars */
+    int collstartpos = *inpos;
+    int collendpos = *inpos+1;
+    int collpos;
+    char *encoding = "charmap";
+    char *reason = "character maps to <undefined>";
+
+    PyObject *x;
+    /* find all unencodable characters */
+    while (collendpos < size) {
+        x = charmapencode_lookup(p[collendpos], mapping);
+        if (x==NULL)
+            return -1;
+        else if (x!=Py_None) {
+            Py_DECREF(x);
+            break;
+        }
+        Py_DECREF(x);
+        ++collendpos;
+    }
+    /* cache callback name lookup
+     * (if not done yet, i.e. it's the first error) */
+    if (*known_errorHandler==-1) {
+        if ((errors==NULL) || (!strcmp(errors, "strict")))
+            *known_errorHandler = 1;
+        else if (!strcmp(errors, "replace"))
+            *known_errorHandler = 2;
+        else if (!strcmp(errors, "ignore"))
+            *known_errorHandler = 3;
+        else if (!strcmp(errors, "xmlcharrefreplace"))
+            *known_errorHandler = 4;
+        else
+            *known_errorHandler = 0;
+    }
+    switch (*known_errorHandler) {
+    case 1: /* strict */
+        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+        return -1;
+    case 2: /* replace */
+        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
+            x = charmapencode_output('?', mapping, res, respos);
+            if (x==NULL) {
+                return -1;
+            }
+            else if (x==Py_None) {
+                /* the replacement itself is unencodable */
+                Py_DECREF(x);
+                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+                return -1;
+            }
+            Py_DECREF(x);
+        }
+        /* fall through */
+    case 3: /* ignore */
+        *inpos = collendpos;
+        break;
+    case 4: /* xmlcharrefreplace */
+        /* generate replacement */
+        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
+            char buffer[2+29+1+1];
+            char *cp;
+            sprintf(buffer, "&#%d;", (int)p[collpos]);
+            for (cp = buffer; *cp; ++cp) {
+                x = charmapencode_output(*cp, mapping, res, respos);
+                if (x==NULL)
+                    return -1;
+                else if (x==Py_None) {
+                    Py_DECREF(x);
+                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+                    return -1;
+                }
+                Py_DECREF(x);
+            }
+        }
+        *inpos = collendpos;
+        break;
+    default:
+        repunicode = unicode_encode_call_errorhandler(errors, &handler,
+            encoding, reason, p, size, exceptionObject,
+            collstartpos, collendpos, &newpos);
+        /* release a handler reference acquired by the call above; a
+           handler supplied by the caller stays owned by the caller */
+        if (handler != errorHandler) {
+            Py_XDECREF(handler);
+        }
+        if (repunicode == NULL)
+            return -1;
+        /* generate replacement */
+        repsize = PyUnicode_GET_SIZE(repunicode);
+        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
+            x = charmapencode_output(*uni2, mapping, res, respos);
+            if (x==NULL) {
+                Py_DECREF(repunicode);
+                return -1;
+            }
+            else if (x==Py_None) {
+                Py_DECREF(repunicode);
+                Py_DECREF(x);
+                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+                return -1;
+            }
+            Py_DECREF(x);
+        }
+        *inpos = newpos;
+        Py_DECREF(repunicode);
     }
+    return 0;
 }
PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
@@ -2520,101 +2939,62 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
PyObject *mapping,
const char *errors)
{
- PyObject *v;
- char *s;
- int extrachars = 0;
+ /* output object */
+ PyObject *res = NULL;
+ /* current input position */
+ int inpos = 0;
+ /* current output position */
+ int respos = 0;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ /* the following variable is used for caching string comparisons
+ * -1=not initialized, 0=unknown, 1=strict, 2=replace,
+ * 3=ignore, 4=xmlcharrefreplace */
+ int known_errorHandler = -1;
/* Default to Latin-1 */
if (mapping == NULL)
return PyUnicode_EncodeLatin1(p, size, errors);
- v = PyString_FromStringAndSize(NULL, size);
- if (v == NULL)
- return NULL;
+ /* allocate enough for a simple encoding without
+ replacements, if we need more, we'll resize */
+ res = PyString_FromStringAndSize(NULL, size);
+ if (res == NULL)
+ goto onError;
if (size == 0)
- return v;
- s = PyString_AS_STRING(v);
- while (size-- > 0) {
- Py_UNICODE ch = *p++;
- PyObject *w, *x;
+ return res;
- /* Get mapping (Unicode ordinal -> string char, integer or None) */
- w = PyInt_FromLong((long)ch);
- if (w == NULL)
+ while (inpos<size) {
+ /* try to encode it */
+ PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
+ if (x==NULL) /* error */
goto onError;
- x = PyObject_GetItem(mapping, w);
- Py_DECREF(w);
- if (x == NULL) {
- if (PyErr_ExceptionMatches(PyExc_LookupError)) {
- /* No mapping found means: mapping is undefined. */
- PyErr_Clear();
- x = Py_None;
- Py_INCREF(x);
- } else
+ if (x==Py_None) { /* unencodable character */
+ if (charmap_encoding_error(p, size, &inpos, mapping,
+ &exc,
+ &known_errorHandler, errorHandler, errors,
+ &res, &respos))
goto onError;
}
+ else
+ /* done with this character => adjust input position */
+ ++inpos;
+ Py_DECREF(x);
+ }
- /* Apply mapping */
- if (PyInt_Check(x)) {
- long value = PyInt_AS_LONG(x);
- if (value < 0 || value > 255) {
- PyErr_SetString(PyExc_TypeError,
- "character mapping must be in range(256)");
- Py_DECREF(x);
- goto onError;
- }
- *s++ = (char)value;
- }
- else if (x == Py_None) {
- /* undefined mapping */
- if (charmap_encoding_error(&p, &s, errors,
- "character maps to <undefined>")) {
- Py_DECREF(x);
- goto onError;
- }
- }
- else if (PyString_Check(x)) {
- int targetsize = PyString_GET_SIZE(x);
-
- if (targetsize == 1)
- /* 1-1 mapping */
- *s++ = *PyString_AS_STRING(x);
-
- else if (targetsize > 1) {
- /* 1-n mapping */
- if (targetsize > extrachars) {
- /* resize first */
- int oldpos = (int)(s - PyString_AS_STRING(v));
- int needed = (targetsize - extrachars) + \
- (targetsize << 2);
- extrachars += needed;
- if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
- Py_DECREF(x);
- goto onError;
- }
- s = PyString_AS_STRING(v) + oldpos;
- }
- memcpy(s, PyString_AS_STRING(x), targetsize);
- s += targetsize;
- extrachars -= targetsize;
- }
- /* 1-0 mapping: skip the character */
- }
- else {
- /* wrong return value */
- PyErr_SetString(PyExc_TypeError,
- "character mapping must return integer, None or unicode");
- Py_DECREF(x);
+ /* Resize if we allocated to much */
+ if (respos<PyString_GET_SIZE(res)) {
+ if (_PyString_Resize(&res, respos))
goto onError;
- }
- Py_DECREF(x);
}
- if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
- _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
- return v;
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
+ return res;
- onError:
- Py_XDECREF(v);
+ onError:
+ Py_XDECREF(res);
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
return NULL;
}
@@ -2631,115 +3011,344 @@ PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
NULL);
}
+/* Make *exceptionObject describe a translate error for the given input
+   range: create a new UnicodeTranslateError if there is none yet,
+   otherwise update start, end and reason of the existing one. If an
+   update fails the object is dropped and *exceptionObject is set to
+   NULL; it is also NULL afterwards if the creation failed. */
+static void make_translate_exception(PyObject **exceptionObject,
+    const Py_UNICODE *unicode, int size,
+    int startpos, int endpos,
+    const char *reason)
+{
+    if (*exceptionObject == NULL) {
+        /* first error: build a fresh exception object */
+        *exceptionObject = PyUnicodeTranslateError_Create(
+            unicode, size, startpos, endpos, reason);
+        return;
+    }
+    /* reuse the existing object; the setters run in order and the
+       first failure stops the chain */
+    if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos) ||
+        PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos) ||
+        PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) {
+        Py_DECREF(*exceptionObject);
+        *exceptionObject = NULL;
+    }
+}
+
+/* Create or update the UnicodeTranslateError in *exceptionObject and,
+   if that succeeded, raise it through PyCodec_StrictErrors(). */
+static void raise_translate_exception(PyObject **exceptionObject,
+    const Py_UNICODE *unicode, int size,
+    int startpos, int endpos,
+    const char *reason)
+{
+    make_translate_exception(exceptionObject,
+        unicode, size, startpos, endpos, reason);
+    if (*exceptionObject == NULL)
+        return;
+    PyCodec_StrictErrors(*exceptionObject);
+}
+
+/* error handling callback helper:
+   Look up the handler registered under errors (cached in *errorHandler
+   after the first call), create or update the UnicodeTranslateError in
+   *exceptionObject, call the handler with it and check the result, which
+   must be a (unicode, int) tuple.
+   The int is stored in *newpos, clamped to the range 0..size. The
+   replacement string is returned as a new reference, which has to be
+   released by the caller. Returns NULL with an exception set on error. */
+static PyObject *unicode_translate_call_errorhandler(const char *errors,
+                 PyObject **errorHandler,
+                 const char *reason,
+                 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
+                 int startpos, int endpos,
+                 int *newpos)
+{
+    static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
+
+    PyObject *restuple;
+    PyObject *resunicode;
+
+    if (*errorHandler == NULL) {
+        *errorHandler = PyCodec_LookupError(errors);
+        if (*errorHandler == NULL)
+            return NULL;
+    }
+
+    make_translate_exception(exceptionObject,
+        unicode, size, startpos, endpos, reason);
+    if (*exceptionObject == NULL)
+        return NULL;
+
+    restuple = PyObject_CallFunctionObjArgs(
+        *errorHandler, *exceptionObject, NULL);
+    if (restuple == NULL)
+        return NULL;
+    if (!PyTuple_Check(restuple)) {
+        /* &argparse[4] skips the "O!i;" prefix and leaves the message.
+           It is passed as plain text, not as a format string. */
+        PyErr_SetString(PyExc_TypeError, &argparse[4]);
+        Py_DECREF(restuple);
+        return NULL;
+    }
+    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+                          &resunicode, newpos)) {
+        Py_DECREF(restuple);
+        return NULL;
+    }
+    /* clamp the continuation position to the input range */
+    if (*newpos<0)
+        *newpos = 0;
+    else if (*newpos>size)
+        *newpos = size;
+    /* resunicode is borrowed from the tuple: take our own reference
+       before the tuple goes away */
+    Py_INCREF(resunicode);
+    Py_DECREF(restuple);
+    return resunicode;
+}
+
+/* Look up the character c in the mapping (keyed by its integer ordinal)
+   and store the outcome in *result:
+     NULL     - the lookup raised a LookupError (use the 1:1 mapping)
+     Py_None  - the mapping is explicitly undefined
+     int      - replacement ordinal in range 0..PyUnicode_GetMax()
+     unicode  - replacement string
+   A non-NULL *result is a new reference that the caller must decref.
+   Return 0 on success, -1 on error (with an exception set; *result is
+   not written and no reference is handed out in that case) */
 static
-int translate_error(const Py_UNICODE **source,
-                    Py_UNICODE **dest,
-                    const char *errors,
-                    const char *details)
-{
-    if ((errors == NULL) ||
-        (strcmp(errors,"strict") == 0)) {
-        PyErr_Format(PyExc_UnicodeError,
-                     "translate error: %.400s",
-                     details);
-        return -1;
+int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
+{
+    PyObject *w = PyInt_FromLong((long)c);
+    PyObject *x;
+
+    if (w == NULL)
+        return -1;
+    x = PyObject_GetItem(mapping, w);
+    Py_DECREF(w);
+    if (x == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
+            /* No mapping found means: use 1:1 mapping. */
+            PyErr_Clear();
+            *result = NULL;
+            return 0;
+        } else
+            return -1;
     }
-    else if (strcmp(errors,"ignore") == 0) {
+    else if (x == Py_None) {
+        *result = x;
         return 0;
     }
-    else if (strcmp(errors,"replace") == 0) {
-        **dest = '?';
-        (*dest)++;
+    else if (PyInt_Check(x)) {
+        long value = PyInt_AS_LONG(x);
+        long max = PyUnicode_GetMax();
+        if (value < 0 || value > max) {
+            PyErr_Format(PyExc_TypeError,
+                         "character mapping must be in range(0x%lx)", max+1);
+            Py_DECREF(x);
+            return -1;
+        }
+        *result = x;
+        return 0;
+    }
+    else if (PyUnicode_Check(x)) {
+        *result = x;
         return 0;
     }
     else {
-        PyErr_Format(PyExc_ValueError,
-                     "translate error; "
-                     "unknown error handling code: %.400s",
-                     errors);
+        /* wrong return value */
+        PyErr_SetString(PyExc_TypeError,
+              "character mapping must return integer, None or unicode");
+        /* x is not handed to the caller, so release it here */
+        Py_DECREF(x);
+        return -1;
+    }
+}
+/* Make sure the output object *outobj can hold at least requiredsize
+   characters. If it has to grow, it is resized to the larger of
+   requiredsize and twice its current size, and *outp (the write pointer
+   into the object) and *outsize are updated to match.
+   Return 0 on success, -1 on error */
+static
+int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
+    int requiredsize)
+{
+    int written;
+
+    /* enough room already: nothing to do */
+    if (requiredsize <= *outsize)
+        return 0;
+
+    /* remember how far we have written, the buffer may move */
+    written = *outp-PyUnicode_AS_UNICODE(*outobj);
+    /* exponentially overallocate to minimize reallocations */
+    if (requiredsize < 2 * *outsize)
+        requiredsize = 2 * *outsize;
+    if (_PyUnicode_Resize(outobj, requiredsize))
+        return -1;
+    *outp = PyUnicode_AS_UNICODE(*outobj) + written;
+    *outsize = requiredsize;
+    return 0;
+}
+/* Look up the character c in mapping, write the result to the output
+ position *outp (advancing it) and store the looked-up object in *res:
+ NULL if there was no mapping (c itself is written), Py_None if the
+ mapping was undefined (in which case no character was written), or a
+ new reference to the int or unicode object that was written.
+ Only the multi-character case grows the output (*outobj, *outsize);
+ the single-character writes are not checked against the buffer size.
+ The caller must decref *res.
+ Return 0 on success, -1 on error. */
+static
+int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
+ PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
+{
+ if (charmaptranslate_lookup(c, mapping, res))
return -1;
+ if (*res==NULL) {
+ /* not found => default to 1:1 mapping */
+ *(*outp)++ = (Py_UNICODE)c;
+ }
+ else if (*res==Py_None)
+ ; /* undefined mapping: write nothing, the caller handles the error */
+ else if (PyInt_Check(*res)) {
+ /* no overflow check, because we know that the space is enough */
+ *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
+ }
+ else if (PyUnicode_Check(*res)) {
+ int repsize = PyUnicode_GET_SIZE(*res);
+ if (repsize==1) {
+ /* no overflow check, because we know that the space is enough */
+ *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
+ }
+ else if (repsize!=0) {
+ /* more than one character: one input character becomes repsize
+ output characters, so repsize-1 additional slots are needed */
+ int requiredsize = *outsize + repsize - 1;
+ if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
+ return -1;
+ memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
+ *outp += repsize;
+ }
}
+ else /* charmaptranslate_lookup only succeeds with the types handled above; no exception is set on this path */
+ return -1;
+ return 0;
}
+/* Translate the size characters starting at p through mapping and return
+   the result as a new unicode object, or NULL with an exception set.
+   For each character the mapping is indexed with its ordinal: a missing
+   entry copies the character unchanged, an int or unicode entry is
+   written to the output, and a None entry is an error that is handled
+   according to errors ("strict", "replace", "ignore",
+   "xmlcharrefreplace" or the name of a registered error handler).
+   Consecutive characters that map to None are handled as one error. */
-PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
+PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                      int size,
                                      PyObject *mapping,
                                      const char *errors)
 {
-    PyUnicodeObject *v;
-    Py_UNICODE *p;
-
+    /* output object */
+    PyObject *res = NULL;
+    /* pointers to the beginning and end+1 of input */
+    const Py_UNICODE *startp = p;
+    const Py_UNICODE *endp = p + size;
+    /* pointer into the output */
+    Py_UNICODE *str;
+    /* current output position */
+    int respos = 0;
+    int ressize;
+    char *reason = "character maps to <undefined>";
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    /* the following variable is used for caching string comparisons
+     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
+     * 3=ignore, 4=xmlcharrefreplace */
+    int known_errorHandler = -1;
+
     if (mapping == NULL) {
         PyErr_BadArgument();
         return NULL;
     }
-
-    /* Output will never be longer than input */
-    v = _PyUnicode_New(size);
-    if (v == NULL)
-        goto onError;
-    if (size == 0)
-        goto done;
-    p = PyUnicode_AS_UNICODE(v);
-    while (size-- > 0) {
-        Py_UNICODE ch = *s++;
-        PyObject *w, *x;
-        /* Get mapping */
-        w = PyInt_FromLong(ch);
-        if (w == NULL)
-            goto onError;
-        x = PyObject_GetItem(mapping, w);
-        Py_DECREF(w);
-        if (x == NULL) {
-            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
-                /* No mapping found: default to 1-1 mapping */
-                PyErr_Clear();
-                *p++ = ch;
-                continue;
-            }
+    /* allocate enough for a simple 1:1 translation without
+       replacements, if we need more, we'll resize */
+    res = PyUnicode_FromUnicode(NULL, size);
+    if (res == NULL)
+        goto onError;
+    if (size == 0)
+        return res;
+    str = PyUnicode_AS_UNICODE(res);
+    ressize = size;
+
+    while (p<endp) {
+        /* try to translate it; x receives the looked-up object */
+        PyObject *x = NULL;
+        if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
+            Py_XDECREF(x);
             goto onError;
         }
-
-        /* Apply mapping */
-        if (PyInt_Check(x))
-            *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
-        else if (x == Py_None) {
-            /* undefined mapping */
-            if (translate_error(&s, &p, errors,
-                                "character maps to <undefined>")) {
-                Py_DECREF(x);
-                goto onError;
+        if (x!=Py_None) { /* it worked => adjust input pointer */
+            /* release the reference handed back by
+               charmaptranslate_output (x is NULL for a 1:1 mapping) */
+            Py_XDECREF(x);
+            ++p;
+        }
+        else { /* untranslatable character */
+            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+            int repsize;
+            int newpos;
+            Py_UNICODE *uni2;
+            /* startpos for collecting untranslatable chars */
+            const Py_UNICODE *collstart = p;
+            const Py_UNICODE *collend = p+1;
+            const Py_UNICODE *coll;
+
+            Py_XDECREF(x);
+            /* find all untranslatable characters */
+            while (collend < endp) {
+                if (charmaptranslate_lookup(*collend, mapping, &x))
+                    goto onError;
+                /* only the pointer value of x is compared below */
+                Py_XDECREF(x);
+                if (x!=Py_None)
+                    break;
+                ++collend;
             }
-        }
-        else if (PyUnicode_Check(x)) {
-            if (PyUnicode_GET_SIZE(x) != 1) {
-                /* 1-n mapping */
-                PyErr_SetString(PyExc_NotImplementedError,
-                                "1-n mappings are currently not implemented");
-                Py_DECREF(x);
-                goto onError;
+            /* cache callback name lookup
+             * (if not done yet, i.e. it's the first error) */
+            if (known_errorHandler==-1) {
+                if ((errors==NULL) || (!strcmp(errors, "strict")))
+                    known_errorHandler = 1;
+                else if (!strcmp(errors, "replace"))
+                    known_errorHandler = 2;
+                else if (!strcmp(errors, "ignore"))
+                    known_errorHandler = 3;
+                else if (!strcmp(errors, "xmlcharrefreplace"))
+                    known_errorHandler = 4;
+                else
+                    known_errorHandler = 0;
+            }
+            switch (known_errorHandler) {
+            case 1: /* strict */
+                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
+                goto onError;
+            case 2: /* replace */
+                /* No need to check for space, this is a 1:1 replacement */
+                for (coll = collstart; coll<collend; ++coll)
+                    *str++ = '?';
+                /* fall through */
+            case 3: /* ignore */
+                p = collend;
+                break;
+            case 4: /* xmlcharrefreplace */
+                /* generate replacement (temporarily (mis)uses p) */
+                for (p = collstart; p < collend; ++p) {
+                    char buffer[2+29+1+1];
+                    char *cp;
+                    sprintf(buffer, "&#%d;", (int)*p);
+                    /* room for what is written so far, this reference
+                       and the input that follows the error range */
+                    if (charmaptranslate_makespace(&res, &str, &ressize,
+                        (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
+                        goto onError;
+                    for (cp = buffer; *cp; ++cp)
+                        *str++ = *cp;
+                }
+                p = collend;
+                break;
+            default:
+                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
+                    reason, startp, size, &exc,
+                    collstart-startp, collend-startp, &newpos);
+                if (repunicode == NULL)
+                    goto onError;
+                /* generate replacement */
+                repsize = PyUnicode_GET_SIZE(repunicode);
+                if (charmaptranslate_makespace(&res, &str, &ressize,
+                    (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
+                    Py_DECREF(repunicode);
+                    goto onError;
+                }
+                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
+                    *str++ = *uni2;
+                /* continue at the position chosen by the handler */
+                p = startp + newpos;
+                Py_DECREF(repunicode);
             }
-            *p++ = *PyUnicode_AS_UNICODE(x);
-        }
-        else {
-            /* wrong return value */
-            PyErr_SetString(PyExc_TypeError,
-                            "translate mapping must return integer, None or unicode");
-            Py_DECREF(x);
-            goto onError;
         }
-        Py_DECREF(x);
     }
-    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
-        if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
+    /* Resize if we allocated too much */
+    respos = str-PyUnicode_AS_UNICODE(res);
+    if (respos<ressize) {
+        if (_PyUnicode_Resize(&res, respos))
             goto onError;
+    }
+    Py_XDECREF(exc);
+    Py_XDECREF(errorHandler);
+    return res;
- done:
-    return (PyObject *)v;
-
- onError:
-    Py_XDECREF(v);
+    onError:
+    Py_XDECREF(res);
+    Py_XDECREF(exc);
+    Py_XDECREF(errorHandler);
     return NULL;
 }
@@ -2772,6 +3381,13 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
const char *errors)
{
Py_UNICODE *p, *end;
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+ const char *encoding = "decimal";
+ const char *reason = "invalid decimal Unicode string";
+ /* the following variable is used for caching string comparisons
+ * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
+ int known_errorHandler = -1;
if (output == NULL) {
PyErr_BadArgument();
@@ -2781,40 +3397,110 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
p = s;
end = s + length;
while (p < end) {
- register Py_UNICODE ch = *p++;
+ register Py_UNICODE ch = *p;
int decimal;
+ PyObject *repunicode;
+ int repsize;
+ int newpos;
+ Py_UNICODE *uni2;
+ Py_UNICODE *collstart;
+ Py_UNICODE *collend;
if (Py_UNICODE_ISSPACE(ch)) {
*output++ = ' ';
+ ++p;
continue;
}
decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal >= 0) {
*output++ = '0' + decimal;
+ ++p;
continue;
}
if (0 < ch && ch < 256) {
*output++ = (char)ch;
+ ++p;
continue;
}
- /* All other characters are considered invalid */
- if (errors == NULL || strcmp(errors, "strict") == 0) {
- PyErr_SetString(PyExc_ValueError,
- "invalid decimal Unicode string");
- goto onError;
+ /* All other characters are considered unencodable */
+ collstart = p;
+ collend = p+1;
+ while (collend < end) {
+ if ((0 < *collend && *collend < 256) ||
+ !Py_UNICODE_ISSPACE(*collend) ||
+ Py_UNICODE_TODECIMAL(*collend))
+ break;
}
- else if (strcmp(errors, "ignore") == 0)
- continue;
- else if (strcmp(errors, "replace") == 0) {
- *output++ = '?';
- continue;
+ /* cache callback name lookup
+ * (if not done yet, i.e. it's the first error) */
+ if (known_errorHandler==-1) {
+ if ((errors==NULL) || (!strcmp(errors, "strict")))
+ known_errorHandler = 1;
+ else if (!strcmp(errors, "replace"))
+ known_errorHandler = 2;
+ else if (!strcmp(errors, "ignore"))
+ known_errorHandler = 3;
+ else if (!strcmp(errors, "xmlcharrefreplace"))
+ known_errorHandler = 4;
+ else
+ known_errorHandler = 0;
+ }
+ switch (known_errorHandler) {
+ case 1: /* strict */
+ raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
+ goto onError;
+ case 2: /* replace */
+ for (p = collstart; p < collend; ++p)
+ *output++ = '?';
+ /* fall through */
+ case 3: /* ignore */
+ p = collend;
+ break;
+ case 4: /* xmlcharrefreplace */
+ /* generate replacement (temporarily (mis)uses p) */
+ for (p = collstart; p < collend; ++p)
+ output += sprintf(output, "&#%d;", (int)*p);
+ p = collend;
+ break;
+ default:
+ repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
+ encoding, reason, s, length, &exc,
+ collstart-s, collend-s, &newpos);
+ if (repunicode == NULL)
+ goto onError;
+ /* generate replacement */
+ repsize = PyUnicode_GET_SIZE(repunicode);
+ for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
+ Py_UNICODE ch = *uni2;
+ if (Py_UNICODE_ISSPACE(ch))
+ *output++ = ' ';
+ else {
+ decimal = Py_UNICODE_TODECIMAL(ch);
+ if (decimal >= 0)
+ *output++ = '0' + decimal;
+ else if (0 < ch && ch < 256)
+ *output++ = (char)ch;
+ else {
+ Py_DECREF(repunicode);
+ raise_encode_exception(&exc, encoding,
+ s, length, collstart-s, collend-s, reason);
+ goto onError;
+ }
+ }
+ }
+ p = s + newpos;
+ Py_DECREF(repunicode);
}
}
/* 0-terminate the output string */
*output++ = '\0';
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
return 0;
onError:
+ Py_XDECREF(exc);
+ Py_XDECREF(errorHandler);
return -1;
}
@@ -3927,7 +4613,9 @@ PyDoc_STRVAR(encode__doc__,
Return an encoded string version of S. Default encoding is the current\n\
default string encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
-a ValueError. Other possible values are 'ignore' and 'replace'.");
+a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
+'xmlcharrefreplace' as well as any other name registered with\n\
+codecs.register_error that can handle UnicodeEncodeErrors.");
static PyObject *
unicode_encode(PyUnicodeObject *self, PyObject *args)
diff --git a/Python/codecs.c b/Python/codecs.c
index 3e54d8f..09cba75 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -422,12 +422,409 @@ PyObject *PyCodec_Decode(PyObject *object,
return NULL;
}
+static PyObject *_PyCodec_ErrorRegistry;
+
+/* Register the callable error as the error handling callback for the
+   name name. A codec calls it when it hits unencodable characters or
+   undecodable bytes and name was passed as the error parameter of the
+   encode/decode call (and the codec has no built-in handling for that
+   name).
+   Return 0 on success, -1 on error (TypeError if error is not callable) */
+int PyCodec_RegisterError(const char *name, PyObject *error)
+{
+    if (PyCallable_Check(error))
+        return PyDict_SetItemString(_PyCodec_ErrorRegistry, (char *)name, error);
+
+    PyErr_SetString(PyExc_TypeError, "handler must be callable");
+    return -1;
+}
+
+/* Return a new reference to the error handling callback registered
+   under name. Passing NULL selects the handler for "strict".
+   Returns NULL and sets a LookupError if no such handler exists. */
+PyObject *PyCodec_LookupError(const char *name)
+{
+    const char *key = (name != NULL) ? name : "strict";
+    PyObject *found = PyDict_GetItemString(_PyCodec_ErrorRegistry, (char *)key);
+
+    if (found == NULL) {
+        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", key);
+        return NULL;
+    }
+    /* the dictionary lookup is borrowed; the caller gets its own reference */
+    Py_INCREF(found);
+    return found;
+}
+
+/* Set a TypeError saying that the error callback does not know how to
+   handle an exception of exc's class. If the class, its name or the
+   string form of the name cannot be obtained, the exception raised by
+   that failure is left in place instead. */
+static void wrong_exception_type(PyObject *exc)
+{
+    PyObject *type = PyObject_GetAttrString(exc, "__class__");
+    if (type != NULL) {
+        PyObject *name = PyObject_GetAttrString(type, "__name__");
+        Py_DECREF(type);
+        if (name != NULL) {
+            PyObject *string = PyObject_Str(name);
+            Py_DECREF(name);
+            /* PyObject_Str() may fail; do not touch a NULL result */
+            if (string != NULL) {
+                PyErr_Format(PyExc_TypeError, "don't know how to handle %.400s in error callback",
+                    PyString_AS_STRING(string));
+                Py_DECREF(string);
+            }
+        }
+    }
+}
+
+/* "strict" error handler: raise exc itself (an instance object is set as
+   the current exception under its class). Anything that is not an
+   instance yields a TypeError. Always returns NULL. */
+PyObject *PyCodec_StrictErrors(PyObject *exc)
+{
+    if (!PyInstance_Check(exc)) {
+        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
+        return NULL;
+    }
+    PyErr_SetObject((PyObject*)((PyInstanceObject*)exc)->in_class,
+        exc);
+    return NULL;
+}
+
+
+/* "ignore" error handler: return the tuple (u"", end), where end is the
+   end position stored in the Unicode encode/decode/translate error exc,
+   so the offending input is skipped. For any other exception type a
+   TypeError is set and NULL is returned. */
+PyObject *PyCodec_IgnoreErrors(PyObject *exc)
+{
+    int end;
+    int failed;
+
+    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError))
+        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
+    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError))
+        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
+    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError))
+        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
+    else {
+        wrong_exception_type(exc);
+        return NULL;
+    }
+    if (failed)
+        return NULL;
+    /* a non-NULL pointer with length 0 is passed on purpose: a NULL
+       pointer would give None instead of u'' */
+    return Py_BuildValue("(u#i)", &end, 0, end);
+}
+
+
+/* "replace" error handler. Returns a (replacement, end) tuple:
+     UnicodeEncodeError    - one '?' per character in start..end
+     UnicodeDecodeError    - a single Py_UNICODE_REPLACEMENT_CHARACTER
+     UnicodeTranslateError - one Py_UNICODE_REPLACEMENT_CHARACTER per
+                             character in start..end
+   For any other exception type a TypeError is set and NULL is returned. */
+PyObject *PyCodec_ReplaceErrors(PyObject *exc)
+{
+    PyObject *result;
+    PyObject *fill;
+    Py_UNICODE *out;
+    Py_UNICODE fillchar;
+    int first;
+    int last;
+    int remaining;
+
+    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+        if (PyUnicodeEncodeError_GetStart(exc, &first) ||
+            PyUnicodeEncodeError_GetEnd(exc, &last))
+            return NULL;
+        fillchar = '?';
+    }
+    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
+        /* decoding always yields exactly one replacement character */
+        Py_UNICODE single = Py_UNICODE_REPLACEMENT_CHARACTER;
+        if (PyUnicodeDecodeError_GetEnd(exc, &last))
+            return NULL;
+        return Py_BuildValue("(u#i)", &single, 1, last);
+    }
+    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
+        if (PyUnicodeTranslateError_GetStart(exc, &first) ||
+            PyUnicodeTranslateError_GetEnd(exc, &last))
+            return NULL;
+        fillchar = Py_UNICODE_REPLACEMENT_CHARACTER;
+    }
+    else {
+        wrong_exception_type(exc);
+        return NULL;
+    }
+
+    /* encode and translate errors: one fill character per input character */
+    fill = PyUnicode_FromUnicode(NULL, last-first);
+    if (fill == NULL)
+        return NULL;
+    out = PyUnicode_AS_UNICODE(fill);
+    for (remaining = last-first; remaining > 0; --remaining)
+        *out++ = fillchar;
+    result = Py_BuildValue("(Oi)", fill, last);
+    Py_DECREF(fill);
+    return result;
+}
+
+PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
+{ /* "xmlcharrefreplace" error handler: for a UnicodeEncodeError, return
+ the tuple (replacement, end) in which every character of the range
+ start..end of the exception's object is written as a decimal
+ character reference "&#N;". For any other exception type
+ wrong_exception_type() is called and NULL is returned. */
+ if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+ PyObject *restuple;
+ PyObject *object;
+ int start;
+ int end;
+ PyObject *res;
+ Py_UNICODE *p;
+ Py_UNICODE *startp;
+ Py_UNICODE *outp;
+ int ressize;
+ if (PyUnicodeEncodeError_GetStart(exc, &start))
+ return NULL;
+ if (PyUnicodeEncodeError_GetEnd(exc, &end))
+ return NULL;
+ if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+ return NULL;
+ startp = PyUnicode_AS_UNICODE(object);
+ /* first pass: size of the result, "&#" + decimal digits + ";" per character */
+ for (p = startp+start, ressize = 0; p < startp+end; ++p) {
+ if (*p<10)
+ ressize += 2+1+1;
+ else if (*p<100)
+ ressize += 2+2+1;
+ else if (*p<1000)
+ ressize += 2+3+1;
+ else if (*p<10000)
+ ressize += 2+4+1;
+ else if (*p<100000)
+ ressize += 2+5+1;
+ else if (*p<1000000)
+ ressize += 2+6+1;
+ else
+ ressize += 2+7+1;
+ }
+ /* allocate replacement */
+ res = PyUnicode_FromUnicode(NULL, ressize);
+ if (res == NULL) {
+ Py_DECREF(object);
+ return NULL;
+ }
+ /* generate replacement */
+ for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
+ p < startp+end; ++p) {
+ Py_UNICODE c = *p;
+ int digits;
+ int base;
+ *outp++ = '&';
+ *outp++ = '#';
+ /* digits = number of decimal digits, base = value of the leading digit's place */
+ if (*p<10) {
+ digits = 1;
+ base = 1;
+ }
+ else if (*p<100) {
+ digits = 2;
+ base = 10;
+ }
+ else if (*p<1000) {
+ digits = 3;
+ base = 100;
+ }
+ else if (*p<10000) {
+ digits = 4;
+ base = 1000;
+ }
+ else if (*p<100000) {
+ digits = 5;
+ base = 10000;
+ }
+ else if (*p<1000000) {
+ digits = 6;
+ base = 100000;
+ }
+ else {
+ digits = 7;
+ base = 1000000;
+ }
+ /* emit the digits from most to least significant */
+ while (digits-->0) {
+ *outp++ = '0' + c/base;
+ c %= base;
+ base /= 10;
+ }
+ *outp++ = ';';
+ }
+ restuple = Py_BuildValue("(Oi)", res, end);
+ Py_DECREF(res);
+ Py_DECREF(object);
+ return restuple;
+ }
+ else {
+ wrong_exception_type(exc);
+ return NULL;
+ }
+}
+
+static Py_UNICODE hexdigits[] = { /* lowercase hex digit for each value 0..15; indexed with a 4-bit nibble */
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
+};
+
+/* "backslashreplace" error handler: for a UnicodeEncodeError, return the
+   tuple (replacement, end) in which every character of the range
+   start..end of the exception's object is written as a backslash escape:
+   \xXX for values below 0x100, \uXXXX for values below 0x10000 and
+   \UXXXXXXXX otherwise. For any other exception type
+   wrong_exception_type() is called and NULL is returned. */
+PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
+{
+    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+        PyObject *restuple;
+        PyObject *object;
+        int start;
+        int end;
+        PyObject *res;
+        Py_UNICODE *p;
+        Py_UNICODE *startp;
+        Py_UNICODE *outp;
+        int ressize;
+        if (PyUnicodeEncodeError_GetStart(exc, &start))
+            return NULL;
+        if (PyUnicodeEncodeError_GetEnd(exc, &end))
+            return NULL;
+        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+            return NULL;
+        startp = PyUnicode_AS_UNICODE(object);
+        /* first pass: backslash + type letter + hex digits per character */
+        for (p = startp+start, ressize = 0; p < startp+end; ++p) {
+            if (*p >= 0x00010000)
+                ressize += 1+1+8;
+            else if (*p >= 0x100) {
+                ressize += 1+1+4;
+            }
+            else
+                ressize += 1+1+2;
+        }
+        res = PyUnicode_FromUnicode(NULL, ressize);
+        if (res==NULL) {
+            /* object is our own reference (it is released on the
+               success path below), so release it here as well */
+            Py_DECREF(object);
+            return NULL;
+        }
+        for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
+            p < startp+end; ++p) {
+            Py_UNICODE c = *p;
+            *outp++ = '\\';
+            if (c >= 0x00010000) {
+                *outp++ = 'U';
+                *outp++ = hexdigits[(c>>28)&0xf];
+                *outp++ = hexdigits[(c>>24)&0xf];
+                *outp++ = hexdigits[(c>>20)&0xf];
+                *outp++ = hexdigits[(c>>16)&0xf];
+                *outp++ = hexdigits[(c>>12)&0xf];
+                *outp++ = hexdigits[(c>>8)&0xf];
+            }
+            else if (c >= 0x100) {
+                *outp++ = 'u';
+                *outp++ = hexdigits[(c>>12)&0xf];
+                *outp++ = hexdigits[(c>>8)&0xf];
+            }
+            else
+                *outp++ = 'x';
+            /* the two low-order hex digits are common to all three forms */
+            *outp++ = hexdigits[(c>>4)&0xf];
+            *outp++ = hexdigits[c&0xf];
+        }
+
+        restuple = Py_BuildValue("(Oi)", res, end);
+        Py_DECREF(res);
+        Py_DECREF(object);
+        return restuple;
+    }
+    else {
+        wrong_exception_type(exc);
+        return NULL;
+    }
+}
+
+static PyObject *strict_errors(PyObject *self, PyObject *exc)
+{ /* callable wrapper around PyCodec_StrictErrors; self is unused */
+ return PyCodec_StrictErrors(exc);
+}
+
+
+static PyObject *ignore_errors(PyObject *self, PyObject *exc)
+{ /* callable wrapper around PyCodec_IgnoreErrors; self is unused */
+ return PyCodec_IgnoreErrors(exc);
+}
+
+
+static PyObject *replace_errors(PyObject *self, PyObject *exc)
+{ /* callable wrapper around PyCodec_ReplaceErrors; self is unused */
+ return PyCodec_ReplaceErrors(exc);
+}
+
+
+static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
+{ /* callable wrapper around PyCodec_XMLCharRefReplaceErrors; self is unused */
+ return PyCodec_XMLCharRefReplaceErrors(exc);
+}
+
+
+static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
+{ /* callable wrapper around PyCodec_BackslashReplaceErrors; self is unused */
+ return PyCodec_BackslashReplaceErrors(exc);
+}
+
+
void _PyCodecRegistry_Init(void)
{
+ static struct {
+ char *name;
+ PyMethodDef def;
+ } methods[] =
+ {
+ {
+ "strict",
+ {
+ "strict_errors",
+ strict_errors,
+ METH_O
+ }
+ },
+ {
+ "ignore",
+ {
+ "ignore_errors",
+ ignore_errors,
+ METH_O
+ }
+ },
+ {
+ "replace",
+ {
+ "replace_errors",
+ replace_errors,
+ METH_O
+ }
+ },
+ {
+ "xmlcharrefreplace",
+ {
+ "xmlcharrefreplace_errors",
+ xmlcharrefreplace_errors,
+ METH_O
+ }
+ },
+ {
+ "backslashreplace",
+ {
+ "backslashreplace_errors",
+ backslashreplace_errors,
+ METH_O
+ }
+ }
+ };
if (_PyCodec_SearchPath == NULL)
_PyCodec_SearchPath = PyList_New(0);
if (_PyCodec_SearchCache == NULL)
_PyCodec_SearchCache = PyDict_New();
+ if (_PyCodec_ErrorRegistry == NULL) {
+ int i;
+ _PyCodec_ErrorRegistry = PyDict_New();
+
+ if (_PyCodec_ErrorRegistry) {
+ for (i = 0; i < 5; ++i) {
+ PyObject *func = PyCFunction_New(&methods[i].def, NULL);
+ int res;
+ if (!func)
+ Py_FatalError("can't initialize codec error registry");
+ res = PyCodec_RegisterError(methods[i].name, func);
+ Py_DECREF(func);
+ if (res)
+ Py_FatalError("can't initialize codec error registry");
+ }
+ }
+ }
if (_PyCodec_SearchPath == NULL ||
_PyCodec_SearchCache == NULL)
Py_FatalError("can't initialize codec registry");
@@ -439,4 +836,6 @@ void _PyCodecRegistry_Fini(void)
_PyCodec_SearchPath = NULL;
Py_XDECREF(_PyCodec_SearchCache);
_PyCodec_SearchCache = NULL;
+ Py_XDECREF(_PyCodec_ErrorRegistry);
+ _PyCodec_ErrorRegistry = NULL;
}
diff --git a/Python/exceptions.c b/Python/exceptions.c
index c4bd626..1667cd9 100644
--- a/Python/exceptions.c
+++ b/Python/exceptions.c
@@ -100,6 +100,10 @@ Exception\n\
| +-- ValueError\n\
| | |\n\
| | +-- UnicodeError\n\
+ | | |\n\
+ | | +-- UnicodeEncodeError\n\
+ | | +-- UnicodeDecodeError\n\
+ | | +-- UnicodeTranslateError\n\
| |\n\
| +-- ReferenceError\n\
| +-- SystemError\n\
@@ -840,6 +844,590 @@ static PyMethodDef SyntaxError_methods[] = {
};
+/* Read the attribute `name` of `exc` and store it in *value.
+ * Returns 0 on success.  Returns -1 if the attribute lookup fails, or if
+ * the attribute is not an int (a TypeError is set in that case). */
+static
+int get_int(PyObject *exc, const char *name, int *value)
+{
+    int status = -1;
+    PyObject *found = PyObject_GetAttrString(exc, (char *)name);
+
+    if (found == NULL)
+        return status;
+    if (PyInt_Check(found)) {
+        *value = PyInt_AS_LONG(found);
+        status = 0;
+    }
+    else
+        PyErr_Format(PyExc_TypeError, "%s attribute must be int", name);
+    /* The attribute reference is released on both paths. */
+    Py_DECREF(found);
+    return status;
+}
+
+
+/* Store `value` as an int object in the attribute `name` of `exc`.
+ * Returns -1 if the int object cannot be created, otherwise the result
+ * of PyObject_SetAttrString(). */
+static
+int set_int(PyObject *exc, const char *name, int value)
+{
+    int status = -1;
+    PyObject *boxed = PyInt_FromLong(value);
+
+    if (boxed != NULL) {
+        status = PyObject_SetAttrString(exc, (char *)name, boxed);
+        /* Drop the local reference created above. */
+        Py_DECREF(boxed);
+    }
+    return status;
+}
+
+
+/* Return the attribute `name` of `exc` if it is a str object.
+ * Returns NULL if the lookup fails, or if the attribute has another type
+ * (a TypeError is set in that case).  The reference obtained from
+ * PyObject_GetAttrString() is handed to the caller unchanged. */
+static
+PyObject *get_string(PyObject *exc, const char *name)
+{
+    PyObject *found = PyObject_GetAttrString(exc, (char *)name);
+
+    if (found != NULL && !PyString_Check(found)) {
+        PyErr_Format(PyExc_TypeError, "%s attribute must be str", name);
+        Py_DECREF(found);
+        found = NULL;
+    }
+    return found;
+}
+
+
+/* Store the C string `value` as a str object in the attribute `name` of
+ * `exc`.  Returns -1 if the str object cannot be created, otherwise the
+ * result of PyObject_SetAttrString(). */
+static
+int set_string(PyObject *exc, const char *name, const char *value)
+{
+    int status = -1;
+    PyObject *text = PyString_FromString(value);
+
+    if (text != NULL) {
+        status = PyObject_SetAttrString(exc, (char *)name, text);
+        /* Drop the local reference created above. */
+        Py_DECREF(text);
+    }
+    return status;
+}
+
+
+/* Return the attribute `name` of `exc` if it is a unicode object.
+ * Returns NULL if the lookup fails, or if the attribute has another type
+ * (a TypeError is set in that case).  The reference obtained from
+ * PyObject_GetAttrString() is handed to the caller unchanged. */
+static
+PyObject *get_unicode(PyObject *exc, const char *name)
+{
+    PyObject *found = PyObject_GetAttrString(exc, (char *)name);
+
+    if (found != NULL && !PyUnicode_Check(found)) {
+        PyErr_Format(PyExc_TypeError, "%s attribute must be unicode", name);
+        Py_DECREF(found);
+        found = NULL;
+    }
+    return found;
+}
+
+PyObject * PyUnicodeEncodeError_GetEncoding(PyObject *exc)
+{ /* Returns exc.encoding when it is a str; NULL (with an exception set by get_string) otherwise. */
+ return get_string(exc, "encoding");
+}
+
+PyObject * PyUnicodeDecodeError_GetEncoding(PyObject *exc)
+{ /* Returns exc.encoding when it is a str; NULL (with an exception set by get_string) otherwise. */
+ return get_string(exc, "encoding");
+}
+
+PyObject * PyUnicodeTranslateError_GetEncoding(PyObject *exc)
+{ /* Returns exc.encoding when it is a str, else NULL. NOTE(review): UnicodeTranslateError__init__ in this file never sets "encoding" -- confirm this accessor is intended. */
+ return get_string(exc, "encoding");
+}
+
+PyObject *PyUnicodeEncodeError_GetObject(PyObject *exc)
+{ /* Returns exc.object when it is a unicode object; NULL (with an exception set by get_unicode) otherwise. */
+ return get_unicode(exc, "object");
+}
+
+PyObject *PyUnicodeDecodeError_GetObject(PyObject *exc)
+{ /* Returns exc.object when it is a str (the undecodable input); NULL (with an exception set by get_string) otherwise. */
+ return get_string(exc, "object");
+}
+
+PyObject *PyUnicodeTranslateError_GetObject(PyObject *exc)
+{ /* Returns exc.object when it is a unicode object; NULL (with an exception set by get_unicode) otherwise. */
+ return get_unicode(exc, "object");
+}
+
+/* Fetch the "start" attribute of `exc` into *start, clamped against the
+ * size of the unicode "object" attribute: negative values become 0 and
+ * values >= size become size-1 (so an empty object yields -1).
+ * Returns 0 on success, -1 if either attribute cannot be read. */
+int PyUnicodeEncodeError_GetStart(PyObject *exc, int *start)
+{
+    PyObject *text;
+    int limit;
+
+    if (get_int(exc, "start", start) != 0)
+        return -1;
+    text = PyUnicodeEncodeError_GetObject(exc);
+    if (text == NULL)
+        return -1;
+    limit = PyUnicode_GET_SIZE(text);
+    if (*start < 0)
+        *start = 0;
+    if (*start >= limit)
+        *start = limit - 1;
+    Py_DECREF(text);
+    return 0;
+}
+
+
+/* Fetch the "start" attribute of `exc` into *start, clamped against the
+ * size of the str "object" attribute: negative values become 0 and
+ * values >= size become size-1 (so an empty object yields -1).
+ * Returns 0 on success, -1 if either attribute cannot be read. */
+int PyUnicodeDecodeError_GetStart(PyObject *exc, int *start)
+{
+    PyObject *bytes;
+    int limit;
+
+    if (get_int(exc, "start", start) != 0)
+        return -1;
+    bytes = PyUnicodeDecodeError_GetObject(exc);
+    if (bytes == NULL)
+        return -1;
+    limit = PyString_GET_SIZE(bytes);
+    if (*start < 0)
+        *start = 0;
+    if (*start >= limit)
+        *start = limit - 1;
+    Py_DECREF(bytes);
+    return 0;
+}
+
+
+int PyUnicodeTranslateError_GetStart(PyObject *exc, int *start)
+{ /* Same logic as the encode variant: reads "start" and clamps it against the unicode "object" attribute. */
+ return PyUnicodeEncodeError_GetStart(exc, start);
+}
+
+
+int PyUnicodeEncodeError_SetStart(PyObject *exc, int start)
+{ /* Stores start as the int attribute "start"; returns set_int()'s result (-1 if the int object cannot be created). */
+ return set_int(exc, "start", start);
+}
+
+
+int PyUnicodeDecodeError_SetStart(PyObject *exc, int start)
+{ /* Stores start as the int attribute "start"; returns set_int()'s result (-1 if the int object cannot be created). */
+ return set_int(exc, "start", start);
+}
+
+
+int PyUnicodeTranslateError_SetStart(PyObject *exc, int start)
+{ /* Stores start as the int attribute "start"; returns set_int()'s result (-1 if the int object cannot be created). */
+ return set_int(exc, "start", start);
+}
+
+
+/* Fetch the "end" attribute of `exc` into *end, clamped against the size
+ * of the unicode "object" attribute: values < 1 become 1 and values
+ * > size become size (so an empty object yields 0).
+ * Returns 0 on success, -1 if either attribute cannot be read. */
+int PyUnicodeEncodeError_GetEnd(PyObject *exc, int *end)
+{
+    PyObject *text;
+    int limit;
+
+    if (get_int(exc, "end", end) != 0)
+        return -1;
+    text = PyUnicodeEncodeError_GetObject(exc);
+    if (text == NULL)
+        return -1;
+    limit = PyUnicode_GET_SIZE(text);
+    if (*end < 1)
+        *end = 1;
+    if (*end > limit)
+        *end = limit;
+    Py_DECREF(text);
+    return 0;
+}
+
+
+/* Fetch the "end" attribute of `exc` into *end, clamped against the size
+ * of the str "object" attribute: values < 1 become 1 and values > size
+ * become size (so an empty object yields 0).
+ * Returns 0 on success, -1 if either attribute cannot be read. */
+int PyUnicodeDecodeError_GetEnd(PyObject *exc, int *end)
+{
+    PyObject *bytes;
+    int limit;
+
+    if (get_int(exc, "end", end) != 0)
+        return -1;
+    bytes = PyUnicodeDecodeError_GetObject(exc);
+    if (bytes == NULL)
+        return -1;
+    limit = PyString_GET_SIZE(bytes);
+    if (*end < 1)
+        *end = 1;
+    if (*end > limit)
+        *end = limit;
+    Py_DECREF(bytes);
+    return 0;
+}
+
+
+int PyUnicodeTranslateError_GetEnd(PyObject *exc, int *start)
+{
+ return PyUnicodeEncodeError_GetEnd(exc, start);
+}
+
+
+int PyUnicodeEncodeError_SetEnd(PyObject *exc, int end)
+{ /* Stores end as the int attribute "end"; returns set_int()'s result (-1 if the int object cannot be created). */
+ return set_int(exc, "end", end);
+}
+
+
+int PyUnicodeDecodeError_SetEnd(PyObject *exc, int end)
+{ /* Stores end as the int attribute "end"; returns set_int()'s result (-1 if the int object cannot be created). */
+ return set_int(exc, "end", end);
+}
+
+
+int PyUnicodeTranslateError_SetEnd(PyObject *exc, int end)
+{ /* Stores end as the int attribute "end"; returns set_int()'s result (-1 if the int object cannot be created). */
+ return set_int(exc, "end", end);
+}
+
+
+PyObject *PyUnicodeEncodeError_GetReason(PyObject *exc)
+{ /* Returns exc.reason when it is a str; NULL (with an exception set by get_string) otherwise. */
+ return get_string(exc, "reason");
+}
+
+
+PyObject *PyUnicodeDecodeError_GetReason(PyObject *exc)
+{ /* Returns exc.reason when it is a str; NULL (with an exception set by get_string) otherwise. */
+ return get_string(exc, "reason");
+}
+
+
+PyObject *PyUnicodeTranslateError_GetReason(PyObject *exc)
+{ /* Returns exc.reason when it is a str; NULL (with an exception set by get_string) otherwise. */
+ return get_string(exc, "reason");
+}
+
+
+int PyUnicodeEncodeError_SetReason(PyObject *exc, const char *reason)
+{ /* Stores a new str built from reason as the attribute "reason"; returns set_string()'s result. */
+ return set_string(exc, "reason", reason);
+}
+
+
+int PyUnicodeDecodeError_SetReason(PyObject *exc, const char *reason)
+{ /* Stores a new str built from reason as the attribute "reason"; returns set_string()'s result. */
+ return set_string(exc, "reason", reason);
+}
+
+
+int PyUnicodeTranslateError_SetReason(PyObject *exc, const char *reason)
+{ /* Stores a new str built from reason as the attribute "reason"; returns set_string()'s result. */
+ return set_string(exc, "reason", reason);
+}
+
+
+/* Shared __init__ implementation for UnicodeEncodeError and
+ * UnicodeDecodeError.
+ *
+ * `args` is the full argument tuple including the instance; the instance
+ * is extracted with get_self() and the remaining items must be
+ * (encoding: str, object: objecttype, start: int, end: int, reason: str).
+ * The sliced tuple is stored as "args" and each item under its own
+ * attribute name.  Returns None on success, NULL with an exception set
+ * on failure. */
+static PyObject *
+UnicodeError__init__(PyObject *self, PyObject *args, PyTypeObject *objecttype)
+{
+    PyObject *rtnval = NULL;
+    PyObject *encoding;
+    PyObject *object;
+    PyObject *start;
+    PyObject *end;
+    PyObject *reason;
+
+    if (!(self = get_self(args)))
+        return NULL;
+
+    /* From here on `args` is a new reference to the slice without self
+     * and must be released on every exit path. */
+    if (!(args = PySequence_GetSlice(args, 1, PySequence_Size(args))))
+        return NULL;
+
+    if (!PyArg_ParseTuple(args, "O!O!O!O!O!",
+        &PyString_Type, &encoding,
+        objecttype, &object,
+        &PyInt_Type, &start,
+        &PyInt_Type, &end,
+        &PyString_Type, &reason))
+        goto finally; /* was "return NULL", which leaked the slice */
+
+    if (PyObject_SetAttrString(self, "args", args))
+        goto finally;
+
+    if (PyObject_SetAttrString(self, "encoding", encoding))
+        goto finally;
+    if (PyObject_SetAttrString(self, "object", object))
+        goto finally;
+    if (PyObject_SetAttrString(self, "start", start))
+        goto finally;
+    if (PyObject_SetAttrString(self, "end", end))
+        goto finally;
+    if (PyObject_SetAttrString(self, "reason", reason))
+        goto finally;
+
+    Py_INCREF(Py_None);
+    rtnval = Py_None;
+
+  finally:
+    Py_DECREF(args);
+    return rtnval;
+}
+
+
+static PyObject *
+UnicodeEncodeError__init__(PyObject *self, PyObject *args)
+{ /* Shared initializer, requiring the "object" argument to be a unicode object. */
+ return UnicodeError__init__(self, args, &PyUnicode_Type);
+}
+
+/* __str__ for UnicodeEncodeError (METH_O: the instance arrives in `arg`).
+ *
+ * Builds "'<encoding>' codec can't encode character ... in position N:
+ * <reason>" when exactly one character is affected, otherwise the
+ * "characters in position N-M" form.  Returns a new str, or NULL with an
+ * exception set if any attribute is missing or has the wrong type. */
+static PyObject *
+UnicodeEncodeError__str__(PyObject *self, PyObject *arg)
+{
+    PyObject *encodingObj = NULL;
+    PyObject *objectObj = NULL;
+    int length;
+    int start;
+    int end;
+    PyObject *reasonObj = NULL;
+    char buffer[1000];
+    PyObject *result = NULL;
+
+    self = arg;
+
+    if (!(encodingObj = PyUnicodeEncodeError_GetEncoding(self)))
+        goto error;
+
+    if (!(objectObj = PyUnicodeEncodeError_GetObject(self)))
+        goto error;
+
+    length = PyUnicode_GET_SIZE(objectObj);
+
+    if (PyUnicodeEncodeError_GetStart(self, &start))
+        goto error;
+
+    if (PyUnicodeEncodeError_GetEnd(self, &end))
+        goto error;
+
+    if (!(reasonObj = PyUnicodeEncodeError_GetReason(self)))
+        goto error;
+
+    /* For an empty object the getters clamp to start == -1 and end == 0,
+     * which satisfies end == start+1; the range check keeps the character
+     * lookup below inside the buffer. */
+    if (end==start+1 && start>=0 && start<length) {
+        PyOS_snprintf(buffer, sizeof(buffer),
+            "'%.400s' codec can't encode character '\\u%x' in position %d: %.400s",
+            PyString_AS_STRING(encodingObj),
+            (int)PyUnicode_AS_UNICODE(objectObj)[start],
+            start,
+            PyString_AS_STRING(reasonObj)
+        );
+    }
+    else {
+        PyOS_snprintf(buffer, sizeof(buffer),
+            "'%.400s' codec can't encode characters in position %d-%d: %.400s",
+            PyString_AS_STRING(encodingObj),
+            start,
+            end-1,
+            PyString_AS_STRING(reasonObj)
+        );
+    }
+    result = PyString_FromString(buffer);
+
+error:
+    Py_XDECREF(reasonObj);
+    Py_XDECREF(objectObj);
+    Py_XDECREF(encodingObj);
+    return result;
+}
+
+static PyMethodDef UnicodeEncodeError_methods[] = { /* method table referenced by the UnicodeEncodeError exception table entry */
+ {"__init__", UnicodeEncodeError__init__, METH_VARARGS}, /* args tuple carries the instance first, then the five fields */
+ {"__str__", UnicodeEncodeError__str__, METH_O}, /* formats the error message */
+ {NULL, NULL} /* sentinel */
+};
+
+
+PyObject * PyUnicodeEncodeError_Create(
+ const char *encoding, const Py_UNICODE *object, int length,
+ int start, int end, const char *reason)
+{ /* Calls the PyExc_UnicodeEncodeError class with (encoding, object/length as "u#", start, end, reason) and returns the call result. */
+ return PyObject_CallFunction(PyExc_UnicodeEncodeError, "su#iis",
+ encoding, object, length, start, end, reason);
+}
+
+
+static PyObject *
+UnicodeDecodeError__init__(PyObject *self, PyObject *args)
+{ /* Shared initializer, requiring the "object" argument to be a str object. */
+ return UnicodeError__init__(self, args, &PyString_Type);
+}
+
+/* __str__ for UnicodeDecodeError (METH_O: the instance arrives in `arg`).
+ *
+ * Builds "'<encoding>' codec can't decode byte 0x.. in position N:
+ * <reason>" when exactly one byte is affected, otherwise the
+ * "bytes in position N-M" form.  Returns a new str, or NULL with an
+ * exception set if any attribute is missing or has the wrong type. */
+static PyObject *
+UnicodeDecodeError__str__(PyObject *self, PyObject *arg)
+{
+    PyObject *encodingObj = NULL;
+    PyObject *objectObj = NULL;
+    int length;
+    int start;
+    int end;
+    PyObject *reasonObj = NULL;
+    char buffer[1000];
+    PyObject *result = NULL;
+
+    self = arg;
+
+    if (!(encodingObj = PyUnicodeDecodeError_GetEncoding(self)))
+        goto error;
+
+    if (!(objectObj = PyUnicodeDecodeError_GetObject(self)))
+        goto error;
+
+    length = PyString_GET_SIZE(objectObj);
+
+    if (PyUnicodeDecodeError_GetStart(self, &start))
+        goto error;
+
+    if (PyUnicodeDecodeError_GetEnd(self, &end))
+        goto error;
+
+    if (!(reasonObj = PyUnicodeDecodeError_GetReason(self)))
+        goto error;
+
+    /* For an empty object the getters clamp to start == -1 and end == 0,
+     * which satisfies end == start+1; the range check keeps the byte
+     * lookup below inside the buffer. */
+    if (end==start+1 && start>=0 && start<length) {
+        PyOS_snprintf(buffer, sizeof(buffer),
+            "'%.400s' codec can't decode byte 0x%x in position %d: %.400s",
+            PyString_AS_STRING(encodingObj),
+            ((int)PyString_AS_STRING(objectObj)[start])&0xff,
+            start,
+            PyString_AS_STRING(reasonObj)
+        );
+    }
+    else {
+        PyOS_snprintf(buffer, sizeof(buffer),
+            "'%.400s' codec can't decode bytes in position %d-%d: %.400s",
+            PyString_AS_STRING(encodingObj),
+            start,
+            end-1,
+            PyString_AS_STRING(reasonObj)
+        );
+    }
+    result = PyString_FromString(buffer);
+
+error:
+    Py_XDECREF(reasonObj);
+    Py_XDECREF(objectObj);
+    Py_XDECREF(encodingObj);
+    return result;
+}
+
+static PyMethodDef UnicodeDecodeError_methods[] = { /* method table referenced by the UnicodeDecodeError exception table entry */
+ {"__init__", UnicodeDecodeError__init__, METH_VARARGS}, /* args tuple carries the instance first, then the five fields */
+ {"__str__", UnicodeDecodeError__str__, METH_O}, /* formats the error message */
+ {NULL, NULL} /* sentinel */
+};
+
+
+PyObject * PyUnicodeDecodeError_Create(
+ const char *encoding, const char *object, int length,
+ int start, int end, const char *reason)
+{ /* Calls the PyExc_UnicodeDecodeError class with (encoding, object/length as "s#", start, end, reason) and returns the call result. */
+ return PyObject_CallFunction(PyExc_UnicodeDecodeError, "ss#iis",
+ encoding, object, length, start, end, reason);
+}
+
+
+/* __init__ for UnicodeTranslateError.
+ *
+ * `args` is the full argument tuple including the instance; after the
+ * instance is taken off with get_self(), the remaining items must be
+ * (object: unicode, start: int, end: int, reason: str).  The sliced
+ * tuple is stored as "args" and each item under its own attribute name.
+ * Returns None on success, NULL with an exception set on failure. */
+static PyObject *
+UnicodeTranslateError__init__(PyObject *self, PyObject *args)
+{
+    PyObject *object;
+    PyObject *start;
+    PyObject *end;
+    PyObject *reason;
+    PyObject *outcome = NULL;
+
+    self = get_self(args);
+    if (self == NULL)
+        return NULL;
+
+    /* `args` now owns a new reference to the slice without self. */
+    args = PySequence_GetSlice(args, 1, PySequence_Size(args));
+    if (args == NULL)
+        return NULL;
+
+    if (!PyArg_ParseTuple(args, "O!O!O!O!",
+        &PyUnicode_Type, &object,
+        &PyInt_Type, &start,
+        &PyInt_Type, &end,
+        &PyString_Type, &reason))
+        goto finally;
+
+    /* Short-circuit evaluation keeps the stores in order and stops at the
+     * first failure. */
+    if (PyObject_SetAttrString(self, "args", args) ||
+        PyObject_SetAttrString(self, "object", object) ||
+        PyObject_SetAttrString(self, "start", start) ||
+        PyObject_SetAttrString(self, "end", end) ||
+        PyObject_SetAttrString(self, "reason", reason))
+        goto finally;
+
+    outcome = Py_None;
+    Py_INCREF(outcome);
+
+  finally:
+    Py_DECREF(args);
+    return outcome;
+}
+
+
+/* __str__ for UnicodeTranslateError (METH_O: the instance arrives in
+ * `arg`).
+ *
+ * Builds "can't translate character ... in position N: <reason>" when
+ * exactly one character is affected, otherwise the "characters in
+ * position N-M" form.  Returns a new str, or NULL with an exception set
+ * if any attribute is missing or has the wrong type. */
+static PyObject *
+UnicodeTranslateError__str__(PyObject *self, PyObject *arg)
+{
+    PyObject *objectObj = NULL;
+    int length;
+    int start;
+    int end;
+    PyObject *reasonObj = NULL;
+    char buffer[1000];
+    PyObject *result = NULL;
+
+    self = arg;
+
+    if (!(objectObj = PyUnicodeTranslateError_GetObject(self)))
+        goto error;
+
+    length = PyUnicode_GET_SIZE(objectObj);
+
+    if (PyUnicodeTranslateError_GetStart(self, &start))
+        goto error;
+
+    if (PyUnicodeTranslateError_GetEnd(self, &end))
+        goto error;
+
+    if (!(reasonObj = PyUnicodeTranslateError_GetReason(self)))
+        goto error;
+
+    /* For an empty object the getters clamp to start == -1 and end == 0,
+     * which satisfies end == start+1; the range check keeps the character
+     * lookup below inside the buffer. */
+    if (end==start+1 && start>=0 && start<length) {
+        PyOS_snprintf(buffer, sizeof(buffer),
+            "can't translate character '\\u%x' in position %d: %.400s",
+            (int)PyUnicode_AS_UNICODE(objectObj)[start],
+            start,
+            PyString_AS_STRING(reasonObj)
+        );
+    }
+    else {
+        PyOS_snprintf(buffer, sizeof(buffer),
+            "can't translate characters in position %d-%d: %.400s",
+            start,
+            end-1,
+            PyString_AS_STRING(reasonObj)
+        );
+    }
+    result = PyString_FromString(buffer);
+
+error:
+    Py_XDECREF(reasonObj);
+    Py_XDECREF(objectObj);
+    return result;
+}
+
+static PyMethodDef UnicodeTranslateError_methods[] = { /* method table referenced by the UnicodeTranslateError exception table entry */
+ {"__init__", UnicodeTranslateError__init__, METH_VARARGS}, /* args tuple carries the instance first, then the four fields */
+ {"__str__", UnicodeTranslateError__str__, METH_O}, /* formats the error message */
+ {NULL, NULL} /* sentinel */
+};
+
+
+PyObject * PyUnicodeTranslateError_Create(
+ const Py_UNICODE *object, int length,
+ int start, int end, const char *reason)
+{ /* Calls the PyExc_UnicodeTranslateError class with (object/length as "u#", start, end, reason) and returns the call result. */
+ return PyObject_CallFunction(PyExc_UnicodeTranslateError, "u#iis",
+ object, length, start, end, reason);
+}
+
+
/* Exception doc strings */
@@ -865,6 +1453,12 @@ PyDoc_STRVAR(ValueError__doc__,
PyDoc_STRVAR(UnicodeError__doc__, "Unicode related error.");
+PyDoc_STRVAR(UnicodeEncodeError__doc__, "Unicode encoding error."); /* docstring for the UnicodeEncodeError exception table entry */
+
+PyDoc_STRVAR(UnicodeDecodeError__doc__, "Unicode decoding error."); /* docstring for the UnicodeDecodeError exception table entry */
+
+PyDoc_STRVAR(UnicodeTranslateError__doc__, "Unicode translation error."); /* docstring for the UnicodeTranslateError exception table entry */
+
PyDoc_STRVAR(SystemError__doc__,
"Internal error in the Python interpreter.\n\
\n\
@@ -949,6 +1543,9 @@ PyObject *PyExc_SystemError;
PyObject *PyExc_SystemExit;
PyObject *PyExc_UnboundLocalError;
PyObject *PyExc_UnicodeError;
+PyObject *PyExc_UnicodeEncodeError; /* listed in the exception table with base PyExc_UnicodeError */
+PyObject *PyExc_UnicodeDecodeError; /* listed in the exception table with base PyExc_UnicodeError */
+PyObject *PyExc_UnicodeTranslateError; /* listed in the exception table with base PyExc_UnicodeError */
PyObject *PyExc_TypeError;
PyObject *PyExc_ValueError;
PyObject *PyExc_ZeroDivisionError;
@@ -1035,6 +1632,12 @@ static struct {
FloatingPointError__doc__},
{"ValueError", &PyExc_ValueError, 0, ValueError__doc__},
{"UnicodeError", &PyExc_UnicodeError, &PyExc_ValueError, UnicodeError__doc__},
+ {"UnicodeEncodeError", &PyExc_UnicodeEncodeError, &PyExc_UnicodeError,
+ UnicodeEncodeError__doc__, UnicodeEncodeError_methods},
+ {"UnicodeDecodeError", &PyExc_UnicodeDecodeError, &PyExc_UnicodeError,
+ UnicodeDecodeError__doc__, UnicodeDecodeError_methods},
+ {"UnicodeTranslateError", &PyExc_UnicodeTranslateError, &PyExc_UnicodeError,
+ UnicodeTranslateError__doc__, UnicodeTranslateError_methods},
{"ReferenceError", &PyExc_ReferenceError, 0, ReferenceError__doc__},
{"SystemError", &PyExc_SystemError, 0, SystemError__doc__},
{"MemoryError", &PyExc_MemoryError, 0, MemoryError__doc__},