PEP 293 implementation (from SF patch http://www.python.org/sf/432401)

author: Walter Dörwald <walter@livinglogic.de> 2002-09-02 13:14:32 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2002-09-02 13:14:32 (GMT)
commit: 3aeb632c3152fa082132ce55b9a880e0d16b04ae (patch)
tree: 192bc1543ea77a826d0c940d024dbc8ebba82156 /Objects/unicodeobject.c
parent: 94fab762de532de551987e1f48a125145f85304b (diff)
download: cpython-3aeb632c3152fa082132ce55b9a880e0d16b04ae.zip
cpython-3aeb632c3152fa082132ce55b9a880e0d16b04ae.tar.gz
cpython-3aeb632c3152fa082132ce55b9a880e0d16b04ae.tar.bz2
1 files changed, 1240 insertions, 552 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 920f9ea..2108d94 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -528,8 +528,8 @@ PyObject *PyUnicode_Decode(const char *s,
 			   const char *errors)
 {
     PyObject *buffer = NULL, *unicode;
-    
-    if (encoding == NULL) 
+
+    if (encoding == NULL)
 	encoding = PyUnicode_GetDefaultEncoding();
 
     /* Shortcuts for common default encodings */
@@ -680,6 +680,92 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
     return -1;
 }
 
+/* error handling callback helper:
+   look up/create the cached handler and exception, call the handler and check its result,
+   if no exception occurred, copy the replacement to the output (growing it if needed)
+   and update *endinpos, *inptr, *outpos and *outptr.
+   return 0 on success, -1 on error
+*/
+
+static
+int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
+                 const char *encoding, const char *reason,
+                 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
+                 PyObject **output, int *outpos, Py_UNICODE **outptr)
+{
+    static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; /* PyArg_ParseTuple format; the text after ';' is reused as the TypeError message below */
+
+    PyObject *restuple = NULL;
+    PyObject *repunicode = NULL; /* filled in by PyArg_ParseTuple; not DECREF'd separately below */
+    int outsize = PyUnicode_GET_SIZE(*output); /* current size of the output object */
+    int requiredsize;
+    int newpos;
+    Py_UNICODE *repptr;
+    int repsize;
+    int res = -1; /* stays -1 unless every step below succeeds */
+
+    if (*errorHandler == NULL) { /* handler not looked up yet: do it once and keep it in the caller's slot */
+	*errorHandler = PyCodec_LookupError(errors);
+	if (*errorHandler == NULL)
+	   goto onError;
+    }
+
+    if (*exceptionObject == NULL) { /* no exception object yet: create one and keep it in the caller's slot */
+    	*exceptionObject = PyUnicodeDecodeError_Create(
+	    encoding, input, insize, *startinpos, *endinpos, reason);
+	if (*exceptionObject == NULL)
+	   goto onError;
+    }
+    else { /* reuse the existing exception object, updating start, end and reason */
+	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
+	    goto onError;
+	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
+	    goto onError;
+	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
+	    goto onError;
+    }
+
+    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
+    if (restuple == NULL)
+	goto onError;
+    if (!PyTuple_Check(restuple)) {
+	PyErr_Format(PyExc_TypeError, &argparse[4]); /* &argparse[4] skips the "O!i;" prefix; NOTE(review): used as a format string, fine only while the message contains no '%' */
+	goto onError;
+    }
+    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
+	goto onError;
+    if (newpos<0) /* clamp the position returned by the handler into [0, insize] */
+	newpos = 0;
+    else if (newpos>insize)
+	newpos = insize;
+
+    /* need more space? (at least enough for what we
+       have+the replacement+the rest of the string (starting
+       at the new input position), so we won't have to check space
+       when there are no errors in the rest of the string) */
+    repptr = PyUnicode_AS_UNICODE(repunicode);
+    repsize = PyUnicode_GET_SIZE(repunicode);
+    requiredsize = *outpos + repsize + insize-newpos;
+    if (requiredsize > outsize) {
+	if (requiredsize<2*outsize) /* when growing, grow to at least twice the current size */
+	    requiredsize = 2*outsize;
+	if (PyUnicode_Resize(output, requiredsize))
+	    goto onError;
+	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos; /* re-derive the write pointer from the (possibly new) output object */
+    }
+    *endinpos = newpos; /* hand the resume position back to the caller */
+    *inptr = input + newpos;
+    Py_UNICODE_COPY(*outptr, repptr, repsize);
+    *outptr += repsize;
+    *outpos += repsize;
+    /* we made it! */
+    res = 0;
+
+    onError: /* shared exit for success and failure; res tells which */
+    Py_XDECREF(restuple);
+    return res;
+}
+
 /* --- UTF-7 Codec -------------------------------------------------------- */
 
 /* see RFC2152 for details */
@@ -738,40 +824,14 @@ char utf7_special[128] = {
 		} \
     } \
 
-static
-int utf7_decoding_error(Py_UNICODE **dest,
-                        const char *errors,
-                        const char *details) 
-{
-    if ((errors == NULL) ||
-        (strcmp(errors,"strict") == 0)) {
-        PyErr_Format(PyExc_UnicodeError,
-                     "UTF-7 decoding error: %.400s",
-                     details);
-        return -1;
-    }
-    else if (strcmp(errors,"ignore") == 0) {
-        return 0;
-    }
-    else if (strcmp(errors,"replace") == 0) {
-        if (dest != NULL) {
-            **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
-            (*dest)++;
-        }
-        return 0;
-    }
-    else {
-        PyErr_Format(PyExc_ValueError,
-                     "UTF-7 decoding error; unknown error handling code: %.400s",
-                     errors);
-        return -1;
-    }
-}
-
 PyObject *PyUnicode_DecodeUTF7(const char *s,
 			       int size,
 			       const char *errors)
 {
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
     const char *e;
     PyUnicodeObject *unicode;
     Py_UNICODE *p;
@@ -779,7 +839,9 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
     int inShift = 0;
     unsigned int bitsleft = 0;
     unsigned long charsleft = 0;
-	int surrogate = 0;
+    int surrogate = 0;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
 
     unicode = _PyUnicode_New(size);
     if (!unicode)
@@ -791,7 +853,9 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
     e = s + size;
 
     while (s < e) {
-        Py_UNICODE ch = *s;
+        Py_UNICODE ch;
+        restart:
+        ch = *s;
 
         if (inShift) {
             if ((ch == '-') || !B64CHAR(ch)) {
@@ -836,6 +900,7 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
             }
         }
         else if ( ch == '+' ) {
+            startinpos = s-starts;
             s++;
             if (s < e && *s == '-') {
                 s++;
@@ -857,21 +922,39 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
         }
         continue;
     utf7Error:
-      if (utf7_decoding_error(&p, errors, errmsg))
-          goto onError;
+        outpos = p-PyUnicode_AS_UNICODE(unicode);
+        endinpos = s-starts;
+        if (unicode_decode_call_errorhandler(
+             errors, &errorHandler,
+             "utf7", errmsg,
+             starts, size, &startinpos, &endinpos, &exc, &s,
+             (PyObject **)&unicode, &outpos, &p))
+        goto onError;
     }
 
     if (inShift) {
-        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
+        outpos = p-PyUnicode_AS_UNICODE(unicode);
+        endinpos = size;
+        if (unicode_decode_call_errorhandler(
+             errors, &errorHandler,
+             "utf7", "unterminated shift sequence",
+             starts, size, &startinpos, &endinpos, &exc, &s,
+             (PyObject **)&unicode, &outpos, &p))
             goto onError;
+        if (s < e)
+           goto restart;
     }
 
-    if (_PyUnicode_Resize(&unicode, p - unicode->str))
+    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
         goto onError;
 
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return (PyObject *)unicode;
 
 onError:
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     Py_DECREF(unicode);
     return NULL;
 }
@@ -1001,46 +1084,21 @@ char utf8_code_length[256] = {
     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
 };
 
-static
-int utf8_decoding_error(const char **source,
-                        Py_UNICODE **dest,
-                        const char *errors,
-                        const char *details) 
-{
-    if ((errors == NULL) ||
-        (strcmp(errors,"strict") == 0)) {
-        PyErr_Format(PyExc_UnicodeError,
-                     "UTF-8 decoding error: %.400s",
-                     details);
-        return -1;
-    }
-    else if (strcmp(errors,"ignore") == 0) {
-        (*source)++;
-        return 0;
-    }
-    else if (strcmp(errors,"replace") == 0) {
-        (*source)++;
-        **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
-        (*dest)++;
-        return 0;
-    }
-    else {
-        PyErr_Format(PyExc_ValueError,
-                     "UTF-8 decoding error; unknown error handling code: %.400s",
-                     errors);
-        return -1;
-    }
-}
-
 PyObject *PyUnicode_DecodeUTF8(const char *s,
 			       int size,
 			       const char *errors)
 {
+    const char *starts = s;
     int n;
+    int startinpos;
+    int endinpos;
+    int outpos;
     const char *e;
     PyUnicodeObject *unicode;
     Py_UNICODE *p;
     const char *errmsg = "";
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
 
     /* Note: size will always be longer than the resulting Unicode
        character count */
@@ -1067,6 +1125,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
 
         if (s + n > e) {
 	    errmsg = "unexpected end of data";
+	    startinpos = s-starts;
+	    endinpos = size;
 	    goto utf8Error;
 	}
 
@@ -1074,19 +1134,27 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
 
         case 0:
             errmsg = "unexpected code byte";
+	    startinpos = s-starts;
+	    endinpos = startinpos+1;
 	    goto utf8Error;
 
         case 1:
             errmsg = "internal error";
+	    startinpos = s-starts;
+	    endinpos = startinpos+1;
 	    goto utf8Error;
 
         case 2:
             if ((s[1] & 0xc0) != 0x80) {
                 errmsg = "invalid data";
+		startinpos = s-starts;
+		endinpos = startinpos+2;
 		goto utf8Error;
 	    }
             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
             if (ch < 0x80) {
+		startinpos = s-starts;
+		endinpos = startinpos+2;
                 errmsg = "illegal encoding";
 		goto utf8Error;
 	    }
@@ -1098,6 +1166,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
             if ((s[1] & 0xc0) != 0x80 || 
                 (s[2] & 0xc0) != 0x80) {
                 errmsg = "invalid data";
+		startinpos = s-starts;
+		endinpos = startinpos+3;
 		goto utf8Error;
 	    }
             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
@@ -1110,6 +1180,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
 		       unit.
 		*/
                 errmsg = "illegal encoding";
+		startinpos = s-starts;
+		endinpos = startinpos+3;
 		goto utf8Error;
 	    }
 	    else
@@ -1121,6 +1193,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
                 (s[2] & 0xc0) != 0x80 ||
                 (s[3] & 0xc0) != 0x80) {
                 errmsg = "invalid data";
+		startinpos = s-starts;
+		endinpos = startinpos+4;
 		goto utf8Error;
 	    }
             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
@@ -1132,6 +1206,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
 					 UTF-16 */
 	    {
                 errmsg = "illegal encoding";
+		startinpos = s-starts;
+		endinpos = startinpos+4;
 		goto utf8Error;
 	    }
 #ifdef Py_UNICODE_WIDE
@@ -1153,23 +1229,34 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
         default:
             /* Other sizes are only needed for UCS-4 */
             errmsg = "unsupported Unicode code range";
+	    startinpos = s-starts;
+	    endinpos = startinpos+n;
 	    goto utf8Error;
         }
         s += n;
 	continue;
 	
     utf8Error:
-      if (utf8_decoding_error(&s, &p, errors, errmsg))
-          goto onError;
+    outpos = p-PyUnicode_AS_UNICODE(unicode);
+    if (unicode_decode_call_errorhandler(
+	     errors, &errorHandler,
+	     "utf8", errmsg,
+	     starts, size, &startinpos, &endinpos, &exc, &s,
+	     (PyObject **)&unicode, &outpos, &p))
+	goto onError;
     }
 
     /* Adjust length */
     if (_PyUnicode_Resize(&unicode, p - unicode->str))
         goto onError;
 
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return (PyObject *)unicode;
 
 onError:
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     Py_DECREF(unicode);
     return NULL;
 }
@@ -1287,43 +1374,16 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 
 /* --- UTF-16 Codec ------------------------------------------------------- */
 
-static
-int utf16_decoding_error(Py_UNICODE **dest,
-			 const char *errors,
-			 const char *details) 
-{
-    if ((errors == NULL) ||
-        (strcmp(errors,"strict") == 0)) {
-        PyErr_Format(PyExc_UnicodeError,
-                     "UTF-16 decoding error: %.400s",
-                     details);
-        return -1;
-    }
-    else if (strcmp(errors,"ignore") == 0) {
-        return 0;
-    }
-    else if (strcmp(errors,"replace") == 0) {
-	if (dest) {
-	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
-	    (*dest)++;
-	}
-        return 0;
-    }
-    else {
-        PyErr_Format(PyExc_ValueError,
-                     "UTF-16 decoding error; "
-		     "unknown error handling code: %.400s",
-                     errors);
-        return -1;
-    }
-}
-
 PyObject *
 PyUnicode_DecodeUTF16(const char *s,
 		      int size,
 		      const char *errors,
 		      int *byteorder)
 {
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
     PyUnicodeObject *unicode;
     Py_UNICODE *p;
     const unsigned char *q, *e;
@@ -1335,13 +1395,8 @@ PyUnicode_DecodeUTF16(const char *s,
 #else
     int ihi = 0, ilo = 1;
 #endif
-
-    /* size should be an even number */
-    if (size & 1) {
-        if (utf16_decoding_error(NULL, errors, "truncated data"))
-            return NULL;
-        --size;  /* else ignore the oddball byte */
-    }
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
 
     /* Note: size will always be longer than the resulting Unicode
        character count */
@@ -1398,7 +1453,18 @@ PyUnicode_DecodeUTF16(const char *s,
     }
 
     while (q < e) {
-	Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
+	Py_UNICODE ch;
+	/* remaining bytes at the end? (size should be even) */
+	if (e-q<2) {
+	    errmsg = "truncated data";
+	    startinpos = ((const char *)q)-starts;
+	    endinpos = ((const char *)e)-starts;
+	    goto utf16Error;
+	    /* The remaining input chars are ignored if the callback
+	       chooses to skip the input */
+	}
+	ch = (q[ihi] << 8) | q[ilo];
+
 	q += 2;
 
 	if (ch < 0xD800 || ch > 0xDFFF) {
@@ -1409,6 +1475,8 @@ PyUnicode_DecodeUTF16(const char *s,
 	/* UTF-16 code pair: */
 	if (q >= e) {
 	    errmsg = "unexpected end of data";
+	    startinpos = (((const char *)q)-2)-starts;
+	    endinpos = ((const char *)e)-starts;
 	    goto utf16Error;
 	}
 	if (0xD800 <= ch && ch <= 0xDBFF) {
@@ -1425,15 +1493,24 @@ PyUnicode_DecodeUTF16(const char *s,
 	    }
 	    else {
                 errmsg = "illegal UTF-16 surrogate";
+		startinpos = (((const char *)q)-4)-starts;
+		endinpos = startinpos+2;
 		goto utf16Error;
 	    }
 
 	}
 	errmsg = "illegal encoding";
+	startinpos = (((const char *)q)-2)-starts;
+	endinpos = startinpos+2;
 	/* Fall through to report the error */
 
     utf16Error:
-	if (utf16_decoding_error(&p, errors, errmsg))
+	outpos = p-PyUnicode_AS_UNICODE(unicode);
+	if (unicode_decode_call_errorhandler(
+	         errors, &errorHandler,
+	         "utf16", errmsg,
+	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
+	         (PyObject **)&unicode, &outpos, &p))
 	    goto onError;
     }
 
@@ -1444,10 +1521,14 @@ PyUnicode_DecodeUTF16(const char *s,
     if (_PyUnicode_Resize(&unicode, p - unicode->str))
         goto onError;
 
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return (PyObject *)unicode;
 
 onError:
     Py_DECREF(unicode);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return NULL;
 }
 
@@ -1528,63 +1609,43 @@ PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
 
 /* --- Unicode Escape Codec ----------------------------------------------- */
 
-static
-int unicodeescape_decoding_error(Py_UNICODE **x,
-                                 const char *errors,
-                                 const char *details) 
-{
-    if ((errors == NULL) ||
-        (strcmp(errors,"strict") == 0)) {
-        PyErr_Format(PyExc_UnicodeError,
-                     "Unicode-Escape decoding error: %.400s",
-                     details);
-        return -1;
-    }
-    else if (strcmp(errors,"ignore") == 0) {
-        return 0;
-    }
-    else if (strcmp(errors,"replace") == 0) {
-        **x = Py_UNICODE_REPLACEMENT_CHARACTER;
-	(*x)++;
-        return 0;
-    }
-    else {
-        PyErr_Format(PyExc_ValueError,
-                     "Unicode-Escape decoding error; "
-                     "unknown error handling code: %.400s",
-                     errors);
-        return -1;
-    }
-}
-
 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
 
 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
 					int size,
 					const char *errors)
 {
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
+    int i;
     PyUnicodeObject *v;
-    Py_UNICODE *p, *buf;
+    Py_UNICODE *p;
     const char *end;
     char* message;
     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
 
     /* Escaped strings will always be longer than the resulting
        Unicode string, so we start with size here and then reduce the
-       length after conversion to the true value. */
+       length after conversion to the true value.
+       (but if the error callback returns a long replacement string
+       we'll have to allocate more space) */
     v = _PyUnicode_New(size);
     if (v == NULL)
         goto onError;
     if (size == 0)
         return (PyObject *)v;
 
-    p = buf = PyUnicode_AS_UNICODE(v);
+    p = PyUnicode_AS_UNICODE(v);
     end = s + size;
 
     while (s < end) {
         unsigned char c;
         Py_UNICODE x;
-        int i, digits;
+        int digits;
 
         /* Non-escape characters are interpreted as Unicode ordinals */
         if (*s != '\\') {
@@ -1592,6 +1653,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
             continue;
         }
 
+        startinpos = s-starts;
         /* \ - Escapes */
         s++;
         switch (*s++) {
@@ -1640,14 +1702,28 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
             message = "truncated \\UXXXXXXXX escape";
         hexescape:
             chr = 0;
-            for (i = 0; i < digits; i++) {
+            outpos = p-PyUnicode_AS_UNICODE(v);
+            if (s+digits>end) {
+                endinpos = size;
+                if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "unicodeescape", "end of string in escape sequence",
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    (PyObject **)&v, &outpos, &p))
+                    goto onError;
+                goto nextByte;
+            }
+            for (i = 0; i < digits; ++i) {
                 c = (unsigned char) s[i];
                 if (!isxdigit(c)) {
-                    if (unicodeescape_decoding_error(&p, errors, message))
+                    endinpos = (s+i+1)-starts;
+                    if (unicode_decode_call_errorhandler(
+                        errors, &errorHandler,
+                        "unicodeescape", message,
+                        starts, size, &startinpos, &endinpos, &exc, &s,
+                        (PyObject **)&v, &outpos, &p))
                         goto onError;
-                    chr = 0xffffffff;
-                    i++;
-                    break;
+                    goto nextByte;
                 }
                 chr = (chr<<4) & ~0xF;
                 if (c >= '0' && c <= '9')
@@ -1659,9 +1735,9 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
             }
             s += i;
             if (chr == 0xffffffff)
-                    /* _decoding_error will have already written into the
-                       target buffer. */
-                    break;
+                /* _decoding_error will have already written into the
+                   target buffer. */
+                break;
         store:
             /* when we get here, chr is a 32-bit unicode character */
             if (chr <= 0xffff)
@@ -1678,10 +1754,13 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
 #endif
             } else {
-                if (unicodeescape_decoding_error(
-                    &p, errors,
-                    "illegal Unicode character")
-                    )
+                endinpos = s-starts;
+                outpos = p-PyUnicode_AS_UNICODE(v);
+                if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "unicodeescape", "illegal Unicode character",
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    (PyObject **)&v, &outpos, &p))
                     goto onError;
             }
             break;
@@ -1717,13 +1796,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                         goto store;
                 }
             }
-            if (unicodeescape_decoding_error(&p, errors, message))
+            endinpos = s-starts;
+            outpos = p-PyUnicode_AS_UNICODE(v);
+            if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "unicodeescape", message,
+                starts, size, &startinpos, &endinpos, &exc, &s,
+                (PyObject **)&v, &outpos, &p))
                 goto onError;
             break;
 
         default:
             if (s > end) {
-                if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
+                message = "\\ at end of string";
+                s--;
+                endinpos = s-starts;
+                outpos = p-PyUnicode_AS_UNICODE(v);
+                if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "unicodeescape", message,
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    (PyObject **)&v, &outpos, &p))
                     goto onError;
             }
             else {
@@ -1732,9 +1825,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
             }
             break;
         }
+        nextByte:
+        ;
     }
-    if (_PyUnicode_Resize(&v, (int)(p - buf)))
-                goto onError;
+    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
+        goto onError;
     return (PyObject *)v;
 
 ucnhashError:
@@ -1742,10 +1837,14 @@ ucnhashError:
         PyExc_UnicodeError,
         "\\N escapes not supported (can't load unicodedata module)"
         );
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return NULL;
 
 onError:
     Py_XDECREF(v);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return NULL;
 }
 
@@ -1909,20 +2008,27 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 					   int size,
 					   const char *errors)
 {
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
     PyUnicodeObject *v;
-    Py_UNICODE *p, *buf;
+    Py_UNICODE *p;
     const char *end;
     const char *bs;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
     
     /* Escaped strings will always be longer than the resulting
        Unicode string, so we start with size here and then reduce the
-       length after conversion to the true value. */
+       length after conversion to the true value. (But decoding error
+       handler might have to resize the string) */
     v = _PyUnicode_New(size);
     if (v == NULL)
 	goto onError;
     if (size == 0)
 	return (PyObject *)v;
-    p = buf = PyUnicode_AS_UNICODE(v);
+    p = PyUnicode_AS_UNICODE(v);
     end = s + size;
     while (s < end) {
 	unsigned char c;
@@ -1934,6 +2040,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 	    *p++ = (unsigned char)*s++;
 	    continue;
 	}
+	startinpos = s-starts;
 
 	/* \u-escapes are only interpreted iff the number of leading
 	   backslashes if odd */
@@ -1952,15 +2059,18 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 	s++;
 
 	/* \uXXXX with 4 hex digits */
-	for (x = 0, i = 0; i < 4; i++) {
-	    c = (unsigned char)s[i];
+	outpos = p-PyUnicode_AS_UNICODE(v);
+	for (x = 0, i = 0; i < 4; ++i, ++s) {
+	    c = (unsigned char)*s;
 	    if (!isxdigit(c)) {
-		if (unicodeescape_decoding_error(&p, errors,
-						 "truncated \\uXXXX"))
+		endinpos = s-starts;
+		if (unicode_decode_call_errorhandler(
+		    errors, &errorHandler,
+		    "rawunicodeescape", "truncated \\uXXXX",
+		    starts, size, &startinpos, &endinpos, &exc, &s,
+		    (PyObject **)&v, &outpos, &p))
 		    goto onError;
-		x = 0xffffffff;
-		i++;
-		break;
+		goto nextByte;
 	    }
 	    x = (x<<4) & ~0xF;
 	    if (c >= '0' && c <= '9')
@@ -1970,16 +2080,20 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 	    else
 		x += 10 + c - 'A';
 	}
-	s += i;
-	if (x != 0xffffffff)
-		*p++ = x;
+	*p++ = x;
+	nextByte:
+	;
     }
-    if (_PyUnicode_Resize(&v, (int)(p - buf)))
+    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
 	goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return (PyObject *)v;
     
  onError:
     Py_XDECREF(v);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return NULL;
 }
 
@@ -2059,71 +2173,271 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
     return NULL;
 }
 
-static
-int latin1_encoding_error(const Py_UNICODE **source,
-			  char **dest,
-			  const char *errors,
-			  const char *details) 
-{
-    if ((errors == NULL) ||
-	(strcmp(errors,"strict") == 0)) {
-	PyErr_Format(PyExc_UnicodeError,
-		     "Latin-1 encoding error: %.400s",
-		     details);
-	return -1;
-    }
-    else if (strcmp(errors,"ignore") == 0) {
-	return 0;
-    }
-    else if (strcmp(errors,"replace") == 0) {
-	**dest = '?';
-	(*dest)++;
-	return 0;
+/* create or adjust a UnicodeEncodeError */
+static void make_encode_exception(PyObject **exceptionObject,
+    const char *encoding,
+    const Py_UNICODE *unicode, int size,
+    int startpos, int endpos,
+    const char *reason)
+{
+    if (*exceptionObject == NULL) {
+	*exceptionObject = PyUnicodeEncodeError_Create(
+	    encoding, unicode, size, startpos, endpos, reason);
     }
     else {
-	PyErr_Format(PyExc_ValueError,
-		     "Latin-1 encoding error; "
-		     "unknown error handling code: %.400s",
-		     errors);
-	return -1;
+	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
+	    goto onError;
+	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
+	    goto onError;
+	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
+	    goto onError;
+	return;
+	onError:
+	Py_DECREF(*exceptionObject);
+	*exceptionObject = NULL;
     }
 }
 
-PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
-				 int size,
-				 const char *errors)
+/* raises a UnicodeEncodeError */
+static void raise_encode_exception(PyObject **exceptionObject,
+    const char *encoding,
+    const Py_UNICODE *unicode, int size,
+    int startpos, int endpos,
+    const char *reason)
 {
-    PyObject *repr;
-    char *s, *start;
+    make_encode_exception(exceptionObject,
+	encoding, unicode, size, startpos, endpos, reason);
+    if (*exceptionObject != NULL)
+	PyCodec_StrictErrors(*exceptionObject);
+}
 
-    repr = PyString_FromStringAndSize(NULL, size);
-    if (repr == NULL)
-        return NULL;
-    if (size == 0)
-	return repr;
+/* error handling callback helper:
+   build arguments, call the callback and check its result,
+   put the result into newpos and return the replacement string, which
+   has to be freed by the caller */
+static PyObject *unicode_encode_call_errorhandler(const char *errors,
+    PyObject **errorHandler,
+    const char *encoding, const char *reason,
+    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
+    int startpos, int endpos,
+    int *newpos)
+{
+    static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
 
-    s = PyString_AS_STRING(repr);
-    start = s;
-    while (size-- > 0) {
-        Py_UNICODE ch = *p++;
-	if (ch >= 256) {
-	    if (latin1_encoding_error(&p, &s, errors, 
-				      "ordinal not in range(256)"))
-		goto onError;
+    PyObject *restuple;
+    PyObject *resunicode;
+
+    if (*errorHandler == NULL) {
+	*errorHandler = PyCodec_LookupError(errors);
+        if (*errorHandler == NULL)
+	    return NULL;
+    }
+
+    make_encode_exception(exceptionObject,
+	encoding, unicode, size, startpos, endpos, reason);
+    if (*exceptionObject == NULL)
+	return NULL;
+
+    restuple = PyObject_CallFunctionObjArgs(
+	*errorHandler, *exceptionObject, NULL);
+    if (restuple == NULL)
+	return NULL;
+    if (!PyTuple_Check(restuple)) {
+	PyErr_Format(PyExc_TypeError, &argparse[4]);
+	Py_DECREF(restuple);
+	return NULL;
+    }
+    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+	&resunicode, newpos)) {
+	Py_DECREF(restuple);
+	return NULL;
+    }
+    if (*newpos<0)
+	*newpos = 0;
+    else if (*newpos>size)
+	*newpos = size;
+    Py_INCREF(resunicode);
+    Py_DECREF(restuple);
+    return resunicode;
+}
+
+static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
+				 int size,
+				 const char *errors,
+				 int limit)
+{
+    /* output object */
+    PyObject *res;
+    /* pointers to the beginning and end+1 of input */
+    const Py_UNICODE *startp = p;
+    const Py_UNICODE *endp = p + size;
+    /* pointer to the beginning of the unencodable characters */
+    /* const Py_UNICODE *badp = NULL; */
+    /* pointer into the output */
+    char *str;
+    /* current output position */
+    int respos = 0;
+    int ressize;
+    char *encoding = (limit == 256) ? "latin-1" : "ascii";
+    char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    /* the following variable is used for caching string comparisons
+     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
+    int known_errorHandler = -1;
+
+    /* allocate enough for a simple encoding without
+       replacements, if we need more, we'll resize */
+    res = PyString_FromStringAndSize(NULL, size);
+    if (res == NULL)
+        goto onError;
+    if (size == 0)
+	return res;
+    str = PyString_AS_STRING(res);
+    ressize = size;
+
+    while (p<endp) {
+	Py_UNICODE c = *p;
+
+	/* can we encode this? */
+	if (c<limit) {
+	    /* no overflow check, because we know that the space is enough */
+	    *str++ = (char)c;
+	    ++p;
+	}
+	else {
+	    int unicodepos = p-startp;
+	    int requiredsize;
+	    PyObject *repunicode;
+	    int repsize;
+	    int newpos;
+	    int respos;
+	    Py_UNICODE *uni2;
+	    /* startpos for collecting unencodable chars */
+	    const Py_UNICODE *collstart = p;
+	    const Py_UNICODE *collend = p;
+	    /* find all unencodable characters */
+	    while ((collend < endp) && ((*collend)>=limit))
+		++collend;
+	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
+	    if (known_errorHandler==-1) {
+		if ((errors==NULL) || (!strcmp(errors, "strict")))
+		    known_errorHandler = 1;
+		else if (!strcmp(errors, "replace"))
+		    known_errorHandler = 2;
+		else if (!strcmp(errors, "ignore"))
+		    known_errorHandler = 3;
+		else if (!strcmp(errors, "xmlcharrefreplace"))
+		    known_errorHandler = 4;
+		else
+		    known_errorHandler = 0;
+	    }
+	    switch (known_errorHandler) {
+		case 1: /* strict */
+		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
+		    goto onError;
+		case 2: /* replace */
+		    while (collstart++<collend)
+			*str++ = '?'; /* fall through */
+		case 3: /* ignore */
+		    p = collend;
+		    break;
+		case 4: /* xmlcharrefreplace */
+		    respos = str-PyString_AS_STRING(res);
+		    /* determine replacement size (temporarily (mis)uses p) */
+		    for (p = collstart, repsize = 0; p < collend; ++p) {
+			if (*p<10)
+			    repsize += 2+1+1;
+			else if (*p<100)
+			    repsize += 2+2+1;
+			else if (*p<1000)
+			    repsize += 2+3+1;
+			else if (*p<10000)
+			    repsize += 2+4+1;
+			else if (*p<100000)
+			    repsize += 2+5+1;
+			else if (*p<1000000)
+			    repsize += 2+6+1;
+			else
+			    repsize += 2+7+1;
+		    }
+		    requiredsize = respos+repsize+(endp-collend);
+		    if (requiredsize > ressize) {
+			if (requiredsize<2*ressize)
+			    requiredsize = 2*ressize;
+			if (_PyString_Resize(&res, requiredsize))
+			    goto onError;
+			str = PyString_AS_STRING(res) + respos;
+			ressize = requiredsize;
+		    }
+		    /* generate replacement (temporarily (mis)uses p) */
+		    for (p = collstart; p < collend; ++p) {
+			str += sprintf(str, "&#%d;", (int)*p);
+		    }
+		    p = collend;
+		    break;
+		default:
+		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
+			encoding, reason, startp, size, &exc,
+			collstart-startp, collend-startp, &newpos);
+		    if (repunicode == NULL)
+			goto onError;
+		    /* need more space? (at least enough for what we
+		       have+the replacement+the rest of the string, so
+		       we won't have to check space for encodable characters) */
+		    respos = str-PyString_AS_STRING(res);
+		    repsize = PyUnicode_GET_SIZE(repunicode);
+		    requiredsize = respos+repsize+(endp-collend);
+		    if (requiredsize > ressize) {
+			if (requiredsize<2*ressize)
+			    requiredsize = 2*ressize;
+			if (_PyString_Resize(&res, requiredsize)) {
+			    Py_DECREF(repunicode);
+			    goto onError;
+			}
+			str = PyString_AS_STRING(res) + respos;
+			ressize = requiredsize;
+		    }
+		    /* check if there is anything unencodable in the replacement
+		       and copy it to the output */
+		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
+			c = *uni2;
+			if (c >= limit) {
+			    raise_encode_exception(&exc, encoding, startp, size,
+				unicodepos, unicodepos+1, reason);
+			    Py_DECREF(repunicode);
+			    goto onError;
+			}
+			*str = (char)c;
+		    }
+		    p = startp + newpos;
+		    Py_DECREF(repunicode);
+	    }
 	}
-	else
-            *s++ = (char)ch;
     }
-    /* Resize if error handling skipped some characters */
-    if (s - start < PyString_GET_SIZE(repr))
-	_PyString_Resize(&repr, s - start);
-    return repr;
+    /* Resize if we allocated too much */
+    respos = str-PyString_AS_STRING(res);
+    if (respos<ressize)
+       /* If this fails, res will be NULL */
+	_PyString_Resize(&res, respos);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return res;
 
- onError:
-    Py_DECREF(repr);
+    onError:
+    Py_XDECREF(res);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return NULL;
 }
 
+PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
+				 int size,
+				 const char *errors)
+{
+    return unicode_encode_ucs1(p, size, errors, 256); /* limit 256: the shared encoder then uses the "latin-1" name and the range(256) reason */
+}
+
 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
 {
     if (!PyUnicode_Check(unicode)) {
@@ -2137,42 +2451,19 @@ PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
 
 /* --- 7-bit ASCII Codec -------------------------------------------------- */
 
-static
-int ascii_decoding_error(const char **source,
-			 Py_UNICODE **dest,
-			 const char *errors,
-			 const char *details) 
-{
-    if ((errors == NULL) ||
-	(strcmp(errors,"strict") == 0)) {
-	PyErr_Format(PyExc_UnicodeError,
-		     "ASCII decoding error: %.400s",
-		     details);
-	return -1;
-    }
-    else if (strcmp(errors,"ignore") == 0) {
-	return 0;
-    }
-    else if (strcmp(errors,"replace") == 0) {
-	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
-	(*dest)++;
-	return 0;
-    }
-    else {
-	PyErr_Format(PyExc_ValueError,
-		     "ASCII decoding error; "
-		     "unknown error handling code: %.400s",
-		     errors);
-	return -1;
-    }
-}
-
 PyObject *PyUnicode_DecodeASCII(const char *s,
 				int size,
 				const char *errors)
 {
+    const char *starts = s;
     PyUnicodeObject *v;
     Py_UNICODE *p;
+    int startinpos;
+    int endinpos;
+    int outpos;
+    const char *e;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
     
     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
     if (size == 1 && *(unsigned char*)s < 128) {
@@ -2186,89 +2477,44 @@ PyObject *PyUnicode_DecodeASCII(const char *s,
     if (size == 0)
 	return (PyObject *)v;
     p = PyUnicode_AS_UNICODE(v);
-    while (size-- > 0) {
-	register unsigned char c;
-
-	c = (unsigned char)*s++;
-	if (c < 128)
+    e = s + size;
+    while (s < e) {
+	register unsigned char c = (unsigned char)*s;
+	if (c < 128) {
 	    *p++ = c;
-	else if (ascii_decoding_error(&s, &p, errors, 
-				      "ordinal not in range(128)"))
+	    ++s;
+	}
+	else {
+	    startinpos = s-starts;
+	    endinpos = startinpos + 1;
+	    outpos = p-PyUnicode_AS_UNICODE(v);
+	    if (unicode_decode_call_errorhandler(
+		 errors, &errorHandler,
+		 "ascii", "ordinal not in range(128)",
+		 starts, size, &startinpos, &endinpos, &exc, &s,
+		 (PyObject **)&v, &outpos, &p))
 		goto onError;
+	}
     }
     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
 	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
 	    goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return (PyObject *)v;
     
  onError:
     Py_XDECREF(v);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return NULL;
 }
 
-static
-int ascii_encoding_error(const Py_UNICODE **source,
-			 char **dest,
-			 const char *errors,
-			 const char *details) 
-{
-    if ((errors == NULL) ||
-	(strcmp(errors,"strict") == 0)) {
-	PyErr_Format(PyExc_UnicodeError,
-		     "ASCII encoding error: %.400s",
-		     details);
-	return -1;
-    }
-    else if (strcmp(errors,"ignore") == 0) {
-	return 0;
-    }
-    else if (strcmp(errors,"replace") == 0) {
-	**dest = '?';
-	(*dest)++;
-	return 0;
-    }
-    else {
-	PyErr_Format(PyExc_ValueError,
-		     "ASCII encoding error; "
-		     "unknown error handling code: %.400s",
-		     errors);
-	return -1;
-    }
-}
-
 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
 				int size,
 				const char *errors)
 {
-    PyObject *repr;
-    char *s, *start;
-
-    repr = PyString_FromStringAndSize(NULL, size);
-    if (repr == NULL)
-        return NULL;
-    if (size == 0)
-	return repr;
-
-    s = PyString_AS_STRING(repr);
-    start = s;
-    while (size-- > 0) {
-        Py_UNICODE ch = *p++;
-	if (ch >= 128) {
-	    if (ascii_encoding_error(&p, &s, errors, 
-				      "ordinal not in range(128)"))
-		goto onError;
-	}
-	else
-            *s++ = (char)ch;
-    }
-    /* Resize if error handling skipped some characters */
-    if (s - start < PyString_GET_SIZE(repr))
-	_PyString_Resize(&repr, s - start);
-    return repr;
-
- onError:
-    Py_DECREF(repr);
-    return NULL;
+    return unicode_encode_ucs1(p, size, errors, 128);
 }
 
 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
@@ -2348,44 +2594,21 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
 
 /* --- Character Mapping Codec -------------------------------------------- */
 
-static
-int charmap_decoding_error(const char **source,
-			 Py_UNICODE **dest,
-			 const char *errors,
-			 const char *details) 
-{
-    if ((errors == NULL) ||
-	(strcmp(errors,"strict") == 0)) {
-	PyErr_Format(PyExc_UnicodeError,
-		     "charmap decoding error: %.400s",
-		     details);
-	return -1;
-    }
-    else if (strcmp(errors,"ignore") == 0) {
-	return 0;
-    }
-    else if (strcmp(errors,"replace") == 0) {
-	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
-	(*dest)++;
-	return 0;
-    }
-    else {
-	PyErr_Format(PyExc_ValueError,
-		     "charmap decoding error; "
-		     "unknown error handling code: %.400s",
-		     errors);
-	return -1;
-    }
-}
-
 PyObject *PyUnicode_DecodeCharmap(const char *s,
 				  int size,
 				  PyObject *mapping,
 				  const char *errors)
 {
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
+    const char *e;
     PyUnicodeObject *v;
     Py_UNICODE *p;
     int extrachars = 0;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
     
     /* Default to Latin-1 */
     if (mapping == NULL)
@@ -2397,8 +2620,9 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
     if (size == 0)
 	return (PyObject *)v;
     p = PyUnicode_AS_UNICODE(v);
-    while (size-- > 0) {
-	unsigned char ch = *s++;
+    e = s + size;
+    while (s < e) {
+	unsigned char ch = *s;
 	PyObject *w, *x;
 
 	/* Get mapping (char ordinal -> integer, Unicode char or None) */
@@ -2430,11 +2654,18 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
 	}
 	else if (x == Py_None) {
 	    /* undefined mapping */
-	    if (charmap_decoding_error(&s, &p, errors, 
-				       "character maps to <undefined>")) {
+	    outpos = p-PyUnicode_AS_UNICODE(v);
+	    startinpos = s-starts;
+	    endinpos = startinpos+1;
+	    if (unicode_decode_call_errorhandler(
+		 errors, &errorHandler,
+		 "charmap", "character maps to <undefined>",
+		 starts, size, &startinpos, &endinpos, &exc, &s,
+		 (PyObject **)&v, &outpos, &p)) {
 		Py_DECREF(x);
 		goto onError;
 	    }
+	    continue;
 	}
 	else if (PyUnicode_Check(x)) {
 	    int targetsize = PyUnicode_GET_SIZE(x);
@@ -2474,45 +2705,233 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
 	    goto onError;
 	}
 	Py_DECREF(x);
+	++s;
     }
     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
 	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
 	    goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     return (PyObject *)v;
     
  onError:
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
     Py_XDECREF(v);
     return NULL;
 }
 
-static
-int charmap_encoding_error(const Py_UNICODE **source,
-			   char **dest,
-			   const char *errors,
-			   const char *details) 
-{
-    if ((errors == NULL) ||
-	(strcmp(errors,"strict") == 0)) {
-	PyErr_Format(PyExc_UnicodeError,
-		     "charmap encoding error: %.400s",
-		     details);
-	return -1;
+/* Look up the character c in the mapping. If the character
+   can't be found, Py_None is returned (or NULL, if another
+   error occurred). */
+static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
+{
+    PyObject *w = PyInt_FromLong((long)c);
+    PyObject *x;
+
+    if (w == NULL)
+	 return NULL;
+    x = PyObject_GetItem(mapping, w);
+    Py_DECREF(w);
+    if (x == NULL) {
+	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
+	    /* No mapping found means: mapping is undefined. */
+	    PyErr_Clear();
+	    x = Py_None;
+	    Py_INCREF(x);
+	    return x;
+	} else
+	    return NULL;
     }
-    else if (strcmp(errors,"ignore") == 0) {
-	return 0;
+    else if (PyInt_Check(x)) {
+	long value = PyInt_AS_LONG(x);
+	if (value < 0 || value > 255) {
+	    PyErr_SetString(PyExc_TypeError,
+			     "character mapping must be in range(256)");
+	    Py_DECREF(x);
+	    return NULL;
+	}
+	return x;
     }
-    else if (strcmp(errors,"replace") == 0) {
-	**dest = '?';
-	(*dest)++;
-	return 0;
+    else if (PyString_Check(x))
+	return x;
+    else {
+	/* wrong return value */
+	PyErr_SetString(PyExc_TypeError,
+	      "character mapping must return integer, None or str");
+	Py_DECREF(x);
+	return NULL;
     }
+}
+
+/* lookup the character, put the result in the output string and adjust
+   various state variables. Reallocate the output string if not enough
+   space is available. Return a new reference to the object that
+   was put in the output buffer, or Py_None, if the mapping was undefined
+   (in which case no character was written) or NULL, if the lookup or a
+   reallocation failed. The caller must decref the result */
+static
+PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
+    PyObject **outobj, int *outpos)
+{
+    PyObject *rep = charmapencode_lookup(c, mapping);
+
+    if (rep==NULL)
+	return NULL;
+    else if (rep==Py_None)
+	return rep;
     else {
-	PyErr_Format(PyExc_ValueError,
-		     "charmap encoding error; "
-		     "unknown error handling code: %.400s",
-		     errors);
-	return -1;
+	char *outstart = PyString_AS_STRING(*outobj);
+	int outsize = PyString_GET_SIZE(*outobj);
+	if (PyInt_Check(rep)) {
+	    int requiredsize = *outpos+1;
+	    if (outsize<requiredsize) {
+		/* exponentially overallocate to minimize reallocations */
+		if (requiredsize < 2*outsize)
+		    requiredsize = 2*outsize;
+		if (_PyString_Resize(outobj, requiredsize)) {
+		    Py_DECREF(rep);
+		    return NULL;
+		}
+		outstart = PyString_AS_STRING(*outobj);
+	    }
+	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
+	}
+	else {
+	    const char *repchars = PyString_AS_STRING(rep);
+	    int repsize = PyString_GET_SIZE(rep);
+	    int requiredsize = *outpos+repsize;
+	    if (outsize<requiredsize) {
+		/* exponentially overallocate to minimize reallocations */
+		if (requiredsize < 2*outsize)
+		    requiredsize = 2*outsize;
+		if (_PyString_Resize(outobj, requiredsize)) {
+		    Py_DECREF(rep);
+		    return NULL;
+		}
+		outstart = PyString_AS_STRING(*outobj);
+	    }
+	    memcpy(outstart + *outpos, repchars, repsize);
+	    *outpos += repsize;
+	}
+    }
+    return rep;
+}
+
+/* handle an error in PyUnicode_EncodeCharmap; *errorHandler caches the
+   looked-up error callback for the caller. Return 0 on success, -1 on error */
+static
+int charmap_encoding_error(
+    const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
+    PyObject **exceptionObject,
+    int *known_errorHandler, PyObject **errorHandler, const char *errors,
+    PyObject **res, int *respos)
+{
+    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+    int repsize;
+    int newpos;
+    Py_UNICODE *uni2;
+    /* startpos for collecting unencodable chars */
+    int collstartpos = *inpos;
+    int collendpos = *inpos+1;
+    int collpos;
+    char *encoding = "charmap";
+    char *reason = "character maps to <undefined>";
+
+    PyObject *x;
+    /* find all unencodable characters */
+    while (collendpos < size) {
+	x = charmapencode_lookup(p[collendpos], mapping);
+	if (x==NULL)
+	    return -1;
+	else if (x!=Py_None) {
+	    Py_DECREF(x);
+	    break;
+	}
+	Py_DECREF(x);
+	++collendpos;
+    }
+    /* cache callback name lookup
+     * (if not done yet, i.e. it's the first error) */
+    if (*known_errorHandler==-1) {
+	if ((errors==NULL) || (!strcmp(errors, "strict")))
+	    *known_errorHandler = 1;
+	else if (!strcmp(errors, "replace"))
+	    *known_errorHandler = 2;
+	else if (!strcmp(errors, "ignore"))
+	    *known_errorHandler = 3;
+	else if (!strcmp(errors, "xmlcharrefreplace"))
+	    *known_errorHandler = 4;
+	else
+	    *known_errorHandler = 0;
+    }
+    switch (*known_errorHandler) {
+	case 1: /* strict */
+	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+	    return -1;
+	case 2: /* replace */
+	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
+		x = charmapencode_output('?', mapping, res, respos);
+		if (x==NULL) {
+		    return -1;
+		}
+		else if (x==Py_None) {
+		    Py_DECREF(x);
+		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+		    return -1;
+		}
+		Py_DECREF(x);
+	    }
+	    /* fall through */
+	case 3: /* ignore */
+	    *inpos = collendpos;
+	    break;
+	case 4: /* xmlcharrefreplace */
+	    /* generate replacement: "&#N;" for each collected character */
+	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
+		char buffer[2+29+1+1];
+		char *cp;
+		sprintf(buffer, "&#%d;", (int)p[collpos]);
+		for (cp = buffer; *cp; ++cp) {
+		    x = charmapencode_output(*cp, mapping, res, respos);
+		    if (x==NULL)
+			return -1;
+		    else if (x==Py_None) {
+			Py_DECREF(x);
+			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+			return -1;
+		    }
+		    Py_DECREF(x);
+		}
+	    }
+	    *inpos = collendpos;
+	    break;
+	default:
+	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
+		encoding, reason, p, size, exceptionObject,
+		collstartpos, collendpos, &newpos);
+	    if (repunicode == NULL)
+		return -1;
+	    /* generate replacement  */
+	    repsize = PyUnicode_GET_SIZE(repunicode);
+	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
+		x = charmapencode_output(*uni2, mapping, res, respos);
+		if (x==NULL) {
+		    Py_DECREF(repunicode);
+		    return -1;
+		}
+		else if (x==Py_None) {
+		    Py_DECREF(repunicode);
+		    Py_DECREF(x);
+		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+		    return -1;
+		}
+		Py_DECREF(x);
+	    }
+	    *inpos = newpos;
+	    Py_DECREF(repunicode);
     }
+    return 0;
 }
 
 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
@@ -2520,101 +2939,62 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
 				  PyObject *mapping,
 				  const char *errors)
 {
-    PyObject *v;
-    char *s;
-    int extrachars = 0;
+    /* output object */
+    PyObject *res = NULL;
+    /* current input position */
+    int inpos = 0;
+    /* current output position */
+    int respos = 0;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    /* the following variable is used for caching string comparisons
+     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
+     * 3=ignore, 4=xmlcharrefreplace */
+    int known_errorHandler = -1;
 
     /* Default to Latin-1 */
     if (mapping == NULL)
 	return PyUnicode_EncodeLatin1(p, size, errors);
 
-    v = PyString_FromStringAndSize(NULL, size);
-    if (v == NULL)
-        return NULL;
+    /* allocate enough for a simple encoding without
+       replacements, if we need more, we'll resize */
+    res = PyString_FromStringAndSize(NULL, size);
+    if (res == NULL)
+        goto onError;
     if (size == 0)
-	return v;
-    s = PyString_AS_STRING(v);
-    while (size-- > 0) {
-	Py_UNICODE ch = *p++;
-	PyObject *w, *x;
+	return res;
 
-	/* Get mapping (Unicode ordinal -> string char, integer or None) */
-	w = PyInt_FromLong((long)ch);
-	if (w == NULL)
+    while (inpos<size) {
+	/* try to encode it */
+	PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
+	int undefined = (x==Py_None); /* x is only needed for this test */
+	if (x==NULL) /* error */
 	    goto onError;
-	x = PyObject_GetItem(mapping, w);
-	Py_DECREF(w);
-	if (x == NULL) {
-	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
-		/* No mapping found means: mapping is undefined. */
-		PyErr_Clear();
-		x = Py_None;
-		Py_INCREF(x);
-	    } else
+	Py_DECREF(x); /* release before the error path can jump away */
+	if (undefined) { /* unencodable character */
+	    if (charmap_encoding_error(p, size, &inpos, mapping, &exc,
+		&known_errorHandler, &errorHandler, errors,
+		&res, &respos))
 		goto onError;
 	}
+	else
+	    /* done with this character => adjust input position */
+	    ++inpos;
+    }
 
-	/* Apply mapping */
-	if (PyInt_Check(x)) {
-	    long value = PyInt_AS_LONG(x);
-	    if (value < 0 || value > 255) {
-		PyErr_SetString(PyExc_TypeError,
-				"character mapping must be in range(256)");
-		Py_DECREF(x);
-		goto onError;
-	    }
-	    *s++ = (char)value;
-	}
-	else if (x == Py_None) {
-	    /* undefined mapping */
-	    if (charmap_encoding_error(&p, &s, errors, 
-				       "character maps to <undefined>")) {
-		Py_DECREF(x);
-		goto onError;
-	    }
-	}
-	else if (PyString_Check(x)) {
-	    int targetsize = PyString_GET_SIZE(x);
-
-	    if (targetsize == 1)
-		/* 1-1 mapping */
-		*s++ = *PyString_AS_STRING(x);
-
-	    else if (targetsize > 1) {
-		/* 1-n mapping */
-		if (targetsize > extrachars) {
-		    /* resize first */
-		    int oldpos = (int)(s - PyString_AS_STRING(v));
-		    int needed = (targetsize - extrachars) + \
-			         (targetsize << 2);
-		    extrachars += needed;
-		    if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
-			Py_DECREF(x);
-			goto onError;
-		    }
-		    s = PyString_AS_STRING(v) + oldpos;
-		}
-		memcpy(s, PyString_AS_STRING(x), targetsize);
-		s += targetsize;
-		extrachars -= targetsize;
-	    }
-	    /* 1-0 mapping: skip the character */
-	}
-	else {
-	    /* wrong return value */
-	    PyErr_SetString(PyExc_TypeError,
-		  "character mapping must return integer, None or unicode");
-	    Py_DECREF(x);
+    /* Resize if we allocated too much */
+    if (respos<PyString_GET_SIZE(res)) {
+	if (_PyString_Resize(&res, respos))
 	    goto onError;
-	}
-	Py_DECREF(x);
     }
-    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
-	_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
-    return v;
+    Py_XDECREF(exc);
+    Py_XDECREF(errorHandler);
+    return res;
 
- onError:
-    Py_XDECREF(v);
+    onError:
+    Py_XDECREF(res);
+    Py_XDECREF(exc);
+    Py_XDECREF(errorHandler);
     return NULL;
 }
 
 
@@ -2631,115 +3011,344 @@ PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
 				   NULL);
 }
 
+/* create or adjust a UnicodeTranslateError */
+static void make_translate_exception(PyObject **exceptionObject,
+    const Py_UNICODE *unicode, int size,
+    int startpos, int endpos,
+    const char *reason)
+{
+    if (*exceptionObject == NULL) {
+    	*exceptionObject = PyUnicodeTranslateError_Create(
+	    unicode, size, startpos, endpos, reason);
+    }
+    else {
+	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
+	    goto onError;
+	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
+	    goto onError;
+	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
+	    goto onError;
+	return;
+	onError:
+	Py_DECREF(*exceptionObject);
+	*exceptionObject = NULL;
+    }
+}
+
+/* raises a UnicodeTranslateError */
+static void raise_translate_exception(PyObject **exceptionObject,
+    const Py_UNICODE *unicode, int size,
+    int startpos, int endpos,
+    const char *reason)
+{
+    make_translate_exception(exceptionObject,
+	unicode, size, startpos, endpos, reason);
+    if (*exceptionObject != NULL)
+	PyCodec_StrictErrors(*exceptionObject);
+}
+
+/* error handling callback helper: look up the callback (cached in
+   *errorHandler), create or update *exceptionObject, call the callback and
+   check its result. Stores the resume position, clamped to 0..size, in *newpos
+   and returns a new reference to the replacement, or NULL on error */
+static PyObject *unicode_translate_call_errorhandler(const char *errors,
+    PyObject **errorHandler,
+    const char *reason,
+    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
+    int startpos, int endpos,
+    int *newpos)
+{
+    static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
+
+    PyObject *restuple;
+    PyObject *resunicode;
+
+    if (*errorHandler == NULL) {
+	*errorHandler = PyCodec_LookupError(errors);
+        if (*errorHandler == NULL)
+	    return NULL;
+    }
+
+    make_translate_exception(exceptionObject,
+	unicode, size, startpos, endpos, reason);
+    if (*exceptionObject == NULL)
+	return NULL;
+
+    restuple = PyObject_CallFunctionObjArgs(
+	*errorHandler, *exceptionObject, NULL);
+    if (restuple == NULL)
+	return NULL;
+    if (!PyTuple_Check(restuple)) {
+	PyErr_SetString(PyExc_TypeError, &argparse[4]); /* skip the "O!i;" prefix */
+	Py_DECREF(restuple);
+	return NULL;
+    }
+    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+	&resunicode, newpos)) {
+	Py_DECREF(restuple);
+	return NULL;
+    }
+    if (*newpos<0)
+	*newpos = 0;
+    else if (*newpos>size)
+	*newpos = size;
+    Py_INCREF(resunicode);
+    Py_DECREF(restuple);
+    return resunicode;
+}
+
+/* Look up the character c in the mapping and store the result in *result
+   (a new reference the caller must decref, or NULL if c has no mapping).
+   Return 0 on success, -1 on error */
 static
-int translate_error(const Py_UNICODE **source,
-		    Py_UNICODE **dest,
-		    const char *errors,
-		    const char *details) 
-{
-    if ((errors == NULL) ||
-	(strcmp(errors,"strict") == 0)) {
-	PyErr_Format(PyExc_UnicodeError,
-		     "translate error: %.400s",
-		     details);
-	return -1;
+int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
+{
+    PyObject *w = PyInt_FromLong((long)c);
+    PyObject *x;
+
+    if (w == NULL)
+	 return -1;
+    x = PyObject_GetItem(mapping, w);
+    Py_DECREF(w);
+    if (x == NULL) {
+	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
+	    /* No mapping found means: use 1:1 mapping. */
+	    PyErr_Clear();
+	    *result = NULL;
+	    return 0;
+	} else
+	    return -1;
     }
-    else if (strcmp(errors,"ignore") == 0) {
+    else if (x == Py_None) {
+	*result = x;
 	return 0;
     }
-    else if (strcmp(errors,"replace") == 0) {
-	**dest = '?';
-	(*dest)++;
+    else if (PyInt_Check(x)) {
+	long value = PyInt_AS_LONG(x);
+	long max = PyUnicode_GetMax();
+	if (value < 0 || value > max) {
+	    PyErr_Format(PyExc_TypeError,
+			     "character mapping must be in range(0x%lx)", max+1);
+	    Py_DECREF(x);
+	    return -1;
+	}
+	*result = x;
+	return 0;
+    }
+    else if (PyUnicode_Check(x)) {
+	*result = x;
 	return 0;
     }
     else {
-	PyErr_Format(PyExc_ValueError,
-		     "translate error; "
-		     "unknown error handling code: %.400s",
-		     errors);
+	Py_DECREF(x); /* wrong return value: release it before failing */
+	PyErr_SetString(PyExc_TypeError,
+	      "character mapping must return integer, None or unicode");
+	return -1;
+    }
+}
+/* ensure that *outobj is at least requiredsize characters long,
+if not reallocate and adjust various state variables.
+Return 0 on success, -1 on error */
+static
+int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
+    int requiredsize)
+{
+    if (requiredsize > *outsize) {
+	/* remember old output position */
+	int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
+	/* exponentially overallocate to minimize reallocations */
+	if (requiredsize < 2 * *outsize)
+	    requiredsize = 2 * *outsize;
+	if (_PyUnicode_Resize(outobj, requiredsize))
+	    return -1;
+	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
+	*outsize = requiredsize;
+    }
+    return 0;
+}
+/* lookup the character, put the result in the output string and adjust
+   various state variables. inleft is the number of input characters not yet
+   translated, including c. *res receives a new reference to the lookup result
+   (NULL if c was copied 1:1, Py_None if the mapping was undefined, in which
+   case no character was written). The caller must decref *res.
+   Return 0 on success, -1 on error. */
+static
+int charmaptranslate_output(Py_UNICODE c, int inleft, PyObject *mapping,
+    PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
+{
+    if (charmaptranslate_lookup(c, mapping, res))
 	return -1;
+    if (*res==NULL) {
+	/* not found => default to 1:1 mapping */
+	*(*outp)++ = (Py_UNICODE)c;
+    }
+    else if (*res==Py_None)
+	;
+    else if (PyInt_Check(*res)) {
+	/* no overflow check, because we know that the space is enough */
+	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
+    }
+    else if (PyUnicode_Check(*res)) {
+	int repsize = PyUnicode_GET_SIZE(*res);
+	if (repsize==1) {
+	    /* no overflow check, because we know that the space is enough */
+	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
+	}
+	else if (repsize!=0) {
+	    /* more than one character: room for it plus the rest of the input */
+	    int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + repsize + inleft - 1;
+	    if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
+		return -1;
+	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
+	    *outp += repsize;
+	}
     }
+    else
+	return -1;
+    return 0;
 }
 
+/* Translate the size characters at p using mapping; a character missing from
+   the mapping is copied unchanged, one mapped to None goes to the errors handler */
-PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
+PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
 				     int size,
 				     PyObject *mapping,
 				     const char *errors)
 {
-    PyUnicodeObject *v;
-    Py_UNICODE *p;
-    
+    /* output object */
+    PyObject *res = NULL;
+    /* pointers to the beginning and end+1 of input */
+    const Py_UNICODE *startp = p;
+    const Py_UNICODE *endp = p + size;
+    /* pointer into the output */
+    Py_UNICODE *str;
+    /* current output position */
+    int respos = 0;
+    int ressize;
+    char *reason = "character maps to <undefined>";
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    /* the following variable is used for caching string comparisons
+     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
+    int known_errorHandler = -1;
+
     if (mapping == NULL) {
 	PyErr_BadArgument();
 	return NULL;
     }
-    
-    /* Output will never be longer than input */
-    v = _PyUnicode_New(size);
-    if (v == NULL)
-	goto onError;
-    if (size == 0)
-	goto done;
-    p = PyUnicode_AS_UNICODE(v);
-    while (size-- > 0) {
-	Py_UNICODE ch = *s++;
-	PyObject *w, *x;
 
-	/* Get mapping */
-	w = PyInt_FromLong(ch);
-	if (w == NULL)
-	    goto onError;
-	x = PyObject_GetItem(mapping, w);
-	Py_DECREF(w);
-	if (x == NULL) {
-	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
-		/* No mapping found: default to 1-1 mapping */
-		PyErr_Clear();
-		*p++ = ch;
-		continue;
-	    }
+    /* allocate enough for a simple 1:1 translation without
+       replacements, if we need more, we'll resize */
+    res = PyUnicode_FromUnicode(NULL, size);
+    if (res == NULL)
+        goto onError;
+    if (size == 0)
+	return res;
+    str = PyUnicode_AS_UNICODE(res);
+    ressize = size;
+
+    while (p<endp) {
+	/* try to encode it */
+	PyObject *x = NULL;
+	if (charmaptranslate_output(*p, (int)(endp-p), mapping, &res, &ressize, &str, &x)) {
+	    Py_XDECREF(x);
 	    goto onError;
 	}
-
-	/* Apply mapping */
-	if (PyInt_Check(x))
-	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
-	else if (x == Py_None) {
-	    /* undefined mapping */
-	    if (translate_error(&s, &p, errors, 
-				"character maps to <undefined>")) {
-		Py_DECREF(x);
-		goto onError;
+	Py_XDECREF(x); /* from here on only the identity of x is used */
+	if (x!=Py_None) /* it worked => adjust input pointer */
+	    ++p;
+	else { /* untranslatable character */
+	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+	    int repsize;
+	    int newpos;
+	    Py_UNICODE *uni2;
+	    /* startpos for collecting untranslatable chars */
+	    const Py_UNICODE *collstart = p;
+	    const Py_UNICODE *collend = p+1;
+	    const Py_UNICODE *coll;
+	    /* find all untranslatable characters */
+	    while (collend < endp) {
+	    	if (charmaptranslate_lookup(*collend, mapping, &x))
+		    goto onError;
+		Py_XDECREF(x);
+		if (x!=Py_None)
+		    break;
+		++collend;
 	    }
-	}
-	else if (PyUnicode_Check(x)) {
-	    if (PyUnicode_GET_SIZE(x) != 1) {
-		/* 1-n mapping */
-		PyErr_SetString(PyExc_NotImplementedError,
-				"1-n mappings are currently not implemented");
-		Py_DECREF(x);
-		goto onError;
+	    /* cache callback name lookup
+	     * (if not done yet, i.e. it's the first error) */
+	    if (known_errorHandler==-1) {
+		if ((errors==NULL) || (!strcmp(errors, "strict")))
+		    known_errorHandler = 1;
+		else if (!strcmp(errors, "replace"))
+		    known_errorHandler = 2;
+		else if (!strcmp(errors, "ignore"))
+		    known_errorHandler = 3;
+		else if (!strcmp(errors, "xmlcharrefreplace"))
+		    known_errorHandler = 4;
+		else
+		    known_errorHandler = 0;
+	    }
+	    switch (known_errorHandler) {
+		case 1: /* strict */
+		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
+		    goto onError;
+		case 2: /* replace */
+		    /* No need to check for space, this is a 1:1 replacement */
+		    for (coll = collstart; coll<collend; ++coll)
+			*str++ = '?';
+		    /* fall through */
+		case 3: /* ignore */
+		    p = collend;
+		    break;
+		case 4: /* xmlcharrefreplace */
+		    /* generate replacement (temporarily (mis)uses p) */
+		    for (p = collstart; p < collend; ++p) {
+			char buffer[2+29+1+1];
+			char *cp;
+			sprintf(buffer, "&#%d;", (int)*p);
+			if (charmaptranslate_makespace(&res, &str, &ressize,
+			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
+			    goto onError;
+			for (cp = buffer; *cp; ++cp)
+			    *str++ = *cp;
+		    }
+		    p = collend;
+		    break;
+		default:
+		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
+			reason, startp, size, &exc,
+			collstart-startp, collend-startp, &newpos);
+		    if (repunicode == NULL)
+			goto onError;
+		    /* generate replacement  */
+		    repsize = PyUnicode_GET_SIZE(repunicode);
+		    if (charmaptranslate_makespace(&res, &str, &ressize,
+			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
+			Py_DECREF(repunicode);
+			goto onError;
+		    }
+		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
+			*str++ = *uni2;
+		    p = startp + newpos;
+		    Py_DECREF(repunicode);
 	    }
-	    *p++ = *PyUnicode_AS_UNICODE(x);
-	}
-	else {
-	    /* wrong return value */
-	    PyErr_SetString(PyExc_TypeError,
-		  "translate mapping must return integer, None or unicode");
-	    Py_DECREF(x);
-	    goto onError;
 	}
-	Py_DECREF(x);
     }
-    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
-	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
+    /* Resize if we allocated too much */
+    respos = str-PyUnicode_AS_UNICODE(res);
+    if (respos<ressize) {
+	if (_PyUnicode_Resize(&res, respos))
 	    goto onError;
+    }
+    Py_XDECREF(exc);
+    Py_XDECREF(errorHandler);
+    return res;
 
- done:
-    return (PyObject *)v;
-    
- onError:
-    Py_XDECREF(v);
+    onError:
+    Py_XDECREF(res);
+    Py_XDECREF(exc);
+    Py_XDECREF(errorHandler);
     return NULL;
 }
 
 
@@ -2772,6 +3381,13 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
 			    const char *errors)
 {
     Py_UNICODE *p, *end;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    const char *encoding = "decimal";
+    const char *reason = "invalid decimal Unicode string";
+    /* the following variable is used for caching string comparisons
+     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
+    int known_errorHandler = -1;
 
     if (output == NULL) {
 	PyErr_BadArgument();
@@ -2781,40 +3397,110 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
     p = s;
     end = s + length;
     while (p < end) {
-	register Py_UNICODE ch = *p++;
+	register Py_UNICODE ch = *p;
 	int decimal;
+	PyObject *repunicode;
+	int repsize;
+	int newpos;
+	Py_UNICODE *uni2;
+	Py_UNICODE *collstart;
+	Py_UNICODE *collend;
 	
 	if (Py_UNICODE_ISSPACE(ch)) {
 	    *output++ = ' ';
+	    ++p;
 	    continue;
 	}
 	decimal = Py_UNICODE_TODECIMAL(ch);
 	if (decimal >= 0) {
 	    *output++ = '0' + decimal;
+	    ++p;
 	    continue;
 	}
 	if (0 < ch && ch < 256) {
 	    *output++ = (char)ch;
+	    ++p;
 	    continue;
 	}
-	/* All other characters are considered invalid */
-	if (errors == NULL || strcmp(errors, "strict") == 0) {
-	    PyErr_SetString(PyExc_ValueError,
-			    "invalid decimal Unicode string");
-	    goto onError;
+	/* All other characters are considered unencodable; extend collend over the whole run of them */
+	collstart = p;
+	collend = p+1;
+	for (; collend < end; ++collend) {
+	    if ((0 < *collend && *collend < 256) ||
+	        Py_UNICODE_ISSPACE(*collend) ||
+	        Py_UNICODE_TODECIMAL(*collend) >= 0)
+		break;
 	}
-	else if (strcmp(errors, "ignore") == 0)
-	    continue;
-	else if (strcmp(errors, "replace") == 0) {
-	    *output++ = '?';
-	    continue;
+	/* cache callback name lookup
+	 * (if not done yet, i.e. it's the first error) */
+	if (known_errorHandler==-1) {
+	    if ((errors==NULL) || (!strcmp(errors, "strict")))
+		known_errorHandler = 1;
+	    else if (!strcmp(errors, "replace"))
+		known_errorHandler = 2;
+	    else if (!strcmp(errors, "ignore"))
+		known_errorHandler = 3;
+	    else if (!strcmp(errors, "xmlcharrefreplace"))
+		known_errorHandler = 4;
+	    else
+		known_errorHandler = 0;
+	}
+	switch (known_errorHandler) {
+	    case 1: /* strict */
+		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
+		goto onError;
+	    case 2: /* replace */
+		for (p = collstart; p < collend; ++p)
+		    *output++ = '?';
+		/* fall through */
+	    case 3: /* ignore */
+		p = collend;
+		break;
+	    case 4: /* xmlcharrefreplace */
+		/* generate replacement (temporarily (mis)uses p).  NOTE(review): "&#%d;" emits several bytes per character; confirm output is large enough */
+		for (p = collstart; p < collend; ++p)
+		    output += sprintf(output, "&#%d;", (int)*p);
+		p = collend;
+		break;
+	    default:
+		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
+		    encoding, reason, s, length, &exc,
+		    collstart-s, collend-s, &newpos);
+		if (repunicode == NULL)
+		    goto onError;
+		/* generate replacement  */
+		repsize = PyUnicode_GET_SIZE(repunicode);
+		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
+		    Py_UNICODE ch = *uni2;
+		    if (Py_UNICODE_ISSPACE(ch))
+			*output++ = ' ';
+		    else {
+			decimal = Py_UNICODE_TODECIMAL(ch);
+			if (decimal >= 0)
+			    *output++ = '0' + decimal;
+			else if (0 < ch && ch < 256)
+			    *output++ = (char)ch;
+			else {
+			    Py_DECREF(repunicode);
+			    raise_encode_exception(&exc, encoding,
+				s, length, collstart-s, collend-s, reason);
+			    goto onError;
+			}
+		    }
+		}
+		p = s + newpos;
+		Py_DECREF(repunicode);
 	}
     }
     /* 0-terminate the output string */
     *output++ = '\0';
+    Py_XDECREF(exc);
+    Py_XDECREF(errorHandler);
     return 0;
 
  onError:
+    Py_XDECREF(exc);
+    Py_XDECREF(errorHandler);
     return -1;
 }
 
@@ -3927,7 +4613,9 @@ PyDoc_STRVAR(encode__doc__,
 Return an encoded string version of S. Default encoding is the current\n\
 default string encoding. errors may be given to set a different error\n\
 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
-a ValueError. Other possible values are 'ignore' and 'replace'.");
+a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
+'xmlcharrefreplace' as well as any other name registered with\n\
+codecs.register_error that can handle UnicodeEncodeErrors.");
 
 static PyObject *
 unicode_encode(PyUnicodeObject *self, PyObject *args)
author	Walter Dörwald <walter@livinglogic.de>	2002-09-02 13:14:32 (GMT)
committer	Walter Dörwald <walter@livinglogic.de>	2002-09-02 13:14:32 (GMT)
commit	3aeb632c3152fa082132ce55b9a880e0d16b04ae (patch)
tree	192bc1543ea77a826d0c940d024dbc8ebba82156 /Objects/unicodeobject.c
parent	94fab762de532de551987e1f48a125145f85304b (diff)
download	cpython-3aeb632c3152fa082132ce55b9a880e0d16b04ae.zip cpython-3aeb632c3152fa082132ce55b9a880e0d16b04ae.tar.gz cpython-3aeb632c3152fa082132ce55b9a880e0d16b04ae.tar.bz2