Issue #15866: The xmlcharrefreplace error handler no more produces two XML

entities for a non-BMP character on narrow build.
author: Serhiy Storchaka <storchaka@gmail.com> 2013-08-06 13:56:26 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2013-08-06 13:56:26 (GMT)
commit: e822b034e766e03cd8fbe7ab52fbc2d46fff6d33 (patch)
tree: a2e79671034ae47c4a0d6d0d030aecda007c8123 /Objects
parent: 5ad3514822a80c094f1cfe47ae59450001043482 (diff)
download: cpython-e822b034e766e03cd8fbe7ab52fbc2d46fff6d33.zip
cpython-e822b034e766e03cd8fbe7ab52fbc2d46fff6d33.tar.gz
cpython-e822b034e766e03cd8fbe7ab52fbc2d46fff6d33.tar.bz2
1 files changed, 61 insertions, 21 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 64a5ef5..866eb9b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -547,6 +547,37 @@ PyObject *PyUnicode_FromString(const char *u)
     return PyUnicode_FromStringAndSize(u, size);
 }
 
+/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
+ * by 'ptr', possibly combining surrogate pairs on narrow builds.
+ * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
+ * that should be returned and 'end' pointing to the end of the buffer.
+ * ('end' is used on narrow builds to detect a lone surrogate at the
+ * end of the buffer that should be returned unchanged.)
+ * The ptr and end arguments should be side-effect free and ptr must an lvalue.
+ * The type of the returned char is always Py_UCS4.
+ *
+ * Note: the macro advances ptr to next char, so it might have side-effects
+ *       (especially if used with other macros).
+ */
+
+/* helper macros used by _Py_UNICODE_NEXT */
+#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
+#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
+/* Join two surrogate characters and return a single Py_UCS4 value. */
+#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
+    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
+      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
+
+#ifdef Py_UNICODE_WIDE
+#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
+#else
+#define _Py_UNICODE_NEXT(ptr, end)                                      \
+     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
+        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
+       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
+       (Py_UCS4)*(ptr)++)
+#endif
+
 #ifdef HAVE_WCHAR_H
 
 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
@@ -3642,26 +3673,22 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
             case 4: /* xmlcharrefreplace */
                 respos = str-PyString_AS_STRING(res);
                 /* determine replacement size (temporarily (mis)uses p) */
-                for (p = collstart, repsize = 0; p < collend; ++p) {
-                    if (*p<10)
+                for (p = collstart, repsize = 0; p < collend;) {
+                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    if (ch < 10)
                         repsize += 2+1+1;
-                    else if (*p<100)
+                    else if (ch < 100)
                         repsize += 2+2+1;
-                    else if (*p<1000)
+                    else if (ch < 1000)
                         repsize += 2+3+1;
-                    else if (*p<10000)
+                    else if (ch < 10000)
                         repsize += 2+4+1;
-#ifndef Py_UNICODE_WIDE
-                    else
+                    else if (ch < 100000)
                         repsize += 2+5+1;
-#else
-                    else if (*p<100000)
-                        repsize += 2+5+1;
-                    else if (*p<1000000)
+                    else if (ch < 1000000)
                         repsize += 2+6+1;
                     else
                         repsize += 2+7+1;
-#endif
                 }
                 requiredsize = respos+repsize+(endp-collend);
                 if (requiredsize > ressize) {
@@ -3673,8 +3700,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                     ressize = requiredsize;
                 }
                 /* generate replacement (temporarily (mis)uses p) */
-                for (p = collstart; p < collend; ++p) {
-                    str += sprintf(str, "&#%d;", (int)*p);
+                for (p = collstart; p < collend;) {
+                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    str += sprintf(str, "&#%d;", (int)ch);
                 }
                 p = collend;
                 break;
@@ -4649,11 +4677,20 @@ int charmap_encoding_error(
         *inpos = collendpos;
         break;
     case 4: /* xmlcharrefreplace */
-        /* generate replacement (temporarily (mis)uses p) */
-        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
+        /* generate replacement */
+        for (collpos = collstartpos; collpos < collendpos;) {
             char buffer[2+29+1+1];
             char *cp;
-            sprintf(buffer, "&#%d;", (int)p[collpos]);
+            Py_UCS4 ch = p[collpos++];
+#ifndef Py_UNICODE_WIDE
+            if ((0xD800 <= ch && ch <= 0xDBFF) &&
+                (collpos < collendpos) &&
+                (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
+                ch = ((((ch & 0x03FF) << 10) |
+                       ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
+            }
+#endif
+            sprintf(buffer, "&#%d;", (int)ch);
             for (cp = buffer; *cp; ++cp) {
                 x = charmapencode_output(*cp, mapping, res, respos);
                 if (x==enc_EXCEPTION)
@@ -5068,10 +5105,11 @@ PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                 break;
             case 4: /* xmlcharrefreplace */
                 /* generate replacement (temporarily (mis)uses p) */
-                for (p = collstart; p < collend; ++p) {
+                for (p = collstart; p < collend;) {
                     char buffer[2+29+1+1];
                     char *cp;
-                    sprintf(buffer, "&#%d;", (int)*p);
+                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    sprintf(buffer, "&#%d;", (int)ch);
                     if (charmaptranslate_makespace(&res, &str,
                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                         goto onError;
@@ -5222,8 +5260,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
             break;
         case 4: /* xmlcharrefreplace */
             /* generate replacement (temporarily (mis)uses p) */
-            for (p = collstart; p < collend; ++p)
-                output += sprintf(output, "&#%d;", (int)*p);
+            for (p = collstart; p < collend;) {
+                Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                output += sprintf(output, "&#%d;", ch);
+            }
             p = collend;
             break;
         default:
author	Serhiy Storchaka <storchaka@gmail.com>	2013-08-06 13:56:26 (GMT)
committer	Serhiy Storchaka <storchaka@gmail.com>	2013-08-06 13:56:26 (GMT)
commit	e822b034e766e03cd8fbe7ab52fbc2d46fff6d33 (patch)
tree	a2e79671034ae47c4a0d6d0d030aecda007c8123 /Objects
parent	5ad3514822a80c094f1cfe47ae59450001043482 (diff)
download	cpython-e822b034e766e03cd8fbe7ab52fbc2d46fff6d33.zip cpython-e822b034e766e03cd8fbe7ab52fbc2d46fff6d33.tar.gz cpython-e822b034e766e03cd8fbe7ab52fbc2d46fff6d33.tar.bz2