Merged revisions 72260 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r72260 | walter.doerwald | 2009-05-04 00:36:33 +0200 (Mo, 04 Mai 2009) | 5 lines

  Issue #5108: Handle %s like %S and %R in PyUnicode_FromFormatV(): Call
  PyUnicode_DecodeUTF8() once, remember the result and output it in a second
  step. This avoids problems with counting UTF-8 bytes that ignores the effect
  of using the replace error handler in PyUnicode_DecodeUTF8().
........
author: Walter Dörwald <walter@livinglogic.de> 2009-05-03 22:55:55 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2009-05-03 22:55:55 (GMT)
commit: c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec (patch)
tree: 821d3c8846e4ce61eccbb43d40295321a0ca11dc
parent: 129ab1d8090f54b308b1d4ddaf4fd241ba312a95 (diff)
download: cpython-c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec.zip
cpython-c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec.tar.gz
cpython-c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec.tar.bz2
2 files changed, 38 insertions, 49 deletions
diff --git a/Misc/NEWS b/Misc/NEWS
index 00270fd..fccafcc 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -499,6 +499,11 @@ Core and Builtins
 
 - The re.sub(), re.subn() and re.split() functions now accept a flags parameter.
 
+- Issue #5108: Handle %s like %S, %R and %A in PyUnicode_FromFormatV(): Call
+  PyUnicode_DecodeUTF8() once, remember the result and output it in a second
+  step. This avoids problems with counting UTF-8 bytes that ignores the effect
+  of using the replace error handler in PyUnicode_DecodeUTF8().
+
 Library
 -------
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index cc70bad..6ad73e0 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -723,16 +723,26 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
     count = vargs;
 #endif
 #endif
-    /* step 1: count the number of %S/%R/%A format specifications
-     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
-     * these objects once during step 3 and put the result in
-     an array) */
+    /* step 1: count the number of %S/%R/%A/%s format specifications
+     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
+     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
+     * result in an array) */
     for (f = format; *f; f++) {
-        if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
-            ++callcount;
+         if (*f == '%') {
+             if (*(f+1)=='%')
+                 continue;
+             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
+                 ++callcount;
+             while (ISDIGIT((unsigned)*f))
+                 width = (width*10) + *f++ - '0';
+             while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
+                 ;
+             if (*f == 's')
+                 ++callcount;
+         }
     }
     /* step 2: allocate memory for the results of
-     * PyObject_Str()/PyObject_Repr() calls */
+     * PyObject_Str()/Repr()/ASCII()/PyUnicode_DecodeUTF8() calls */
     if (callcount) {
         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
         if (!callresults) {
@@ -781,35 +791,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
             case 's':
             {
                 /* UTF-8 */
-                unsigned char*s;
-                s = va_arg(count, unsigned char*);
-                while (*s) {
-                    if (*s < 128) {
-                        n++; s++;
-                    } else if (*s < 0xc0) {
-                        /* invalid UTF-8 */
-                        n++; s++;
-                    } else if (*s < 0xc0) {
-                        n++;
-                        s++; if(!*s)break;
-                        s++;
-                    } else if (*s < 0xe0) {
-                        n++;
-                        s++; if(!*s)break;
-                        s++; if(!*s)break;
-                        s++;
-                    } else {
-#ifdef Py_UNICODE_WIDE
-                        n++;
-#else
-                        n+=2;
-#endif
-                        s++; if(!*s)break;
-                        s++; if(!*s)break;
-                        s++; if(!*s)break;
-                        s++;
-                    }
-                }
+                unsigned char *s = va_arg(count, unsigned char*);
+                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
+                if (!str)
+                    goto fail;
+                n += PyUnicode_GET_SIZE(str);
+                /* Remember the str and switch to the next slot */
+                *callresult++ = str;
                 break;
             }
             case 'U':
@@ -978,19 +966,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
                 break;
             case 's':
             {
-                /* Parameter must be UTF-8 encoded.
-                   In case of encoding errors, use
-                   the replacement character. */
-                PyObject *u;
-                p = va_arg(vargs, char*);
-                u = PyUnicode_DecodeUTF8(p, strlen(p),
-                                         "replace");
-                if (!u)
-                    goto fail;
-                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
-                                PyUnicode_GET_SIZE(u));
-                s += PyUnicode_GET_SIZE(u);
-                Py_DECREF(u);
+                /* unused, since we already have the result */
+                (void) va_arg(vargs, char *);
+                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
+                                PyUnicode_GET_SIZE(*callresult));
+                s += PyUnicode_GET_SIZE(*callresult);
+                /* We're done with the decoded string => forget it */
+                Py_DECREF(*callresult);
+                /* switch to the next stored call result */
+                ++callresult;
                 break;
             }
             case 'U':
author	Walter Dörwald <walter@livinglogic.de>	2009-05-03 22:55:55 (GMT)
committer	Walter Dörwald <walter@livinglogic.de>	2009-05-03 22:55:55 (GMT)
commit	c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec (patch)
tree	821d3c8846e4ce61eccbb43d40295321a0ca11dc
parent	129ab1d8090f54b308b1d4ddaf4fd241ba312a95 (diff)
download	cpython-c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec.zip cpython-c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec.tar.gz cpython-c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec.tar.bz2