diff options
author | Walter Dörwald <walter@livinglogic.de> | 2009-05-03 22:55:55 (GMT) |
---|---|---|
committer | Walter Dörwald <walter@livinglogic.de> | 2009-05-03 22:55:55 (GMT) |
commit | c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec (patch) | |
tree | 821d3c8846e4ce61eccbb43d40295321a0ca11dc | |
parent | 129ab1d8090f54b308b1d4ddaf4fd241ba312a95 (diff) | |
download | cpython-c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec.zip cpython-c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec.tar.gz cpython-c1651a0b968390ef6b722d3c2e1ca72c5a7c9cec.tar.bz2 |
Merged revisions 72260 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r72260 | walter.doerwald | 2009-05-04 00:36:33 +0200 (Mo, 04 Mai 2009) | 5 lines
Issue #5108: Handle %s like %S and %R in PyUnicode_FromFormatV(): Call
PyUnicode_DecodeUTF8() once, remember the result and output it in a second
step. This avoids problems caused by counting UTF-8 bytes by hand, which
ignores the effect of the replace error handler in PyUnicode_DecodeUTF8().
........
-rw-r--r-- | Misc/NEWS | 5 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 82 |
2 files changed, 38 insertions, 49 deletions
@@ -499,6 +499,11 @@ Core and Builtins - The re.sub(), re.subn() and re.split() functions now accept a flags parameter. +- Issue #5108: Handle %s like %S, %R and %A in PyUnicode_FromFormatV(): Call + PyUnicode_DecodeUTF8() once, remember the result and output it in a second + step. This avoids problems with counting UTF-8 bytes that ignores the effect + of using the replace error handler in PyUnicode_DecodeUTF8(). + Library ------- diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index cc70bad..6ad73e0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -723,16 +723,26 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) count = vargs; #endif #endif - /* step 1: count the number of %S/%R/%A format specifications - * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for - * these objects once during step 3 and put the result in - an array) */ + /* step 1: count the number of %S/%R/%A/%s format specifications + * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ + * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the + * result in an array) */ for (f = format; *f; f++) { - if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')) - ++callcount; + if (*f == '%') { + if (*(f+1)=='%') + continue; + if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A') + ++callcount; + while (ISDIGIT((unsigned)*f)) + width = (width*10) + *f++ - '0'; + while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) + ; + if (*f == 's') + ++callcount; + } } /* step 2: allocate memory for the results of - * PyObject_Str()/PyObject_Repr() calls */ + * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ if (callcount) { callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); if (!callresults) { @@ -781,35 +791,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) case 's': { /* UTF-8 */ - unsigned char*s; - s = va_arg(count, unsigned char*); - while (*s) { - if (*s < 128) { - n++; s++; - } else if (*s < 0xc0) 
{ - /* invalid UTF-8 */ - n++; s++; - } else if (*s < 0xc0) { - n++; - s++; if(!*s)break; - s++; - } else if (*s < 0xe0) { - n++; - s++; if(!*s)break; - s++; if(!*s)break; - s++; - } else { -#ifdef Py_UNICODE_WIDE - n++; -#else - n+=2; -#endif - s++; if(!*s)break; - s++; if(!*s)break; - s++; if(!*s)break; - s++; - } - } + unsigned char *s = va_arg(count, unsigned char*); + PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); + if (!str) + goto fail; + n += PyUnicode_GET_SIZE(str); + /* Remember the str and switch to the next slot */ + *callresult++ = str; break; } case 'U': @@ -978,19 +966,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) break; case 's': { - /* Parameter must be UTF-8 encoded. - In case of encoding errors, use - the replacement character. */ - PyObject *u; - p = va_arg(vargs, char*); - u = PyUnicode_DecodeUTF8(p, strlen(p), - "replace"); - if (!u) - goto fail; - Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), - PyUnicode_GET_SIZE(u)); - s += PyUnicode_GET_SIZE(u); - Py_DECREF(u); + /* unused, since we already have the result */ + (void) va_arg(vargs, char *); + Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), + PyUnicode_GET_SIZE(*callresult)); + s += PyUnicode_GET_SIZE(*callresult); + /* We're done with the unicode()/repr() => forget it */ + Py_DECREF(*callresult); + /* switch to next unicode()/repr() result */ + ++callresult; break; } case 'U': |