diff options
-rw-r--r-- | Lib/test/test_unicode.py | 19 | ||||
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 151 |
3 files changed, 104 insertions, 69 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 86887a5..c4e54e7 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1455,9 +1455,28 @@ class UnicodeTest(string_tests.CommonTest, 'string, got a non-ASCII byte: 0xe9$', PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') + # test "%c" self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd') self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff') + # test "%" + self.assertEqual(PyUnicode_FromFormat(b'%'), '%') + self.assertEqual(PyUnicode_FromFormat(b'%%'), '%') + self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s') + self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]') + self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc') + + # test "%i" + self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010') + self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010') + + # not supported: copy the raw format string. these tests are just here + # to check for crashes and should not be considered as specifications + self.assertEqual(PyUnicode_FromFormat(b'%1%s', b'abc'), '%s') + self.assertEqual(PyUnicode_FromFormat(b'%1abc'), '%1abc') + self.assertEqual(PyUnicode_FromFormat(b'%+i', c_int(10)), '%+i') + self.assertEqual(PyUnicode_FromFormat(b'%.%s', b'abc'), '%.%s') + # other tests text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'") @@ -10,6 +10,9 @@ What's New in Python 3.3 Alpha 1? Core and Builtins ----------------- +- Issue #10829: Refactor PyUnicode_FromFormat(), use the same function to parse + the format string in the 3 steps, fix crashes on invalid format strings. + - Issue #11246: Fix PyUnicode_FromFormat("%V") to decode the byte string from UTF-8 (with replace error handler) instead of ISO-8859-1 (in strict mode). Patch written by Ray Allen. 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7fec6e5..4f1177e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -714,6 +714,70 @@ makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, *fmt = '\0'; } +/* helper for PyUnicode_FromFormatV() */ + +static const char* +parse_format_flags(const char *f, + int *p_width, int *p_precision, + int *p_longflag, int *p_longlongflag, int *p_size_tflag) +{ + int width, precision, longflag, longlongflag, size_tflag; + + /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ + f++; + width = 0; + while (Py_ISDIGIT((unsigned)*f)) + width = (width*10) + *f++ - '0'; + precision = 0; + if (*f == '.') { + f++; + while (Py_ISDIGIT((unsigned)*f)) + precision = (precision*10) + *f++ - '0'; + if (*f == '%') { + /* "%.3%s" => f points to "3" */ + f--; + } + } + if (*f == '\0') { + /* bogus format "%.1" => go backward, f points to "1" */ + f--; + } + if (p_width != NULL) + *p_width = width; + if (p_precision != NULL) + *p_precision = precision; + + /* Handle %ld, %lu, %lld and %llu. */ + longflag = 0; + longlongflag = 0; + + if (*f == 'l') { + if (f[1] == 'd' || f[1] == 'u') { + longflag = 1; + ++f; + } +#ifdef HAVE_LONG_LONG + else if (f[1] == 'l' && + (f[2] == 'd' || f[2] == 'u')) { + longlongflag = 1; + f += 2; + } +#endif + } + /* handle the size_t flag. 
*/ + else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { + size_tflag = 1; + ++f; + } + if (p_longflag != NULL) + *p_longflag = longflag; + if (p_longlongflag != NULL) + *p_longlongflag = longlongflag; + if (p_size_tflag != NULL) + *p_size_tflag = size_tflag; + return f; +} + #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} /* size of fixed-size buffer for formatting single arguments */ @@ -757,15 +821,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) * result in an array) */ for (f = format; *f; f++) { if (*f == '%') { - if (*(f+1)=='%') - continue; - if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V') - ++callcount; - while (Py_ISDIGIT((unsigned)*f)) - width = (width*10) + *f++ - '0'; - while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) - ; - if (*f == 's') + /* skip width or width.precision (eg. "1.2" of "%1.2f") */ + f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); + if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') ++callcount; } else if (128 <= (unsigned char)*f) { @@ -790,33 +848,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) for (f = format; *f; f++) { if (*f == '%') { #ifdef HAVE_LONG_LONG - int longlongflag = 0; + int longlongflag; #endif - const char* p = f; - width = 0; - while (Py_ISDIGIT((unsigned)*f)) - width = (width*10) + *f++ - '0'; - while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) - ; - - /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since - * they don't affect the amount of space we reserve. 
- */ - if (*f == 'l') { - if (f[1] == 'd' || f[1] == 'u') { - ++f; - } -#ifdef HAVE_LONG_LONG - else if (f[1] == 'l' && - (f[2] == 'd' || f[2] == 'u')) { - longlongflag = 1; - f += 2; - } -#endif - } - else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { - ++f; - } + const char* p; + + p = f; + f = parse_format_flags(f, &width, NULL, + NULL, &longlongflag, NULL); switch (*f) { case 'c': @@ -981,40 +1019,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) for (f = format; *f; f++) { if (*f == '%') { - const char* p = f++; - int longflag = 0; - int longlongflag = 0; - int size_tflag = 0; - zeropad = (*f == '0'); - /* parse the width.precision part */ - width = 0; - while (Py_ISDIGIT((unsigned)*f)) - width = (width*10) + *f++ - '0'; - precision = 0; - if (*f == '.') { - f++; - while (Py_ISDIGIT((unsigned)*f)) - precision = (precision*10) + *f++ - '0'; - } - /* Handle %ld, %lu, %lld and %llu. */ - if (*f == 'l') { - if (f[1] == 'd' || f[1] == 'u') { - longflag = 1; - ++f; - } -#ifdef HAVE_LONG_LONG - else if (f[1] == 'l' && - (f[2] == 'd' || f[2] == 'u')) { - longlongflag = 1; - f += 2; - } -#endif - } - /* handle the size_t flag. */ - if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { - size_tflag = 1; - ++f; - } + const char* p; + int longflag; + int longlongflag; + int size_tflag; + + p = f; + zeropad = (f[1] == '0'); + f = parse_format_flags(f, &width, &precision, + &longflag, &longlongflag, &size_tflag); switch (*f) { case 'c': |