diff options
-rw-r--r-- | Doc/c-api/unicode.rst | 11 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 229 | ||||
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 155 |
4 files changed, 298 insertions, 100 deletions
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index abf353f..2f03f69 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -526,12 +526,23 @@ APIs: The `"%lld"` and `"%llu"` format specifiers are only available when :const:`HAVE_LONG_LONG` is defined. + .. note:: + The width formatter unit is number of characters rather than bytes. + The precision formatter unit is number of bytes for ``"%s"`` and + ``"%V"`` (if the ``PyObject*`` argument is NULL), and a number of + characters for ``"%A"``, ``"%U"``, ``"%S"``, ``"%R"`` and ``"%V"`` + (if the ``PyObject*`` argument is not NULL). + .. versionchanged:: 3.2 Support for ``"%lld"`` and ``"%llu"`` added. .. versionchanged:: 3.3 Support for ``"%li"``, ``"%lli"`` and ``"%zi"`` added. + .. versionchanged:: 3.4 + Support width and precision formatter for ``"%s"``, ``"%A"``, ``"%U"``, + ``"%V"``, ``"%S"``, ``"%R"`` added. + .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 32cba06..6a646a0 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -2007,9 +2007,13 @@ class UnicodeTest(string_tests.CommonTest, for arg in args) return _PyUnicode_FromFormat(format, *cargs) + def check_format(expected, format, *args): + text = PyUnicode_FromFormat(format, *args) + self.assertEqual(expected, text) + # ascii format, non-ascii argument - text = PyUnicode_FromFormat(b'ascii\x7f=%U', 'unicode\xe9') - self.assertEqual(text, 'ascii\x7f=unicode\xe9') + check_format('ascii\x7f=unicode\xe9', + b'ascii\x7f=%U', 'unicode\xe9') # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() # raises an error @@ -2019,83 +2023,200 @@ class UnicodeTest(string_tests.CommonTest, PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') # test "%c" - self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd') - self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff') + check_format('\uabcd', + b'%c', c_int(0xabcd)) + check_format('\U0010ffff', + b'%c', c_int(0x10ffff)) # test "%" - self.assertEqual(PyUnicode_FromFormat(b'%'), '%') - self.assertEqual(PyUnicode_FromFormat(b'%%'), '%') - self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s') - self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]') - self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc') + check_format('%', + b'%') + check_format('%', + b'%%') + check_format('%s', + b'%%s') + check_format('[%]', + b'[%%]') + check_format('%abc', + b'%%%s', b'abc') + + # truncated string + check_format('abc', + b'%.3s', b'abcdef') + check_format('abc[\ufffd', + b'%.5s', 'abc[\u20ac]'.encode('utf8')) + check_format("'\\u20acABC'", + b'%A', '\u20acABC') + check_format("'\\u20", + b'%.5A', '\u20acABCDEF') + check_format("'\u20acABC'", + b'%R', '\u20acABC') + check_format("'\u20acA", + b'%.3R', '\u20acABCDEF') + check_format('\u20acAB', + b'%.3S', '\u20acABCDEF') + check_format('\u20acAB', + b'%.3U', '\u20acABCDEF') + check_format('\u20acAB', + b'%.3V', '\u20acABCDEF', None) + check_format('abc[\ufffd', + b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) + + # following tests comes from #7330 + # test width modifier and precision modifier with %S + check_format("repr= abc", + b'repr=%5S', 'abc') + check_format("repr=ab", + b'repr=%.2S', 'abc') + check_format("repr= ab", + b'repr=%5.2S', 'abc') + + # test width modifier and precision modifier with %R + check_format("repr= 'abc'", + b'repr=%8R', 'abc') + check_format("repr='ab", + b'repr=%.3R', 'abc') + check_format("repr= 'ab", + b'repr=%5.3R', 'abc') + + # test width modifier and precision modifier with %A + check_format("repr= 'abc'", + b'repr=%8A', 'abc') + check_format("repr='ab", + b'repr=%.3A', 'abc') + check_format("repr= 'ab", + b'repr=%5.3A', 'abc') + + # test width modifier and precision modifier with %s + check_format("repr= abc", + b'repr=%5s', b'abc') + check_format("repr=ab", + b'repr=%.2s', b'abc') + check_format("repr= ab", + b'repr=%5.2s', b'abc') + + # test width modifier and precision modifier with %U + check_format("repr= abc", + b'repr=%5U', 'abc') + check_format("repr=ab", + b'repr=%.2U', 'abc') + check_format("repr= ab", + b'repr=%5.2U', 'abc') + + # test width modifier and precision modifier with %V + check_format("repr= abc", + b'repr=%5V', 'abc', b'123') + check_format("repr=ab", + b'repr=%.2V', 'abc', b'123') + check_format("repr= ab", + b'repr=%5.2V', 'abc', b'123') + check_format("repr= 123", + b'repr=%5V', None, b'123') + check_format("repr=12", + b'repr=%.2V', None, b'123') + check_format("repr= 12", + b'repr=%5.2V', None, b'123') # test integer formats (%i, %d, %u) - self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010') - self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010') - self.assertEqual(PyUnicode_FromFormat(b'%i', c_int(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%li', c_long(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%lli', c_longlong(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%zi', c_ssize_t(-123)), '-123') - - self.assertEqual(PyUnicode_FromFormat(b'%d', c_int(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%ld', c_long(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%zd', c_ssize_t(-123)), '-123') - - self.assertEqual(PyUnicode_FromFormat(b'%u', c_uint(123)), '123') - self.assertEqual(PyUnicode_FromFormat(b'%lu', c_ulong(123)), '123') - self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123') - self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123') + check_format('010', + b'%03i', c_int(10)) + check_format('0010', + b'%0.4i', c_int(10)) + check_format('-123', + b'%i', c_int(-123)) + check_format('-123', + b'%li', c_long(-123)) + check_format('-123', + b'%lli', c_longlong(-123)) + check_format('-123', + b'%zi', c_ssize_t(-123)) + + check_format('-123', + b'%d', c_int(-123)) + check_format('-123', + b'%ld', c_long(-123)) + check_format('-123', + b'%lld', c_longlong(-123)) + check_format('-123', + b'%zd', c_ssize_t(-123)) + + check_format('123', + b'%u', c_uint(123)) + check_format('123', + b'%lu', c_ulong(123)) + check_format('123', + b'%llu', c_ulonglong(123)) + check_format('123', + b'%zu', c_size_t(123)) # test long output min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1)) max_longlong = -min_longlong - 1 - self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(min_longlong)), str(min_longlong)) - self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(max_longlong)), str(max_longlong)) + check_format(str(min_longlong), + b'%lld', c_longlong(min_longlong)) + check_format(str(max_longlong), + b'%lld', c_longlong(max_longlong)) max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1 - self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(max_ulonglong)), str(max_ulonglong)) + check_format(str(max_ulonglong), + b'%llu', c_ulonglong(max_ulonglong)) PyUnicode_FromFormat(b'%p', c_void_p(-1)) # test padding (width and/or precision) - self.assertEqual(PyUnicode_FromFormat(b'%010i', c_int(123)), '123'.rjust(10, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100i', c_int(123)), '123'.rjust(100)) - self.assertEqual(PyUnicode_FromFormat(b'%.100i', c_int(123)), '123'.rjust(100, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100.80i', c_int(123)), '123'.rjust(80, '0').rjust(100)) - - self.assertEqual(PyUnicode_FromFormat(b'%010u', c_uint(123)), '123'.rjust(10, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100u', c_uint(123)), '123'.rjust(100)) - self.assertEqual(PyUnicode_FromFormat(b'%.100u', c_uint(123)), '123'.rjust(100, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100.80u', c_uint(123)), '123'.rjust(80, '0').rjust(100)) - - self.assertEqual(PyUnicode_FromFormat(b'%010x', c_int(0x123)), '123'.rjust(10, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100x', c_int(0x123)), '123'.rjust(100)) - self.assertEqual(PyUnicode_FromFormat(b'%.100x', c_int(0x123)), '123'.rjust(100, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100.80x', c_int(0x123)), '123'.rjust(80, '0').rjust(100)) + check_format('123'.rjust(10, '0'), + b'%010i', c_int(123)) + check_format('123'.rjust(100), + b'%100i', c_int(123)) + check_format('123'.rjust(100, '0'), + b'%.100i', c_int(123)) + check_format('123'.rjust(80, '0').rjust(100), + b'%100.80i', c_int(123)) + + check_format('123'.rjust(10, '0'), + b'%010u', c_uint(123)) + check_format('123'.rjust(100), + b'%100u', c_uint(123)) + check_format('123'.rjust(100, '0'), + b'%.100u', c_uint(123)) + check_format('123'.rjust(80, '0').rjust(100), + b'%100.80u', c_uint(123)) + + check_format('123'.rjust(10, '0'), + b'%010x', c_int(0x123)) + check_format('123'.rjust(100), + b'%100x', c_int(0x123)) + check_format('123'.rjust(100, '0'), + b'%.100x', c_int(0x123)) + check_format('123'.rjust(80, '0').rjust(100), + b'%100.80x', c_int(0x123)) # test %A - text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') - self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'") + check_format(r"%A:'abc\xe9\uabcd\U0010ffff'", + b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') # test %V - text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz') - self.assertEqual(text, 'repr=abc') + check_format('repr=abc', + b'repr=%V', 'abc', b'xyz') # Test string decode from parameter of %s using utf-8. # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of # '\u4eba\u6c11' - text = PyUnicode_FromFormat(b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') - self.assertEqual(text, 'repr=\u4eba\u6c11') + check_format('repr=\u4eba\u6c11', + b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') #Test replace error handler. - text = PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff') - self.assertEqual(text, 'repr=abc\ufffd') + check_format('repr=abc\ufffd', + b'repr=%V', None, b'abc\xff') # not supported: copy the raw format string. these tests are just here # to check for crashs and should not be considered as specifications - self.assertEqual(PyUnicode_FromFormat(b'%1%s', b'abc'), '%s') - self.assertEqual(PyUnicode_FromFormat(b'%1abc'), '%1abc') - self.assertEqual(PyUnicode_FromFormat(b'%+i', c_int(10)), '%+i') - self.assertEqual(PyUnicode_FromFormat(b'%.%s', b'abc'), '%.%s') + check_format('%s', + b'%1%s', b'abc') + check_format('%1abc', + b'%1abc') + check_format('%+i', + b'%+i', c_int(10)) + check_format('%.%s', + b'%.%s', b'abc') # Test PyUnicode_AsWideChar() def test_aswidechar(self): @@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1? Core and Builtins ----------------- +- Issue #7330: Implement width and precision (ex: "%5.3s") for the format + string of PyUnicode_FromFormat() function, original patch written by Ysj Ray. + - Issue #1545463: Global variables caught in reference cycles are now garbage-collected at shutdown. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4c532af..795c18f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2346,6 +2346,67 @@ makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, plus 1 for the sign. 53/22 is an upper bound for log10(256). */ #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) +static int +unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, + Py_ssize_t width, Py_ssize_t precision) +{ + Py_ssize_t length, fill, arglen; + Py_UCS4 maxchar; + + if (PyUnicode_READY(str) == -1) + return -1; + + length = PyUnicode_GET_LENGTH(str); + if ((precision == -1 || precision >= length) + && width <= length) + return _PyUnicodeWriter_WriteStr(writer, str); + + if (precision != -1) + length = Py_MIN(precision, length); + + arglen = Py_MAX(length, width); + if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) + maxchar = _PyUnicode_FindMaxChar(str, 0, length); + else + maxchar = writer->maxchar; + + if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) + return -1; + + if (width > length) { + fill = width - length; + if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) + return -1; + writer->pos += fill; + } + + _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, + str, 0, length); + writer->pos += length; + return 0; +} + +static int +unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, + Py_ssize_t width, Py_ssize_t precision) +{ + /* UTF-8 */ + Py_ssize_t length; + PyObject *unicode; + int res; + + length = strlen(str); + if (precision != -1) + length = Py_MIN(length, precision); + unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); + if (unicode == NULL) + return -1; + + res = unicode_fromformat_write_str(writer, unicode, width, -1); + Py_DECREF(unicode); + return res; +} + static const char* unicode_fromformat_arg(_PyUnicodeWriter *writer, const char *f, va_list *vargs) @@ -2353,12 +2414,12 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, const char *p; Py_ssize_t len; int zeropad; - int width; - int precision; + Py_ssize_t width; + Py_ssize_t precision; int longflag; int longlongflag; int size_tflag; - int fill; + Py_ssize_t fill; p = f; f++; @@ -2369,28 +2430,36 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, } /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ - width = 0; - while (Py_ISDIGIT((unsigned)*f)) { - if (width > (INT_MAX - ((int)*f - '0')) / 10) { - PyErr_SetString(PyExc_ValueError, - "width too big"); - return NULL; - } - width = (width*10) + (*f - '0'); - f++; - } - precision = 0; - if (*f == '.') { + width = -1; + if (Py_ISDIGIT((unsigned)*f)) { + width = *f - '0'; f++; while (Py_ISDIGIT((unsigned)*f)) { - if (precision > (INT_MAX - ((int)*f - '0')) / 10) { + if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { PyErr_SetString(PyExc_ValueError, - "precision too big"); + "width too big"); return NULL; } - precision = (precision*10) + (*f - '0'); + width = (width * 10) + (*f - '0'); f++; } + } + precision = -1; + if (*f == '.') { + f++; + if (Py_ISDIGIT((unsigned)*f)) { + precision = (*f - '0'); + f++; + while (Py_ISDIGIT((unsigned)*f)) { + if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "precision too big"); + return NULL; + } + precision = (precision * 10) + (*f - '0'); + f++; + } + } if (*f == '%') { /* "%.3%s" => f points to "3" */ f--; @@ -2449,6 +2518,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, /* used by sprintf */ char fmt[10]; /* should be enough for "%0lld\0" */ char buffer[MAX_LONG_LONG_CHARS]; + Py_ssize_t arglen; if (*f == 'u') { makefmt(fmt, longflag, longlongflag, size_tflag, *f); @@ -2494,26 +2564,29 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, if (precision < len) precision = len; + + arglen = Py_MAX(precision, width); + assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127); + if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) + return NULL; + if (width > precision) { Py_UCS4 fillchar; fill = width - precision; fillchar = zeropad?'0':' '; - if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1) - return NULL; if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) return NULL; writer->pos += fill; } if (precision > len) { fill = precision - len; - if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1) - return NULL; if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) return NULL; writer->pos += fill; } - if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1) - return NULL; + + unicode_write_cstr(writer->buffer, writer->pos, buffer, len); + writer->pos += len; break; } @@ -2535,8 +2608,11 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, len += 2; } - if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1) + assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127); + if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) return NULL; + unicode_write_cstr(writer->buffer, writer->pos, number, len); + writer->pos += len; break; } @@ -2544,14 +2620,8 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, { /* UTF-8 */ const char *s = va_arg(*vargs, const char*); - PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); - if (!str) + if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { - Py_DECREF(str); - return NULL; - } - Py_DECREF(str); break; } @@ -2560,7 +2630,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, PyObject *obj = va_arg(*vargs, PyObject *); assert(obj && _PyUnicode_CHECK(obj)); - if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) return NULL; break; } @@ -2569,22 +2639,15 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, { PyObject *obj = va_arg(*vargs, PyObject *); const char *str = va_arg(*vargs, const char *); - PyObject *str_obj; - assert(obj || str); if (obj) { assert(_PyUnicode_CHECK(obj)); - if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) return NULL; } else { - str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); - if (!str_obj) - return NULL; - if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) { - Py_DECREF(str_obj); + assert(str != NULL); + if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) return NULL; - } - Py_DECREF(str_obj); } break; } @@ -2597,7 +2660,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, str = PyObject_Str(obj); if (!str) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { + if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { Py_DECREF(str); return NULL; } @@ -2613,7 +2676,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, repr = PyObject_Repr(obj); if (!repr) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) { + if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { Py_DECREF(repr); return NULL; } @@ -2629,7 +2692,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, ascii = PyObject_ASCII(obj); if (!ascii) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) { + if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { Py_DECREF(ascii); return NULL; } |