From f3466bc04008660c4a5c3ed6f70144f138ae2e7f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 22 May 2023 00:32:39 +0300 Subject: gh-98836: Extend PyUnicode_FromFormat() (GH-98838) * Support for conversion specifiers o (octal) and X (uppercase hexadecimal). * Support for length modifiers j (intmax_t) and t (ptrdiff_t). * Length modifiers are now applied to all integer conversions. * Support for wchar_t C strings (%ls and %lV). * Support for variable width and precision (*). * Support for flag - (left alignment). --- Doc/c-api/unicode.rst | 228 ++++++++------ Doc/whatsnew/3.12.rst | 6 + Lib/test/test_capi/test_unicode.py | 217 ++++++++++---- .../2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst | 4 + Modules/_ssl.c | 5 +- Modules/_testcapi/unicode.c | 63 +++- Modules/selectmodule.c | 5 +- Modules/socketmodule.c | 8 +- Objects/unicodeobject.c | 326 ++++++++++++++------- Parser/tokenizer.c | 11 +- 10 files changed, 585 insertions(+), 288 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index ab3a2e2..6771f37 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -394,98 +394,149 @@ APIs: arguments, calculate the size of the resulting Python Unicode string and return a string with the values formatted into it. The variable arguments must be C types and must correspond exactly to the format characters in the *format* - ASCII-encoded string. The following format characters are allowed: - - .. % This should be exactly the same as the table in PyErr_Format. - - .. tabularcolumns:: |l|l|L| - - +-------------------+---------------------+----------------------------------+ - | Format Characters | Type | Comment | - +===================+=====================+==================================+ - | :attr:`%%` | *n/a* | The literal % character. | - +-------------------+---------------------+----------------------------------+ - | :attr:`%c` | int | A single character, | - | | | represented as a C int. | - +-------------------+---------------------+----------------------------------+ - | :attr:`%d` | int | Equivalent to | - | | | ``printf("%d")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%u` | unsigned int | Equivalent to | - | | | ``printf("%u")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%ld` | long | Equivalent to | - | | | ``printf("%ld")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%li` | long | Equivalent to | - | | | ``printf("%li")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%lu` | unsigned long | Equivalent to | - | | | ``printf("%lu")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%lld` | long long | Equivalent to | - | | | ``printf("%lld")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%lli` | long long | Equivalent to | - | | | ``printf("%lli")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%llu` | unsigned long long | Equivalent to | - | | | ``printf("%llu")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%zd` | :c:type:`\ | Equivalent to | - | | Py_ssize_t` | ``printf("%zd")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%zi` | :c:type:`\ | Equivalent to | - | | Py_ssize_t` | ``printf("%zi")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%zu` | size_t | Equivalent to | - | | | ``printf("%zu")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%i` | int | Equivalent to | - | | | ``printf("%i")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%x` | int | Equivalent to | - | | | ``printf("%x")``. [1]_ | - +-------------------+---------------------+----------------------------------+ - | :attr:`%s` | const char\* | A null-terminated C character | - | | | array. | - +-------------------+---------------------+----------------------------------+ - | :attr:`%p` | const void\* | The hex representation of a C | - | | | pointer. Mostly equivalent to | - | | | ``printf("%p")`` except that | - | | | it is guaranteed to start with | - | | | the literal ``0x`` regardless | - | | | of what the platform's | - | | | ``printf`` yields. | - +-------------------+---------------------+----------------------------------+ - | :attr:`%A` | PyObject\* | The result of calling | - | | | :func:`ascii`. | - +-------------------+---------------------+----------------------------------+ - | :attr:`%U` | PyObject\* | A Unicode object. | - +-------------------+---------------------+----------------------------------+ - | :attr:`%V` | PyObject\*, | A Unicode object (which may be | - | | const char\* | ``NULL``) and a null-terminated | - | | | C character array as a second | - | | | parameter (which will be used, | - | | | if the first parameter is | - | | | ``NULL``). | - +-------------------+---------------------+----------------------------------+ - | :attr:`%S` | PyObject\* | The result of calling | - | | | :c:func:`PyObject_Str`. | - +-------------------+---------------------+----------------------------------+ - | :attr:`%R` | PyObject\* | The result of calling | - | | | :c:func:`PyObject_Repr`. | - +-------------------+---------------------+----------------------------------+ + ASCII-encoded string. + + A conversion specifier contains two or more characters and has the following + components, which must occur in this order: + + #. The ``'%'`` character, which marks the start of the specifier. + + #. Conversion flags (optional), which affect the result of some conversion + types. + + #. Minimum field width (optional). + If specified as an ``'*'`` (asterisk), the actual width is given in the + next argument, which must be of type :c:expr:`int`, and the object to + convert comes after the minimum field width and optional precision. + + #. Precision (optional), given as a ``'.'`` (dot) followed by the precision. + If specified as ``'*'`` (an asterisk), the actual precision is given in + the next argument, which must be of type :c:expr:`int`, and the value to + convert comes after the precision. + + #. Length modifier (optional). + + #. Conversion type. + + The conversion flag characters are: + + .. tabularcolumns:: |l|L| + + +-------+-------------------------------------------------------------+ + | Flag | Meaning | + +=======+=============================================================+ + | ``0`` | The conversion will be zero padded for numeric values. | + +-------+-------------------------------------------------------------+ + | ``-`` | The converted value is left adjusted (overrides the ``0`` | + | | flag if both are given). | + +-------+-------------------------------------------------------------+ + + The length modifiers for following integer conversions (``d``, ``i``, + ``o``, ``u``, ``x``, or ``X``) specify the type of the argument + (:c:expr:`int` by default): + + .. tabularcolumns:: |l|L| + + +----------+-----------------------------------------------------+ + | Modifier | Types | + +==========+=====================================================+ + | ``l`` | :c:expr:`long` or :c:expr:`unsigned long` | + +----------+-----------------------------------------------------+ + | ``ll`` | :c:expr:`long long` or :c:expr:`unsigned long long` | + +----------+-----------------------------------------------------+ + | ``j`` | :c:expr:`intmax_t` or :c:expr:`uintmax_t` | + +----------+-----------------------------------------------------+ + | ``z`` | :c:expr:`size_t` or :c:expr:`ssize_t` | + +----------+-----------------------------------------------------+ + | ``t`` | :c:expr:`ptrdiff_t` | + +----------+-----------------------------------------------------+ + + The length modifier ``l`` for following conversions ``s`` or ``V`` specify + that the type of the argument is :c:expr:`const wchar_t*`. + + The conversion specifiers are: + + .. list-table:: + :widths: auto + :header-rows: 1 + + * - Conversion Specifier + - Type + - Comment + + * - ``%`` + - *n/a* + - The literal ``%`` character. + + * - ``d``, ``i`` + - Specified by the length modifier + - The decimal representation of a signed C integer. + + * - ``u`` + - Specified by the length modifier + - The decimal representation of an unsigned C integer. + + * - ``o`` + - Specified by the length modifier + - The octal representation of an unsigned C integer. + + * - ``x`` + - Specified by the length modifier + - The hexadecimal representation of an unsigned C integer (lowercase). + + * - ``X`` + - Specified by the length modifier + - The hexadecimal representation of an unsigned C integer (uppercase). + + * - ``c`` + - :c:expr:`int` + - A single character. + + * - ``s`` + - :c:expr:`const char*` or :c:expr:`const wchar_t*` + - A null-terminated C character array. + + * - ``p`` + - :c:expr:`const void*` + - The hex representation of a C pointer. + Mostly equivalent to ``printf("%p")`` except that it is guaranteed to + start with the literal ``0x`` regardless of what the platform's + ``printf`` yields. + + * - ``A`` + - :c:expr:`PyObject*` + - The result of calling :func:`ascii`. + + * - ``U`` + - :c:expr:`PyObject*` + - A Unicode object. + + * - ``V`` + - :c:expr:`PyObject*`, :c:expr:`const char*` or :c:expr:`const wchar_t*` + - A Unicode object (which may be ``NULL``) and a null-terminated + C character array as a second parameter (which will be used, + if the first parameter is ``NULL``). + + * - ``S`` + - :c:expr:`PyObject*` + - The result of calling :c:func:`PyObject_Str`. + + * - ``R`` + - :c:expr:`PyObject*` + - The result of calling :c:func:`PyObject_Repr`. .. note:: The width formatter unit is number of characters rather than bytes. - The precision formatter unit is number of bytes for ``"%s"`` and + The precision formatter unit is number of bytes or :c:expr:`wchar_t` + items (if the length modifier ``l`` is used) for ``"%s"`` and ``"%V"`` (if the ``PyObject*`` argument is ``NULL``), and a number of characters for ``"%A"``, ``"%U"``, ``"%S"``, ``"%R"`` and ``"%V"`` (if the ``PyObject*`` argument is not ``NULL``). - .. [1] For integer specifiers (d, u, ld, li, lu, lld, lli, llu, zd, zi, - zu, i, x): the 0-conversion flag has effect even when a precision is given. + .. note:: + Unlike to C :c:func:`printf` the ``0`` flag has effect even when + a precision is given for integer conversions (``d``, ``i``, ``u``, ``o``, + ``x``, or ``X``). .. versionchanged:: 3.2 Support for ``"%lld"`` and ``"%llu"`` added. @@ -498,6 +549,13 @@ APIs: ``"%V"``, ``"%S"``, ``"%R"`` added. .. versionchanged:: 3.12 + Support for conversion specifiers ``o`` and ``X``. + Support for length modifiers ``j`` and ``t``. + Length modifiers are now applied to all integer conversions. + Length modifier ``l`` is now applied to conversion specifiers ``s`` and ``V``. + Support for variable width and precision ``*``. + Support for flag ``-``. + An unrecognized format character now sets a :exc:`SystemError`. In previous versions it caused all the rest of the format string to be copied as-is to the result string, and any extra arguments discarded. diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index 14f03ef..caf2107 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -1402,6 +1402,12 @@ Porting to Python 3.12 :py:meth:`~class.__subclasses__` (using :c:func:`PyObject_CallMethod`, for example). +* Add support of more formatting options (left aligning, octals, uppercase + hexadecimals, ``intmax_t``, ``ptrdiff_t``, ``wchar_t`` C + strings, variable width and precision) in :c:func:`PyUnicode_FromFormat` and + :c:func:`PyUnicode_FromFormatV`. + (Contributed by Serhiy Storchaka in :gh:`98836`.) + * An unrecognized format character in :c:func:`PyUnicode_FromFormat` and :c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`. In previous versions it caused all the rest of the format string to be diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 00807d9..9c76620 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -319,12 +319,17 @@ class CAPITest(unittest.TestCase): def test_from_format(self): """Test PyUnicode_FromFormat()""" + # Length modifiers "j" and "t" are not tested here because ctypes does + # not expose types for intmax_t and ptrdiff_t. + # _testcapi.test_string_from_format() has a wider coverage of all + # formats. import_helper.import_module('ctypes') from ctypes import ( c_char_p, pythonapi, py_object, sizeof, c_int, c_long, c_longlong, c_ssize_t, - c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p) + c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p, + sizeof, c_wchar, c_wchar_p) name = "PyUnicode_FromFormat" _PyUnicode_FromFormat = getattr(pythonapi, name) _PyUnicode_FromFormat.argtypes = (c_char_p,) @@ -449,37 +454,28 @@ class CAPITest(unittest.TestCase): check_format("repr= 12", b'repr=%5.2V', None, b'123') - # test integer formats (%i, %d, %u) + # test integer formats (%i, %d, %u, %o, %x, %X) check_format('010', b'%03i', c_int(10)) check_format('0010', b'%0.4i', c_int(10)) - check_format('-123', - b'%i', c_int(-123)) - check_format('-123', - b'%li', c_long(-123)) - check_format('-123', - b'%lli', c_longlong(-123)) - check_format('-123', - b'%zi', c_ssize_t(-123)) - - check_format('-123', - b'%d', c_int(-123)) - check_format('-123', - b'%ld', c_long(-123)) - check_format('-123', - b'%lld', c_longlong(-123)) - check_format('-123', - b'%zd', c_ssize_t(-123)) - - check_format('123', - b'%u', c_uint(123)) - check_format('123', - b'%lu', c_ulong(123)) - check_format('123', - b'%llu', c_ulonglong(123)) - check_format('123', - b'%zu', c_size_t(123)) + for conv, signed, value, expected in [ + (b'i', True, -123, '-123'), + (b'd', True, -123, '-123'), + (b'u', False, 123, '123'), + (b'o', False, 0o123, '123'), + (b'x', False, 0xabc, 'abc'), + (b'X', False, 0xabc, 'ABC'), + ]: + for mod, ctype in [ + (b'', c_int if signed else c_uint), + (b'l', c_long if signed else c_ulong), + (b'll', c_longlong if signed else c_ulonglong), + (b'z', c_ssize_t if signed else c_size_t), + ]: + with self.subTest(format=b'%' + mod + conv): + check_format(expected, + b'%' + mod + conv, ctype(value)) # test long output min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1)) @@ -494,40 +490,144 @@ class CAPITest(unittest.TestCase): PyUnicode_FromFormat(b'%p', c_void_p(-1)) # test padding (width and/or precision) - check_format('123'.rjust(10, '0'), - b'%010i', c_int(123)) - check_format('123'.rjust(100), - b'%100i', c_int(123)) - check_format('123'.rjust(100, '0'), - b'%.100i', c_int(123)) - check_format('123'.rjust(80, '0').rjust(100), - b'%100.80i', c_int(123)) - - check_format('123'.rjust(10, '0'), - b'%010u', c_uint(123)) - check_format('123'.rjust(100), - b'%100u', c_uint(123)) - check_format('123'.rjust(100, '0'), - b'%.100u', c_uint(123)) - check_format('123'.rjust(80, '0').rjust(100), - b'%100.80u', c_uint(123)) - - check_format('123'.rjust(10, '0'), - b'%010x', c_int(0x123)) - check_format('123'.rjust(100), - b'%100x', c_int(0x123)) - check_format('123'.rjust(100, '0'), - b'%.100x', c_int(0x123)) - check_format('123'.rjust(80, '0').rjust(100), - b'%100.80x', c_int(0x123)) + check_format('123', b'%2i', c_int(123)) + check_format(' 123', b'%10i', c_int(123)) + check_format('0000000123', b'%010i', c_int(123)) + check_format('123 ', b'%-10i', c_int(123)) + check_format('123 ', b'%-010i', c_int(123)) + check_format('123', b'%.2i', c_int(123)) + check_format('0000123', b'%.7i', c_int(123)) + check_format(' 123', b'%10.2i', c_int(123)) + check_format(' 0000123', b'%10.7i', c_int(123)) + check_format('0000000123', b'%010.7i', c_int(123)) + check_format('0000123 ', b'%-10.7i', c_int(123)) + check_format('0000123 ', b'%-010.7i', c_int(123)) + + check_format('-123', b'%2i', c_int(-123)) + check_format(' -123', b'%10i', c_int(-123)) + check_format('-000000123', b'%010i', c_int(-123)) + check_format('-123 ', b'%-10i', c_int(-123)) + check_format('-123 ', b'%-010i', c_int(-123)) + check_format('-123', b'%.2i', c_int(-123)) + check_format('-0000123', b'%.7i', c_int(-123)) + check_format(' -123', b'%10.2i', c_int(-123)) + check_format(' -0000123', b'%10.7i', c_int(-123)) + check_format('-000000123', b'%010.7i', c_int(-123)) + check_format('-0000123 ', b'%-10.7i', c_int(-123)) + check_format('-0000123 ', b'%-010.7i', c_int(-123)) + + check_format('123', b'%2u', c_uint(123)) + check_format(' 123', b'%10u', c_uint(123)) + check_format('0000000123', b'%010u', c_uint(123)) + check_format('123 ', b'%-10u', c_uint(123)) + check_format('123 ', b'%-010u', c_uint(123)) + check_format('123', b'%.2u', c_uint(123)) + check_format('0000123', b'%.7u', c_uint(123)) + check_format(' 123', b'%10.2u', c_uint(123)) + check_format(' 0000123', b'%10.7u', c_uint(123)) + check_format('0000000123', b'%010.7u', c_uint(123)) + check_format('0000123 ', b'%-10.7u', c_uint(123)) + check_format('0000123 ', b'%-010.7u', c_uint(123)) + + check_format('123', b'%2o', c_uint(0o123)) + check_format(' 123', b'%10o', c_uint(0o123)) + check_format('0000000123', b'%010o', c_uint(0o123)) + check_format('123 ', b'%-10o', c_uint(0o123)) + check_format('123 ', b'%-010o', c_uint(0o123)) + check_format('123', b'%.2o', c_uint(0o123)) + check_format('0000123', b'%.7o', c_uint(0o123)) + check_format(' 123', b'%10.2o', c_uint(0o123)) + check_format(' 0000123', b'%10.7o', c_uint(0o123)) + check_format('0000000123', b'%010.7o', c_uint(0o123)) + check_format('0000123 ', b'%-10.7o', c_uint(0o123)) + check_format('0000123 ', b'%-010.7o', c_uint(0o123)) + + check_format('abc', b'%2x', c_uint(0xabc)) + check_format(' abc', b'%10x', c_uint(0xabc)) + check_format('0000000abc', b'%010x', c_uint(0xabc)) + check_format('abc ', b'%-10x', c_uint(0xabc)) + check_format('abc ', b'%-010x', c_uint(0xabc)) + check_format('abc', b'%.2x', c_uint(0xabc)) + check_format('0000abc', b'%.7x', c_uint(0xabc)) + check_format(' abc', b'%10.2x', c_uint(0xabc)) + check_format(' 0000abc', b'%10.7x', c_uint(0xabc)) + check_format('0000000abc', b'%010.7x', c_uint(0xabc)) + check_format('0000abc ', b'%-10.7x', c_uint(0xabc)) + check_format('0000abc ', b'%-010.7x', c_uint(0xabc)) + + check_format('ABC', b'%2X', c_uint(0xabc)) + check_format(' ABC', b'%10X', c_uint(0xabc)) + check_format('0000000ABC', b'%010X', c_uint(0xabc)) + check_format('ABC ', b'%-10X', c_uint(0xabc)) + check_format('ABC ', b'%-010X', c_uint(0xabc)) + check_format('ABC', b'%.2X', c_uint(0xabc)) + check_format('0000ABC', b'%.7X', c_uint(0xabc)) + check_format(' ABC', b'%10.2X', c_uint(0xabc)) + check_format(' 0000ABC', b'%10.7X', c_uint(0xabc)) + check_format('0000000ABC', b'%010.7X', c_uint(0xabc)) + check_format('0000ABC ', b'%-10.7X', c_uint(0xabc)) + check_format('0000ABC ', b'%-010.7X', c_uint(0xabc)) # test %A check_format(r"%A:'abc\xe9\uabcd\U0010ffff'", b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') # test %V - check_format('repr=abc', - b'repr=%V', 'abc', b'xyz') + check_format('abc', + b'%V', 'abc', b'xyz') + check_format('xyz', + b'%V', None, b'xyz') + + # test %ls + check_format('abc', b'%ls', c_wchar_p('abc')) + check_format('\u4eba\u6c11', b'%ls', c_wchar_p('\u4eba\u6c11')) + check_format('\U0001f4bb+\U0001f40d', + b'%ls', c_wchar_p('\U0001f4bb+\U0001f40d')) + check_format(' ab', b'%5.2ls', c_wchar_p('abc')) + check_format(' \u4eba\u6c11', b'%5ls', c_wchar_p('\u4eba\u6c11')) + check_format(' \U0001f4bb+\U0001f40d', + b'%5ls', c_wchar_p('\U0001f4bb+\U0001f40d')) + check_format('\u4eba', b'%.1ls', c_wchar_p('\u4eba\u6c11')) + check_format('\U0001f4bb' if sizeof(c_wchar) > 2 else '\ud83d', + b'%.1ls', c_wchar_p('\U0001f4bb+\U0001f40d')) + check_format('\U0001f4bb+' if sizeof(c_wchar) > 2 else '\U0001f4bb', + b'%.2ls', c_wchar_p('\U0001f4bb+\U0001f40d')) + + # test %lV + check_format('abc', + b'%lV', 'abc', c_wchar_p('xyz')) + check_format('xyz', + b'%lV', None, c_wchar_p('xyz')) + check_format('\u4eba\u6c11', + b'%lV', None, c_wchar_p('\u4eba\u6c11')) + check_format('\U0001f4bb+\U0001f40d', + b'%lV', None, c_wchar_p('\U0001f4bb+\U0001f40d')) + check_format(' ab', + b'%5.2lV', None, c_wchar_p('abc')) + check_format(' \u4eba\u6c11', + b'%5lV', None, c_wchar_p('\u4eba\u6c11')) + check_format(' \U0001f4bb+\U0001f40d', + b'%5lV', None, c_wchar_p('\U0001f4bb+\U0001f40d')) + check_format('\u4eba', + b'%.1lV', None, c_wchar_p('\u4eba\u6c11')) + check_format('\U0001f4bb' if sizeof(c_wchar) > 2 else '\ud83d', + b'%.1lV', None, c_wchar_p('\U0001f4bb+\U0001f40d')) + check_format('\U0001f4bb+' if sizeof(c_wchar) > 2 else '\U0001f4bb', + b'%.2lV', None, c_wchar_p('\U0001f4bb+\U0001f40d')) + + # test variable width and precision + check_format(' abc', b'%*s', c_int(5), b'abc') + check_format('ab', b'%.*s', c_int(2), b'abc') + check_format(' ab', b'%*.*s', c_int(5), c_int(2), b'abc') + check_format(' abc', b'%*U', c_int(5), 'abc') + check_format('ab', b'%.*U', c_int(2), 'abc') + check_format(' ab', b'%*.*U', c_int(5), c_int(2), 'abc') + check_format(' ab', b'%*.*V', c_int(5), c_int(2), None, b'abc') + check_format(' ab', b'%*.*lV', c_int(5), c_int(2), + None, c_wchar_p('abc')) + check_format(' 123', b'%*i', c_int(8), c_int(123)) + check_format('00123', b'%.*i', c_int(5), c_int(123)) + check_format(' 00123', b'%*.*i', c_int(8), c_int(5), c_int(123)) # test %p # We cannot test the exact result, @@ -564,10 +664,11 @@ class CAPITest(unittest.TestCase): check_format('', b'%s', b'') - # check for crashes + # test invalid format strings. these tests are just here + # to check for crashes and should not be considered as specifications for fmt in (b'%', b'%0', b'%01', b'%.', b'%.1', b'%0%s', b'%1%s', b'%.%s', b'%.1%s', b'%1abc', - b'%l', b'%ll', b'%z', b'%ls', b'%lls', b'%zs'): + b'%l', b'%ll', b'%z', b'%lls', b'%zs'): with self.subTest(fmt=fmt): self.assertRaisesRegex(SystemError, 'invalid format string', PyUnicode_FromFormat, fmt, b'abc') diff --git a/Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst b/Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst new file mode 100644 index 0000000..e3730eb --- /dev/null +++ b/Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst @@ -0,0 +1,4 @@ +Add support of more formatting options (left aligning, octals, uppercase +hexadecimals, :c:expr:`intmax_t`, :c:expr:`ptrdiff_t`, :c:expr:`wchar_t` C +strings, variable width and precision) in :c:func:`PyUnicode_FromFormat` and +:c:func:`PyUnicode_FromFormatV`. diff --git a/Modules/_ssl.c b/Modules/_ssl.c index 016a5a5..5bf6b3b 100644 --- a/Modules/_ssl.c +++ b/Modules/_ssl.c @@ -1330,10 +1330,8 @@ _get_peer_alt_names (_sslmodulestate *state, X509 *certificate) { p[0], p[1], p[2], p[3] ); } else if (name->d.ip->length == 16) { - /* PyUnicode_FromFormat() does not support %X */ unsigned char *p = name->d.ip->data; - len = sprintf( - buf, + v = PyUnicode_FromFormat( "%X:%X:%X:%X:%X:%X:%X:%X", p[0] << 8 | p[1], p[2] << 8 | p[3], @@ -1344,7 +1342,6 @@ _get_peer_alt_names (_sslmodulestate *state, X509 *certificate) { p[12] << 8 | p[13], p[14] << 8 | p[15] ); - v = PyUnicode_FromStringAndSize(buf, len); } else { v = PyUnicode_FromString(""); } diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 7dd3b9c..73929ea 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -1,3 +1,5 @@ +#include // ptrdiff_t + #define PY_SSIZE_T_CLEAN #include "parts.h" @@ -1130,25 +1132,48 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1( "%c", "c", 'c'); CHECK_FORMAT_1( "%0c", "c", 'c'); CHECK_FORMAT_1("%00c", "c", 'c'); - CHECK_FORMAT_1( "%2c", "c", 'c'); - CHECK_FORMAT_1("%02c", "c", 'c'); - CHECK_FORMAT_1("%.0c", "c", 'c'); - CHECK_FORMAT_1("%.2c", "c", 'c'); + CHECK_FORMAT_1( "%2c", NULL, 'c'); + CHECK_FORMAT_1("%02c", NULL, 'c'); + CHECK_FORMAT_1("%.0c", NULL, 'c'); + CHECK_FORMAT_1("%.2c", NULL, 'c'); // Integers CHECK_FORMAT_1("%d", "123", (int)123); CHECK_FORMAT_1("%i", "123", (int)123); CHECK_FORMAT_1("%u", "123", (unsigned int)123); + CHECK_FORMAT_1("%x", "7b", (unsigned int)123); + CHECK_FORMAT_1("%X", "7B", (unsigned int)123); + CHECK_FORMAT_1("%o", "173", (unsigned int)123); CHECK_FORMAT_1("%ld", "123", (long)123); CHECK_FORMAT_1("%li", "123", (long)123); CHECK_FORMAT_1("%lu", "123", (unsigned long)123); + CHECK_FORMAT_1("%lx", "7b", (unsigned long)123); + CHECK_FORMAT_1("%lX", "7B", (unsigned long)123); + CHECK_FORMAT_1("%lo", "173", (unsigned long)123); CHECK_FORMAT_1("%lld", "123", (long long)123); CHECK_FORMAT_1("%lli", "123", (long long)123); CHECK_FORMAT_1("%llu", "123", (unsigned long long)123); + CHECK_FORMAT_1("%llx", "7b", (unsigned long long)123); + CHECK_FORMAT_1("%llX", "7B", (unsigned long long)123); + CHECK_FORMAT_1("%llo", "173", (unsigned long long)123); CHECK_FORMAT_1("%zd", "123", (Py_ssize_t)123); CHECK_FORMAT_1("%zi", "123", (Py_ssize_t)123); CHECK_FORMAT_1("%zu", "123", (size_t)123); - CHECK_FORMAT_1("%x", "7b", (int)123); + CHECK_FORMAT_1("%zx", "7b", (size_t)123); + CHECK_FORMAT_1("%zX", "7B", (size_t)123); + CHECK_FORMAT_1("%zo", "173", (size_t)123); + CHECK_FORMAT_1("%td", "123", (ptrdiff_t)123); + CHECK_FORMAT_1("%ti", "123", (ptrdiff_t)123); + CHECK_FORMAT_1("%tu", "123", (ptrdiff_t)123); + CHECK_FORMAT_1("%tx", "7b", (ptrdiff_t)123); + CHECK_FORMAT_1("%tX", "7B", (ptrdiff_t)123); + CHECK_FORMAT_1("%to", "173", (ptrdiff_t)123); + CHECK_FORMAT_1("%jd", "123", (intmax_t)123); + CHECK_FORMAT_1("%ji", "123", (intmax_t)123); + CHECK_FORMAT_1("%ju", "123", (uintmax_t)123); + CHECK_FORMAT_1("%jx", "7b", (uintmax_t)123); + CHECK_FORMAT_1("%jX", "7B", (uintmax_t)123); + CHECK_FORMAT_1("%jo", "173", (uintmax_t)123); CHECK_FORMAT_1("%d", "-123", (int)-123); CHECK_FORMAT_1("%i", "-123", (int)-123); @@ -1158,7 +1183,10 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%lli", "-123", (long long)-123); CHECK_FORMAT_1("%zd", "-123", (Py_ssize_t)-123); CHECK_FORMAT_1("%zi", "-123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%x", "ffffff85", (int)-123); + CHECK_FORMAT_1("%td", "-123", (ptrdiff_t)-123); + CHECK_FORMAT_1("%ti", "-123", (ptrdiff_t)-123); + CHECK_FORMAT_1("%jd", "-123", (intmax_t)-123); + CHECK_FORMAT_1("%ji", "-123", (intmax_t)-123); // Integers: width < length CHECK_FORMAT_1("%1d", "123", (int)123); @@ -1183,7 +1211,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%1lli", "-123", (long long)-123); CHECK_FORMAT_1("%1zd", "-123", (Py_ssize_t)-123); CHECK_FORMAT_1("%1zi", "-123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%1x", "ffffff85", (int)-123); // Integers: width > length CHECK_FORMAT_1("%5d", " 123", (int)123); @@ -1208,7 +1235,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%5lli", " -123", (long long)-123); CHECK_FORMAT_1("%5zd", " -123", (Py_ssize_t)-123); CHECK_FORMAT_1("%5zi", " -123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%9x", " ffffff85", (int)-123); // Integers: width > length, 0-flag CHECK_FORMAT_1("%05d", "00123", (int)123); @@ -1233,7 +1259,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%05lli", "-0123", (long long)-123); CHECK_FORMAT_1("%05zd", "-0123", (Py_ssize_t)-123); CHECK_FORMAT_1("%05zi", "-0123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%09x", "0ffffff85", (int)-123); // Integers: precision < length CHECK_FORMAT_1("%.1d", "123", (int)123); @@ -1258,7 +1283,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%.1lli", "-123", (long long)-123); CHECK_FORMAT_1("%.1zd", "-123", (Py_ssize_t)-123); CHECK_FORMAT_1("%.1zi", "-123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%.1x", "ffffff85", (int)-123); // Integers: precision > length CHECK_FORMAT_1("%.5d", "00123", (int)123); @@ -1283,7 +1307,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%.5lli", "-00123", (long long)-123); CHECK_FORMAT_1("%.5zd", "-00123", (Py_ssize_t)-123); CHECK_FORMAT_1("%.5zi", "-00123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%.9x", "0ffffff85", (int)-123); // Integers: width > precision > length CHECK_FORMAT_1("%7.5d", " 00123", (int)123); @@ -1308,7 +1331,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%7.5lli", " -00123", (long long)-123); CHECK_FORMAT_1("%7.5zd", " -00123", (Py_ssize_t)-123); CHECK_FORMAT_1("%7.5zi", " -00123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%10.9x", " 0ffffff85", (int)-123); // Integers: width > precision > length, 0-flag CHECK_FORMAT_1("%07.5d", "0000123", (int)123); @@ -1333,7 +1355,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%07.5lli", "-000123", (long long)-123); CHECK_FORMAT_1("%07.5zd", "-000123", (Py_ssize_t)-123); CHECK_FORMAT_1("%07.5zi", "-000123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%010.9x", "00ffffff85", (int)-123); // Integers: precision > width > length CHECK_FORMAT_1("%5.7d", "0000123", (int)123); @@ -1358,7 +1379,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%5.7lli", "-0000123", (long long)-123); CHECK_FORMAT_1("%5.7zd", "-0000123", (Py_ssize_t)-123); CHECK_FORMAT_1("%5.7zi", "-0000123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%9.10x", "00ffffff85", (int)-123); // Integers: precision > width > length, 0-flag CHECK_FORMAT_1("%05.7d", "0000123", (int)123); @@ -1383,7 +1403,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) CHECK_FORMAT_1("%05.7lli", "-0000123", (long long)-123); CHECK_FORMAT_1("%05.7zd", "-0000123", (Py_ssize_t)-123); CHECK_FORMAT_1("%05.7zi", "-0000123", (Py_ssize_t)-123); - CHECK_FORMAT_1("%09.10x", "00ffffff85", (int)-123); // Integers: precision = 0, arg = 0 (empty string in C) CHECK_FORMAT_1("%.0d", "0", (int)0); @@ -1402,66 +1421,80 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) // Strings CHECK_FORMAT_1("%s", "None", "None"); + CHECK_FORMAT_1("%ls", "None", L"None"); CHECK_FORMAT_1("%U", "None", unicode); CHECK_FORMAT_1("%A", "None", Py_None); CHECK_FORMAT_1("%S", "None", Py_None); CHECK_FORMAT_1("%R", "None", Py_None); CHECK_FORMAT_2("%V", "None", unicode, "ignored"); CHECK_FORMAT_2("%V", "None", NULL, "None"); + CHECK_FORMAT_2("%lV", "None", NULL, L"None"); // Strings: width < length CHECK_FORMAT_1("%1s", "None", "None"); + CHECK_FORMAT_1("%1ls", "None", L"None"); CHECK_FORMAT_1("%1U", "None", unicode); CHECK_FORMAT_1("%1A", "None", Py_None); CHECK_FORMAT_1("%1S", "None", Py_None); CHECK_FORMAT_1("%1R", "None", Py_None); CHECK_FORMAT_2("%1V", "None", unicode, "ignored"); CHECK_FORMAT_2("%1V", "None", NULL, "None"); + CHECK_FORMAT_2("%1lV", "None", NULL, L"None"); // Strings: width > length CHECK_FORMAT_1("%5s", " None", "None"); + CHECK_FORMAT_1("%5ls", " None", L"None"); CHECK_FORMAT_1("%5U", " None", unicode); CHECK_FORMAT_1("%5A", " None", Py_None); CHECK_FORMAT_1("%5S", " None", Py_None); CHECK_FORMAT_1("%5R", " None", Py_None); CHECK_FORMAT_2("%5V", " None", unicode, "ignored"); CHECK_FORMAT_2("%5V", " None", NULL, "None"); + CHECK_FORMAT_2("%5lV", " None", NULL, L"None"); // Strings: precision < length CHECK_FORMAT_1("%.1s", "N", "None"); + CHECK_FORMAT_1("%.1ls", "N", L"None"); CHECK_FORMAT_1("%.1U", "N", unicode); CHECK_FORMAT_1("%.1A", "N", Py_None); CHECK_FORMAT_1("%.1S", "N", Py_None); CHECK_FORMAT_1("%.1R", "N", Py_None); CHECK_FORMAT_2("%.1V", "N", unicode, "ignored"); CHECK_FORMAT_2("%.1V", "N", NULL, "None"); + CHECK_FORMAT_2("%.1lV", "N", NULL, L"None"); // Strings: precision > length CHECK_FORMAT_1("%.5s", "None", "None"); + CHECK_FORMAT_1("%.5ls", "None", L"None"); CHECK_FORMAT_1("%.5U", "None", unicode); CHECK_FORMAT_1("%.5A", "None", Py_None); CHECK_FORMAT_1("%.5S", "None", Py_None); CHECK_FORMAT_1("%.5R", "None", Py_None); CHECK_FORMAT_2("%.5V", "None", unicode, "ignored"); CHECK_FORMAT_2("%.5V", "None", NULL, "None"); + CHECK_FORMAT_2("%.5lV", "None", NULL, L"None"); // Strings: precision < length, width > length CHECK_FORMAT_1("%5.1s", " N", "None"); + CHECK_FORMAT_1("%5.1ls"," N", L"None"); CHECK_FORMAT_1("%5.1U", " N", unicode); CHECK_FORMAT_1("%5.1A", " N", Py_None); CHECK_FORMAT_1("%5.1S", " N", Py_None); CHECK_FORMAT_1("%5.1R", " N", Py_None); CHECK_FORMAT_2("%5.1V", " N", unicode, "ignored"); CHECK_FORMAT_2("%5.1V", " N", NULL, "None"); + CHECK_FORMAT_2("%5.1lV"," N", NULL, L"None"); // Strings: width < length, precision > length CHECK_FORMAT_1("%1.5s", "None", "None"); + CHECK_FORMAT_1("%1.5ls", "None", L"None"); CHECK_FORMAT_1("%1.5U", "None", unicode); CHECK_FORMAT_1("%1.5A", "None", Py_None); CHECK_FORMAT_1("%1.5S", "None", Py_None); CHECK_FORMAT_1("%1.5R", "None", Py_None); CHECK_FORMAT_2("%1.5V", "None", unicode, "ignored"); CHECK_FORMAT_2("%1.5V", "None", NULL, "None"); + CHECK_FORMAT_2("%1.5lV", "None", NULL, L"None"); Py_XDECREF(unicode); Py_RETURN_NONE; diff --git a/Modules/selectmodule.c b/Modules/selectmodule.c index 79bd5b5..9a4943c 100644 --- a/Modules/selectmodule.c +++ b/Modules/selectmodule.c @@ -1849,14 +1849,11 @@ static PyObject * kqueue_event_repr(kqueue_event_Object *s) { - char buf[1024]; - PyOS_snprintf( - buf, sizeof(buf), + return PyUnicode_FromFormat( "", (size_t)(s->e.ident), (int)s->e.filter, (unsigned int)s->e.flags, (unsigned int)s->e.fflags, (long long)(s->e.data), (void *)s->e.udata); - return PyUnicode_FromString(buf); } static int diff --git a/Modules/socketmodule.c b/Modules/socketmodule.c index c11fb44..a86aaed 100644 --- a/Modules/socketmodule.c +++ b/Modules/socketmodule.c @@ -1339,8 +1339,6 @@ setbdaddr(const char *name, bdaddr_t *bdaddr) static PyObject * makebdaddr(bdaddr_t *bdaddr) { - char buf[(6 * 2) + 5 + 1]; - #ifdef MS_WINDOWS int i; unsigned int octets[6]; @@ -1349,16 +1347,14 @@ makebdaddr(bdaddr_t *bdaddr) octets[i] = ((*bdaddr) >> (8 * i)) & 0xFF; } - sprintf(buf, "%02X:%02X:%02X:%02X:%02X:%02X", + return PyUnicode_FromFormat("%02X:%02X:%02X:%02X:%02X:%02X", octets[5], octets[4], octets[3], octets[2], octets[1], octets[0]); #else - sprintf(buf, "%02X:%02X:%02X:%02X:%02X:%02X", + return PyUnicode_FromFormat("%02X:%02X:%02X:%02X:%02X:%02X", bdaddr->b[5], bdaddr->b[4], bdaddr->b[3], bdaddr->b[2], bdaddr->b[1], bdaddr->b[0]); #endif - - return PyUnicode_FromString(buf); } #endif diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7726f2f..ec5684b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -56,6 +56,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include "pycore_unicodeobject.h" // struct _Py_unicode_state #include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings() #include "stringlib/eq.h" // unicode_eq() +#include // ptrdiff_t #ifdef MS_WINDOWS #include @@ -2285,14 +2286,15 @@ PyUnicode_AsUCS4Copy(PyObject *string) return as_ucs4(string, NULL, 0, 1); } -/* maximum number of characters required for output of %lld or %p. - We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, - plus 1 for the sign. 53/22 is an upper bound for log10(256). */ -#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) +/* maximum number of characters required for output of %jo or %jd or %p. + We need at most ceil(log8(256)*sizeof(intmax_t)) digits, + plus 1 for the sign, plus 2 for the 0x prefix (for %p), + plus 1 for the terminal NUL. */ +#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3) static int unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, - Py_ssize_t width, Py_ssize_t precision) + Py_ssize_t width, Py_ssize_t precision, int flags) { Py_ssize_t length, fill, arglen; Py_UCS4 maxchar; @@ -2314,8 +2316,8 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) return -1; - if (width > length) { - fill = width - length; + fill = Py_MAX(width - length, 0); + if (fill && !(flags & F_LJUST)) { if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) return -1; writer->pos += fill; @@ -2324,12 +2326,19 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, str, 0, length); writer->pos += length; + + if (fill && (flags & F_LJUST)) { + if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) + return -1; + writer->pos += fill; + } + return 0; } static int unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, - Py_ssize_t width, Py_ssize_t precision) + Py_ssize_t width, Py_ssize_t precision, int flags) { /* UTF-8 */ Py_ssize_t length; @@ -2349,24 +2358,58 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, if (unicode == NULL) return -1; - res = unicode_fromformat_write_str(writer, unicode, width, -1); + res = unicode_fromformat_write_str(writer, unicode, width, -1, flags); + Py_DECREF(unicode); + return res; +} + +static int +unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str, + Py_ssize_t width, Py_ssize_t precision, int flags) +{ + /* UTF-8 */ + Py_ssize_t length; + PyObject *unicode; + int res; + + if (precision == -1) { + length = wcslen(str); + } + else { + length = 0; + while (length < precision && str[length]) { + length++; + } + } + unicode = PyUnicode_FromWideChar(str, length); + if (unicode == NULL) + return -1; + + res = unicode_fromformat_write_str(writer, unicode, width, -1, flags); Py_DECREF(unicode); return res; } +#define F_LONG 1 +#define F_LONGLONG 2 +#define F_SIZE 3 +#define F_PTRDIFF 4 +#define F_INTMAX 5 +static const char * const formats[] = {"%d", "%ld", "%lld", "%zd", "%td", "%jd"}; +static const char * const formats_o[] = {"%o", "%lo", "%llo", "%zo", "%to", "%jo"}; +static const char * const formats_u[] = {"%u", "%lu", "%llu", "%zu", "%tu", "%ju"}; +static const char * const formats_x[] = {"%x", "%lx", "%llx", "%zx", "%tx", "%jx"}; +static const char * const formats_X[] = {"%X", "%lX", "%llX", "%zX", "%tX", "%jX"}; + static const char* unicode_fromformat_arg(_PyUnicodeWriter *writer, const char *f, va_list *vargs) { const char *p; Py_ssize_t len; - int zeropad; + int flags = 0; Py_ssize_t width; Py_ssize_t precision; - int longflag; - int longlongflag; - int size_tflag; - Py_ssize_t fill; p = f; f++; @@ -2377,15 +2420,31 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, return f; } - zeropad = 0; - if (*f == '0') { - zeropad = 1; - f++; + /* Parse flags. Example: "%-i" => flags=F_LJUST. */ + /* Flags '+', ' ' and '#' are not particularly useful. + * They are not worth the implementation and maintenance costs. + * In addition, '#' should add "0" for "o" conversions for compatibility + * with printf, but it would confuse Python users. */ + while (1) { + switch (*f++) { + case '-': flags |= F_LJUST; continue; + case '0': flags |= F_ZERO; continue; + } + f--; + break; } /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ width = -1; - if (Py_ISDIGIT((unsigned)*f)) { + if (*f == '*') { + width = va_arg(*vargs, int); + if (width < 0) { + flags |= F_LJUST; + width = -width; + } + f++; + } + else if (Py_ISDIGIT((unsigned)*f)) { width = *f - '0'; f++; while (Py_ISDIGIT((unsigned)*f)) { @@ -2401,7 +2460,14 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, precision = -1; if (*f == '.') { f++; - if (Py_ISDIGIT((unsigned)*f)) { + if (*f == '*') { + precision = va_arg(*vargs, int); + if (precision < 0) { + precision = -2; + } + f++; + } + else if (Py_ISDIGIT((unsigned)*f)) { precision = (*f - '0'); f++; while (Py_ISDIGIT((unsigned)*f)) { @@ -2416,31 +2482,48 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, } } - /* Handle %ld, %lu, %lld and %llu. */ - longflag = 0; - longlongflag = 0; - size_tflag = 0; + int sizemod = 0; if (*f == 'l') { - if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { - longflag = 1; - ++f; - } - else if (f[1] == 'l' && - (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { - longlongflag = 1; + if (f[1] == 'l') { + sizemod = F_LONGLONG; f += 2; } + else { + sizemod = F_LONG; + ++f; + } } - /* handle the size_t flag. */ - else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { - size_tflag = 1; + else if (*f == 'z') { + sizemod = F_SIZE; + ++f; + } + else if (*f == 't') { + sizemod = F_PTRDIFF; + ++f; + } + else if (*f == 'j') { + sizemod = F_INTMAX; ++f; } - if (f[0] != '\0' && f[1] == '\0') writer->overallocate = 0; switch (*f) { + case 'd': case 'i': case 'o': case 'u': case 'x': case 'X': + break; + case 'c': case 'p': + if (sizemod || width >= 0 || precision >= 0) goto invalid_format; + break; + case 's': + case 'V': + if (sizemod && sizemod != F_LONG) goto invalid_format; + break; + default: + if (sizemod) goto invalid_format; + break; + } + + switch (*f) { case 'c': { int ordinal = va_arg(*vargs, int); @@ -2454,91 +2537,98 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, break; } - case 'i': - case 'd': - case 'u': - case 'x': + case 'd': case 'i': + case 'o': case 'u': case 'x': case 'X': { /* used by sprintf */ - char buffer[MAX_LONG_LONG_CHARS]; - Py_ssize_t arglen; - - if (*f == 'u') { - if (longflag) { - len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long)); - } - else if (longlongflag) { - len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long)); - } - else if (size_tflag) { - len = sprintf(buffer, "%zu", va_arg(*vargs, size_t)); - } - else { - len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int)); - } - } - else if (*f == 'x') { - len = sprintf(buffer, "%x", va_arg(*vargs, int)); - } - else { - if (longflag) { - len = sprintf(buffer, "%li", va_arg(*vargs, long)); - } - else if (longlongflag) { - len = sprintf(buffer, "%lli", va_arg(*vargs, long long)); - } - else if (size_tflag) { - len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t)); - } - else { - len = sprintf(buffer, "%i", va_arg(*vargs, int)); - } + char buffer[MAX_INTMAX_CHARS]; + const char *fmt = NULL; + switch (*f) { + case 'o': fmt = formats_o[sizemod]; break; + case 'u': fmt = formats_u[sizemod]; break; + case 'x': fmt = formats_x[sizemod]; break; + case 'X': fmt = formats_X[sizemod]; break; + default: fmt = formats[sizemod]; break; + } + int issigned = (*f == 'd' || *f == 'i'); + switch (sizemod) { + case F_LONG: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, long)) : + sprintf(buffer, fmt, va_arg(*vargs, unsigned long)); + break; + case F_LONGLONG: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, long long)) : + sprintf(buffer, fmt, va_arg(*vargs, unsigned long long)); + break; + case F_SIZE: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, Py_ssize_t)) : + sprintf(buffer, fmt, va_arg(*vargs, size_t)); + break; + case F_PTRDIFF: + len = sprintf(buffer, fmt, va_arg(*vargs, ptrdiff_t)); + break; + case F_INTMAX: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, intmax_t)) : + sprintf(buffer, fmt, va_arg(*vargs, uintmax_t)); + break; + default: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, int)) : + sprintf(buffer, fmt, va_arg(*vargs, unsigned int)); + break; } assert(len >= 0); - int negative = (buffer[0] == '-'); - len -= negative; + int sign = (buffer[0] == '-'); + len -= sign; precision = Py_MAX(precision, len); - width = Py_MAX(width, precision + negative); + width = Py_MAX(width, precision + sign); + if ((flags & F_ZERO) && !(flags & F_LJUST)) { + precision = width - sign; + } - arglen = Py_MAX(precision, width); - if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) - return NULL; + Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0); + Py_ssize_t zeropad = Py_MAX(precision - len, 0); - if (width > precision) { - if (negative && zeropad) { - if (_PyUnicodeWriter_WriteChar(writer, '-') == -1) - return NULL; - } + if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1) + return NULL; - Py_UCS4 fillchar = zeropad?'0':' '; - fill = width - precision - negative; - if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) + if (spacepad && !(flags & F_LJUST)) { + if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1) return NULL; - writer->pos += fill; + writer->pos += spacepad; + } - if (negative && !zeropad) { - if (_PyUnicodeWriter_WriteChar(writer, '-') == -1) - return NULL; - } + if (sign) { + if (_PyUnicodeWriter_WriteChar(writer, '-') == -1) + return NULL; } - if (precision > len) { - fill = precision - len; - if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) + if (zeropad) { + if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1) return NULL; - writer->pos += fill; + writer->pos += zeropad; } - if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[negative], len) < 0) + if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0) return NULL; + + if (spacepad && (flags & F_LJUST)) { + if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1) + return NULL; + writer->pos += spacepad; + } break; } case 'p': { - char number[MAX_LONG_LONG_CHARS]; + char number[MAX_INTMAX_CHARS]; len = sprintf(number, "%p", va_arg(*vargs, void*)); assert(len >= 0); @@ -2561,10 +2651,17 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, case 's': { - /* UTF-8 */ - const char *s = va_arg(*vargs, const char*); - if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) - return NULL; + if (sizemod) { + const wchar_t *s = va_arg(*vargs, const wchar_t*); + if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0) + return NULL; + } + else { + /* UTF-8 */ + const char *s = va_arg(*vargs, const char*); + if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0) + return NULL; + } break; } @@ -2573,7 +2670,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, PyObject *obj = va_arg(*vargs, PyObject *); assert(obj && _PyUnicode_CHECK(obj)); - if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1) return NULL; break; } @@ -2581,15 +2678,27 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, case 'V': { PyObject *obj = va_arg(*vargs, PyObject *); - const char *str = va_arg(*vargs, const char *); + const char *str; + const wchar_t *wstr; + if (sizemod) { + wstr = va_arg(*vargs, const wchar_t*); + } + else { + str = va_arg(*vargs, const char *); + } if (obj) { assert(_PyUnicode_CHECK(obj)); - if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1) + return NULL; + } + else if (sizemod) { + assert(wstr != NULL); + if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0) return NULL; } else { assert(str != NULL); - if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) + if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0) return NULL; } break; @@ -2603,7 +2712,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, str = PyObject_Str(obj); if (!str) return NULL; - if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { + if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) { Py_DECREF(str); return NULL; } @@ -2619,7 +2728,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, repr = PyObject_Repr(obj); if (!repr) return NULL; - if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { + if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) { Py_DECREF(repr); return NULL; } @@ -2635,7 +2744,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, ascii = PyObject_ASCII(obj); if (!ascii) return NULL; - if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { + if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) { Py_DECREF(ascii); return NULL; } @@ -2644,6 +2753,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, } default: + invalid_format: PyErr_Format(PyExc_SystemError, "invalid format string: %s", p); return NULL; } diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index fb94fbe..fc4afcc 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1556,14 +1556,11 @@ verify_identifier(struct tok_state *tok) tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); } Py_DECREF(s); - // PyUnicode_FromFormatV() does not support %X - char hex[9]; - (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch); if (Py_UNICODE_ISPRINTABLE(ch)) { - syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch); } else { - syntaxerror(tok, "invalid non-printable character U+%s", hex); + syntaxerror(tok, "invalid non-printable character U+%04X", ch); } return 0; } @@ -2541,9 +2538,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } if (!Py_UNICODE_ISPRINTABLE(c)) { - char hex[9]; - (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c); - return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex)); + return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%04X", c)); } if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) { -- cgit v0.12