summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/c-api/unicode.rst228
-rw-r--r--Doc/whatsnew/3.12.rst6
-rw-r--r--Lib/test/test_capi/test_unicode.py217
-rw-r--r--Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst4
-rw-r--r--Modules/_ssl.c5
-rw-r--r--Modules/_testcapi/unicode.c63
-rw-r--r--Modules/selectmodule.c5
-rw-r--r--Modules/socketmodule.c8
-rw-r--r--Objects/unicodeobject.c326
-rw-r--r--Parser/tokenizer.c11
10 files changed, 585 insertions, 288 deletions
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index ab3a2e2..6771f37 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -394,98 +394,149 @@ APIs:
arguments, calculate the size of the resulting Python Unicode string and return
a string with the values formatted into it. The variable arguments must be C
types and must correspond exactly to the format characters in the *format*
- ASCII-encoded string. The following format characters are allowed:
-
- .. % This should be exactly the same as the table in PyErr_Format.
-
- .. tabularcolumns:: |l|l|L|
-
- +-------------------+---------------------+----------------------------------+
- | Format Characters | Type | Comment |
- +===================+=====================+==================================+
- | :attr:`%%` | *n/a* | The literal % character. |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%c` | int | A single character, |
- | | | represented as a C int. |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%d` | int | Equivalent to |
- | | | ``printf("%d")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%u` | unsigned int | Equivalent to |
- | | | ``printf("%u")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%ld` | long | Equivalent to |
- | | | ``printf("%ld")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%li` | long | Equivalent to |
- | | | ``printf("%li")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%lu` | unsigned long | Equivalent to |
- | | | ``printf("%lu")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%lld` | long long | Equivalent to |
- | | | ``printf("%lld")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%lli` | long long | Equivalent to |
- | | | ``printf("%lli")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%llu` | unsigned long long | Equivalent to |
- | | | ``printf("%llu")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%zd` | :c:type:`\ | Equivalent to |
- | | Py_ssize_t` | ``printf("%zd")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%zi` | :c:type:`\ | Equivalent to |
- | | Py_ssize_t` | ``printf("%zi")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%zu` | size_t | Equivalent to |
- | | | ``printf("%zu")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%i` | int | Equivalent to |
- | | | ``printf("%i")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%x` | int | Equivalent to |
- | | | ``printf("%x")``. [1]_ |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%s` | const char\* | A null-terminated C character |
- | | | array. |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%p` | const void\* | The hex representation of a C |
- | | | pointer. Mostly equivalent to |
- | | | ``printf("%p")`` except that |
- | | | it is guaranteed to start with |
- | | | the literal ``0x`` regardless |
- | | | of what the platform's |
- | | | ``printf`` yields. |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%A` | PyObject\* | The result of calling |
- | | | :func:`ascii`. |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%U` | PyObject\* | A Unicode object. |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%V` | PyObject\*, | A Unicode object (which may be |
- | | const char\* | ``NULL``) and a null-terminated |
- | | | C character array as a second |
- | | | parameter (which will be used, |
- | | | if the first parameter is |
- | | | ``NULL``). |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%S` | PyObject\* | The result of calling |
- | | | :c:func:`PyObject_Str`. |
- +-------------------+---------------------+----------------------------------+
- | :attr:`%R` | PyObject\* | The result of calling |
- | | | :c:func:`PyObject_Repr`. |
- +-------------------+---------------------+----------------------------------+
+ ASCII-encoded string.
+
+ A conversion specifier contains two or more characters and has the following
+ components, which must occur in this order:
+
+ #. The ``'%'`` character, which marks the start of the specifier.
+
+ #. Conversion flags (optional), which affect the result of some conversion
+ types.
+
+ #. Minimum field width (optional).
+ If specified as an ``'*'`` (asterisk), the actual width is given in the
+ next argument, which must be of type :c:expr:`int`, and the object to
+ convert comes after the minimum field width and optional precision.
+
+ #. Precision (optional), given as a ``'.'`` (dot) followed by the precision.
+ If specified as ``'*'`` (an asterisk), the actual precision is given in
+ the next argument, which must be of type :c:expr:`int`, and the value to
+ convert comes after the precision.
+
+ #. Length modifier (optional).
+
+ #. Conversion type.
+
+ The conversion flag characters are:
+
+ .. tabularcolumns:: |l|L|
+
+ +-------+-------------------------------------------------------------+
+ | Flag | Meaning |
+ +=======+=============================================================+
+ | ``0`` | The conversion will be zero padded for numeric values. |
+ +-------+-------------------------------------------------------------+
+ | ``-`` | The converted value is left adjusted (overrides the ``0`` |
+ | | flag if both are given). |
+ +-------+-------------------------------------------------------------+
+
+ The length modifiers for the following integer conversions (``d``, ``i``,
+ ``o``, ``u``, ``x``, or ``X``) specify the type of the argument
+ (:c:expr:`int` by default):
+
+ .. tabularcolumns:: |l|L|
+
+ +----------+-----------------------------------------------------+
+ | Modifier | Types |
+ +==========+=====================================================+
+ | ``l`` | :c:expr:`long` or :c:expr:`unsigned long` |
+ +----------+-----------------------------------------------------+
+ | ``ll`` | :c:expr:`long long` or :c:expr:`unsigned long long` |
+ +----------+-----------------------------------------------------+
+ | ``j`` | :c:expr:`intmax_t` or :c:expr:`uintmax_t` |
+ +----------+-----------------------------------------------------+
+ | ``z`` | :c:expr:`size_t` or :c:expr:`ssize_t` |
+ +----------+-----------------------------------------------------+
+ | ``t`` | :c:expr:`ptrdiff_t` |
+ +----------+-----------------------------------------------------+
+
+ The length modifier ``l`` for the following conversions ``s`` or ``V`` specifies
+ that the type of the argument is :c:expr:`const wchar_t*`.
+
+ The conversion specifiers are:
+
+ .. list-table::
+ :widths: auto
+ :header-rows: 1
+
+ * - Conversion Specifier
+ - Type
+ - Comment
+
+ * - ``%``
+ - *n/a*
+ - The literal ``%`` character.
+
+ * - ``d``, ``i``
+ - Specified by the length modifier
+ - The decimal representation of a signed C integer.
+
+ * - ``u``
+ - Specified by the length modifier
+ - The decimal representation of an unsigned C integer.
+
+ * - ``o``
+ - Specified by the length modifier
+ - The octal representation of an unsigned C integer.
+
+ * - ``x``
+ - Specified by the length modifier
+ - The hexadecimal representation of an unsigned C integer (lowercase).
+
+ * - ``X``
+ - Specified by the length modifier
+ - The hexadecimal representation of an unsigned C integer (uppercase).
+
+ * - ``c``
+ - :c:expr:`int`
+ - A single character.
+
+ * - ``s``
+ - :c:expr:`const char*` or :c:expr:`const wchar_t*`
+ - A null-terminated C character array.
+
+ * - ``p``
+ - :c:expr:`const void*`
+ - The hex representation of a C pointer.
+ Mostly equivalent to ``printf("%p")`` except that it is guaranteed to
+ start with the literal ``0x`` regardless of what the platform's
+ ``printf`` yields.
+
+ * - ``A``
+ - :c:expr:`PyObject*`
+ - The result of calling :func:`ascii`.
+
+ * - ``U``
+ - :c:expr:`PyObject*`
+ - A Unicode object.
+
+ * - ``V``
+ - :c:expr:`PyObject*`, :c:expr:`const char*` or :c:expr:`const wchar_t*`
+ - A Unicode object (which may be ``NULL``) and a null-terminated
+ C character array as a second parameter (which will be used,
+ if the first parameter is ``NULL``).
+
+ * - ``S``
+ - :c:expr:`PyObject*`
+ - The result of calling :c:func:`PyObject_Str`.
+
+ * - ``R``
+ - :c:expr:`PyObject*`
+ - The result of calling :c:func:`PyObject_Repr`.
.. note::
The width formatter unit is number of characters rather than bytes.
- The precision formatter unit is number of bytes for ``"%s"`` and
+ The precision formatter unit is number of bytes or :c:expr:`wchar_t`
+ items (if the length modifier ``l`` is used) for ``"%s"`` and
``"%V"`` (if the ``PyObject*`` argument is ``NULL``), and a number of
characters for ``"%A"``, ``"%U"``, ``"%S"``, ``"%R"`` and ``"%V"``
(if the ``PyObject*`` argument is not ``NULL``).
- .. [1] For integer specifiers (d, u, ld, li, lu, lld, lli, llu, zd, zi,
- zu, i, x): the 0-conversion flag has effect even when a precision is given.
+ .. note::
+ Unlike C :c:func:`printf`, the ``0`` flag has an effect even when
+ a precision is given for integer conversions (``d``, ``i``, ``u``, ``o``,
+ ``x``, or ``X``).
.. versionchanged:: 3.2
Support for ``"%lld"`` and ``"%llu"`` added.
@@ -498,6 +549,13 @@ APIs:
``"%V"``, ``"%S"``, ``"%R"`` added.
.. versionchanged:: 3.12
+ Support for conversion specifiers ``o`` and ``X`` added.
+ Support for length modifiers ``j`` and ``t`` added.
+ Length modifiers are now applied to all integer conversions.
+ Length modifier ``l`` is now applied to conversion specifiers ``s`` and ``V``.
+ Support for variable width and precision ``*`` added.
+ Support for flag ``-`` added.
+
An unrecognized format character now sets a :exc:`SystemError`.
In previous versions it caused all the rest of the format string to be
copied as-is to the result string, and any extra arguments discarded.
diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst
index 14f03ef..caf2107 100644
--- a/Doc/whatsnew/3.12.rst
+++ b/Doc/whatsnew/3.12.rst
@@ -1402,6 +1402,12 @@ Porting to Python 3.12
:py:meth:`~class.__subclasses__` (using :c:func:`PyObject_CallMethod`,
for example).
+* Add support for more formatting options (left aligning, octals, uppercase
+ hexadecimals, ``intmax_t``, ``ptrdiff_t``, ``wchar_t`` C
+ strings, variable width and precision) in :c:func:`PyUnicode_FromFormat` and
+ :c:func:`PyUnicode_FromFormatV`.
+ (Contributed by Serhiy Storchaka in :gh:`98836`.)
+
* An unrecognized format character in :c:func:`PyUnicode_FromFormat` and
:c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`.
In previous versions it caused all the rest of the format string to be
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
index 00807d9..9c76620 100644
--- a/Lib/test/test_capi/test_unicode.py
+++ b/Lib/test/test_capi/test_unicode.py
@@ -319,12 +319,17 @@ class CAPITest(unittest.TestCase):
def test_from_format(self):
"""Test PyUnicode_FromFormat()"""
+ # Length modifiers "j" and "t" are not tested here because ctypes does
+ # not expose types for intmax_t and ptrdiff_t.
+ # _testcapi.test_string_from_format() has a wider coverage of all
+ # formats.
import_helper.import_module('ctypes')
from ctypes import (
c_char_p,
pythonapi, py_object, sizeof,
c_int, c_long, c_longlong, c_ssize_t,
- c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
+ c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p,
+ sizeof, c_wchar, c_wchar_p)
name = "PyUnicode_FromFormat"
_PyUnicode_FromFormat = getattr(pythonapi, name)
_PyUnicode_FromFormat.argtypes = (c_char_p,)
@@ -449,37 +454,28 @@ class CAPITest(unittest.TestCase):
check_format("repr= 12",
b'repr=%5.2V', None, b'123')
- # test integer formats (%i, %d, %u)
+ # test integer formats (%i, %d, %u, %o, %x, %X)
check_format('010',
b'%03i', c_int(10))
check_format('0010',
b'%0.4i', c_int(10))
- check_format('-123',
- b'%i', c_int(-123))
- check_format('-123',
- b'%li', c_long(-123))
- check_format('-123',
- b'%lli', c_longlong(-123))
- check_format('-123',
- b'%zi', c_ssize_t(-123))
-
- check_format('-123',
- b'%d', c_int(-123))
- check_format('-123',
- b'%ld', c_long(-123))
- check_format('-123',
- b'%lld', c_longlong(-123))
- check_format('-123',
- b'%zd', c_ssize_t(-123))
-
- check_format('123',
- b'%u', c_uint(123))
- check_format('123',
- b'%lu', c_ulong(123))
- check_format('123',
- b'%llu', c_ulonglong(123))
- check_format('123',
- b'%zu', c_size_t(123))
+ for conv, signed, value, expected in [
+ (b'i', True, -123, '-123'),
+ (b'd', True, -123, '-123'),
+ (b'u', False, 123, '123'),
+ (b'o', False, 0o123, '123'),
+ (b'x', False, 0xabc, 'abc'),
+ (b'X', False, 0xabc, 'ABC'),
+ ]:
+ for mod, ctype in [
+ (b'', c_int if signed else c_uint),
+ (b'l', c_long if signed else c_ulong),
+ (b'll', c_longlong if signed else c_ulonglong),
+ (b'z', c_ssize_t if signed else c_size_t),
+ ]:
+ with self.subTest(format=b'%' + mod + conv):
+ check_format(expected,
+ b'%' + mod + conv, ctype(value))
# test long output
min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
@@ -494,40 +490,144 @@ class CAPITest(unittest.TestCase):
PyUnicode_FromFormat(b'%p', c_void_p(-1))
# test padding (width and/or precision)
- check_format('123'.rjust(10, '0'),
- b'%010i', c_int(123))
- check_format('123'.rjust(100),
- b'%100i', c_int(123))
- check_format('123'.rjust(100, '0'),
- b'%.100i', c_int(123))
- check_format('123'.rjust(80, '0').rjust(100),
- b'%100.80i', c_int(123))
-
- check_format('123'.rjust(10, '0'),
- b'%010u', c_uint(123))
- check_format('123'.rjust(100),
- b'%100u', c_uint(123))
- check_format('123'.rjust(100, '0'),
- b'%.100u', c_uint(123))
- check_format('123'.rjust(80, '0').rjust(100),
- b'%100.80u', c_uint(123))
-
- check_format('123'.rjust(10, '0'),
- b'%010x', c_int(0x123))
- check_format('123'.rjust(100),
- b'%100x', c_int(0x123))
- check_format('123'.rjust(100, '0'),
- b'%.100x', c_int(0x123))
- check_format('123'.rjust(80, '0').rjust(100),
- b'%100.80x', c_int(0x123))
+ check_format('123', b'%2i', c_int(123))
+ check_format(' 123', b'%10i', c_int(123))
+ check_format('0000000123', b'%010i', c_int(123))
+ check_format('123 ', b'%-10i', c_int(123))
+ check_format('123 ', b'%-010i', c_int(123))
+ check_format('123', b'%.2i', c_int(123))
+ check_format('0000123', b'%.7i', c_int(123))
+ check_format(' 123', b'%10.2i', c_int(123))
+ check_format(' 0000123', b'%10.7i', c_int(123))
+ check_format('0000000123', b'%010.7i', c_int(123))
+ check_format('0000123 ', b'%-10.7i', c_int(123))
+ check_format('0000123 ', b'%-010.7i', c_int(123))
+
+ check_format('-123', b'%2i', c_int(-123))
+ check_format(' -123', b'%10i', c_int(-123))
+ check_format('-000000123', b'%010i', c_int(-123))
+ check_format('-123 ', b'%-10i', c_int(-123))
+ check_format('-123 ', b'%-010i', c_int(-123))
+ check_format('-123', b'%.2i', c_int(-123))
+ check_format('-0000123', b'%.7i', c_int(-123))
+ check_format(' -123', b'%10.2i', c_int(-123))
+ check_format(' -0000123', b'%10.7i', c_int(-123))
+ check_format('-000000123', b'%010.7i', c_int(-123))
+ check_format('-0000123 ', b'%-10.7i', c_int(-123))
+ check_format('-0000123 ', b'%-010.7i', c_int(-123))
+
+ check_format('123', b'%2u', c_uint(123))
+ check_format(' 123', b'%10u', c_uint(123))
+ check_format('0000000123', b'%010u', c_uint(123))
+ check_format('123 ', b'%-10u', c_uint(123))
+ check_format('123 ', b'%-010u', c_uint(123))
+ check_format('123', b'%.2u', c_uint(123))
+ check_format('0000123', b'%.7u', c_uint(123))
+ check_format(' 123', b'%10.2u', c_uint(123))
+ check_format(' 0000123', b'%10.7u', c_uint(123))
+ check_format('0000000123', b'%010.7u', c_uint(123))
+ check_format('0000123 ', b'%-10.7u', c_uint(123))
+ check_format('0000123 ', b'%-010.7u', c_uint(123))
+
+ check_format('123', b'%2o', c_uint(0o123))
+ check_format(' 123', b'%10o', c_uint(0o123))
+ check_format('0000000123', b'%010o', c_uint(0o123))
+ check_format('123 ', b'%-10o', c_uint(0o123))
+ check_format('123 ', b'%-010o', c_uint(0o123))
+ check_format('123', b'%.2o', c_uint(0o123))
+ check_format('0000123', b'%.7o', c_uint(0o123))
+ check_format(' 123', b'%10.2o', c_uint(0o123))
+ check_format(' 0000123', b'%10.7o', c_uint(0o123))
+ check_format('0000000123', b'%010.7o', c_uint(0o123))
+ check_format('0000123 ', b'%-10.7o', c_uint(0o123))
+ check_format('0000123 ', b'%-010.7o', c_uint(0o123))
+
+ check_format('abc', b'%2x', c_uint(0xabc))
+ check_format(' abc', b'%10x', c_uint(0xabc))
+ check_format('0000000abc', b'%010x', c_uint(0xabc))
+ check_format('abc ', b'%-10x', c_uint(0xabc))
+ check_format('abc ', b'%-010x', c_uint(0xabc))
+ check_format('abc', b'%.2x', c_uint(0xabc))
+ check_format('0000abc', b'%.7x', c_uint(0xabc))
+ check_format(' abc', b'%10.2x', c_uint(0xabc))
+ check_format(' 0000abc', b'%10.7x', c_uint(0xabc))
+ check_format('0000000abc', b'%010.7x', c_uint(0xabc))
+ check_format('0000abc ', b'%-10.7x', c_uint(0xabc))
+ check_format('0000abc ', b'%-010.7x', c_uint(0xabc))
+
+ check_format('ABC', b'%2X', c_uint(0xabc))
+ check_format(' ABC', b'%10X', c_uint(0xabc))
+ check_format('0000000ABC', b'%010X', c_uint(0xabc))
+ check_format('ABC ', b'%-10X', c_uint(0xabc))
+ check_format('ABC ', b'%-010X', c_uint(0xabc))
+ check_format('ABC', b'%.2X', c_uint(0xabc))
+ check_format('0000ABC', b'%.7X', c_uint(0xabc))
+ check_format(' ABC', b'%10.2X', c_uint(0xabc))
+ check_format(' 0000ABC', b'%10.7X', c_uint(0xabc))
+ check_format('0000000ABC', b'%010.7X', c_uint(0xabc))
+ check_format('0000ABC ', b'%-10.7X', c_uint(0xabc))
+ check_format('0000ABC ', b'%-010.7X', c_uint(0xabc))
# test %A
check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
# test %V
- check_format('repr=abc',
- b'repr=%V', 'abc', b'xyz')
+ check_format('abc',
+ b'%V', 'abc', b'xyz')
+ check_format('xyz',
+ b'%V', None, b'xyz')
+
+ # test %ls
+ check_format('abc', b'%ls', c_wchar_p('abc'))
+ check_format('\u4eba\u6c11', b'%ls', c_wchar_p('\u4eba\u6c11'))
+ check_format('\U0001f4bb+\U0001f40d',
+ b'%ls', c_wchar_p('\U0001f4bb+\U0001f40d'))
+ check_format(' ab', b'%5.2ls', c_wchar_p('abc'))
+ check_format(' \u4eba\u6c11', b'%5ls', c_wchar_p('\u4eba\u6c11'))
+ check_format(' \U0001f4bb+\U0001f40d',
+ b'%5ls', c_wchar_p('\U0001f4bb+\U0001f40d'))
+ check_format('\u4eba', b'%.1ls', c_wchar_p('\u4eba\u6c11'))
+ check_format('\U0001f4bb' if sizeof(c_wchar) > 2 else '\ud83d',
+ b'%.1ls', c_wchar_p('\U0001f4bb+\U0001f40d'))
+ check_format('\U0001f4bb+' if sizeof(c_wchar) > 2 else '\U0001f4bb',
+ b'%.2ls', c_wchar_p('\U0001f4bb+\U0001f40d'))
+
+ # test %lV
+ check_format('abc',
+ b'%lV', 'abc', c_wchar_p('xyz'))
+ check_format('xyz',
+ b'%lV', None, c_wchar_p('xyz'))
+ check_format('\u4eba\u6c11',
+ b'%lV', None, c_wchar_p('\u4eba\u6c11'))
+ check_format('\U0001f4bb+\U0001f40d',
+ b'%lV', None, c_wchar_p('\U0001f4bb+\U0001f40d'))
+ check_format(' ab',
+ b'%5.2lV', None, c_wchar_p('abc'))
+ check_format(' \u4eba\u6c11',
+ b'%5lV', None, c_wchar_p('\u4eba\u6c11'))
+ check_format(' \U0001f4bb+\U0001f40d',
+ b'%5lV', None, c_wchar_p('\U0001f4bb+\U0001f40d'))
+ check_format('\u4eba',
+ b'%.1lV', None, c_wchar_p('\u4eba\u6c11'))
+ check_format('\U0001f4bb' if sizeof(c_wchar) > 2 else '\ud83d',
+ b'%.1lV', None, c_wchar_p('\U0001f4bb+\U0001f40d'))
+ check_format('\U0001f4bb+' if sizeof(c_wchar) > 2 else '\U0001f4bb',
+ b'%.2lV', None, c_wchar_p('\U0001f4bb+\U0001f40d'))
+
+ # test variable width and precision
+ check_format(' abc', b'%*s', c_int(5), b'abc')
+ check_format('ab', b'%.*s', c_int(2), b'abc')
+ check_format(' ab', b'%*.*s', c_int(5), c_int(2), b'abc')
+ check_format(' abc', b'%*U', c_int(5), 'abc')
+ check_format('ab', b'%.*U', c_int(2), 'abc')
+ check_format(' ab', b'%*.*U', c_int(5), c_int(2), 'abc')
+ check_format(' ab', b'%*.*V', c_int(5), c_int(2), None, b'abc')
+ check_format(' ab', b'%*.*lV', c_int(5), c_int(2),
+ None, c_wchar_p('abc'))
+ check_format(' 123', b'%*i', c_int(8), c_int(123))
+ check_format('00123', b'%.*i', c_int(5), c_int(123))
+ check_format(' 00123', b'%*.*i', c_int(8), c_int(5), c_int(123))
# test %p
# We cannot test the exact result,
@@ -564,10 +664,11 @@ class CAPITest(unittest.TestCase):
check_format('',
b'%s', b'')
- # check for crashes
+ # test invalid format strings. these tests are just here
+ # to check for crashes and should not be considered as specifications
for fmt in (b'%', b'%0', b'%01', b'%.', b'%.1',
b'%0%s', b'%1%s', b'%.%s', b'%.1%s', b'%1abc',
- b'%l', b'%ll', b'%z', b'%ls', b'%lls', b'%zs'):
+ b'%l', b'%ll', b'%z', b'%lls', b'%zs'):
with self.subTest(fmt=fmt):
self.assertRaisesRegex(SystemError, 'invalid format string',
PyUnicode_FromFormat, fmt, b'abc')
diff --git a/Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst b/Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst
new file mode 100644
index 0000000..e3730eb
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2022-10-29-10-13-20.gh-issue-98836.Cy5h_z.rst
@@ -0,0 +1,4 @@
+Add support for more formatting options (left aligning, octals, uppercase
+hexadecimals, :c:expr:`intmax_t`, :c:expr:`ptrdiff_t`, :c:expr:`wchar_t` C
+strings, variable width and precision) in :c:func:`PyUnicode_FromFormat` and
+:c:func:`PyUnicode_FromFormatV`.
diff --git a/Modules/_ssl.c b/Modules/_ssl.c
index 016a5a5..5bf6b3b 100644
--- a/Modules/_ssl.c
+++ b/Modules/_ssl.c
@@ -1330,10 +1330,8 @@ _get_peer_alt_names (_sslmodulestate *state, X509 *certificate) {
p[0], p[1], p[2], p[3]
);
} else if (name->d.ip->length == 16) {
- /* PyUnicode_FromFormat() does not support %X */
unsigned char *p = name->d.ip->data;
- len = sprintf(
- buf,
+ v = PyUnicode_FromFormat(
"%X:%X:%X:%X:%X:%X:%X:%X",
p[0] << 8 | p[1],
p[2] << 8 | p[3],
@@ -1344,7 +1342,6 @@ _get_peer_alt_names (_sslmodulestate *state, X509 *certificate) {
p[12] << 8 | p[13],
p[14] << 8 | p[15]
);
- v = PyUnicode_FromStringAndSize(buf, len);
} else {
v = PyUnicode_FromString("<invalid>");
}
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c
index 7dd3b9c..73929ea 100644
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@@ -1,3 +1,5 @@
+#include <stddef.h> // ptrdiff_t
+
#define PY_SSIZE_T_CLEAN
#include "parts.h"
@@ -1130,25 +1132,48 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1( "%c", "c", 'c');
CHECK_FORMAT_1( "%0c", "c", 'c');
CHECK_FORMAT_1("%00c", "c", 'c');
- CHECK_FORMAT_1( "%2c", "c", 'c');
- CHECK_FORMAT_1("%02c", "c", 'c');
- CHECK_FORMAT_1("%.0c", "c", 'c');
- CHECK_FORMAT_1("%.2c", "c", 'c');
+ CHECK_FORMAT_1( "%2c", NULL, 'c');
+ CHECK_FORMAT_1("%02c", NULL, 'c');
+ CHECK_FORMAT_1("%.0c", NULL, 'c');
+ CHECK_FORMAT_1("%.2c", NULL, 'c');
// Integers
CHECK_FORMAT_1("%d", "123", (int)123);
CHECK_FORMAT_1("%i", "123", (int)123);
CHECK_FORMAT_1("%u", "123", (unsigned int)123);
+ CHECK_FORMAT_1("%x", "7b", (unsigned int)123);
+ CHECK_FORMAT_1("%X", "7B", (unsigned int)123);
+ CHECK_FORMAT_1("%o", "173", (unsigned int)123);
CHECK_FORMAT_1("%ld", "123", (long)123);
CHECK_FORMAT_1("%li", "123", (long)123);
CHECK_FORMAT_1("%lu", "123", (unsigned long)123);
+ CHECK_FORMAT_1("%lx", "7b", (unsigned long)123);
+ CHECK_FORMAT_1("%lX", "7B", (unsigned long)123);
+ CHECK_FORMAT_1("%lo", "173", (unsigned long)123);
CHECK_FORMAT_1("%lld", "123", (long long)123);
CHECK_FORMAT_1("%lli", "123", (long long)123);
CHECK_FORMAT_1("%llu", "123", (unsigned long long)123);
+ CHECK_FORMAT_1("%llx", "7b", (unsigned long long)123);
+ CHECK_FORMAT_1("%llX", "7B", (unsigned long long)123);
+ CHECK_FORMAT_1("%llo", "173", (unsigned long long)123);
CHECK_FORMAT_1("%zd", "123", (Py_ssize_t)123);
CHECK_FORMAT_1("%zi", "123", (Py_ssize_t)123);
CHECK_FORMAT_1("%zu", "123", (size_t)123);
- CHECK_FORMAT_1("%x", "7b", (int)123);
+ CHECK_FORMAT_1("%zx", "7b", (size_t)123);
+ CHECK_FORMAT_1("%zX", "7B", (size_t)123);
+ CHECK_FORMAT_1("%zo", "173", (size_t)123);
+ CHECK_FORMAT_1("%td", "123", (ptrdiff_t)123);
+ CHECK_FORMAT_1("%ti", "123", (ptrdiff_t)123);
+ CHECK_FORMAT_1("%tu", "123", (ptrdiff_t)123);
+ CHECK_FORMAT_1("%tx", "7b", (ptrdiff_t)123);
+ CHECK_FORMAT_1("%tX", "7B", (ptrdiff_t)123);
+ CHECK_FORMAT_1("%to", "173", (ptrdiff_t)123);
+ CHECK_FORMAT_1("%jd", "123", (intmax_t)123);
+ CHECK_FORMAT_1("%ji", "123", (intmax_t)123);
+ CHECK_FORMAT_1("%ju", "123", (uintmax_t)123);
+ CHECK_FORMAT_1("%jx", "7b", (uintmax_t)123);
+ CHECK_FORMAT_1("%jX", "7B", (uintmax_t)123);
+ CHECK_FORMAT_1("%jo", "173", (uintmax_t)123);
CHECK_FORMAT_1("%d", "-123", (int)-123);
CHECK_FORMAT_1("%i", "-123", (int)-123);
@@ -1158,7 +1183,10 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%lli", "-123", (long long)-123);
CHECK_FORMAT_1("%zd", "-123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%zi", "-123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%x", "ffffff85", (int)-123);
+ CHECK_FORMAT_1("%td", "-123", (ptrdiff_t)-123);
+ CHECK_FORMAT_1("%ti", "-123", (ptrdiff_t)-123);
+ CHECK_FORMAT_1("%jd", "-123", (intmax_t)-123);
+ CHECK_FORMAT_1("%ji", "-123", (intmax_t)-123);
// Integers: width < length
CHECK_FORMAT_1("%1d", "123", (int)123);
@@ -1183,7 +1211,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%1lli", "-123", (long long)-123);
CHECK_FORMAT_1("%1zd", "-123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%1zi", "-123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%1x", "ffffff85", (int)-123);
// Integers: width > length
CHECK_FORMAT_1("%5d", " 123", (int)123);
@@ -1208,7 +1235,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%5lli", " -123", (long long)-123);
CHECK_FORMAT_1("%5zd", " -123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%5zi", " -123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%9x", " ffffff85", (int)-123);
// Integers: width > length, 0-flag
CHECK_FORMAT_1("%05d", "00123", (int)123);
@@ -1233,7 +1259,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%05lli", "-0123", (long long)-123);
CHECK_FORMAT_1("%05zd", "-0123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%05zi", "-0123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%09x", "0ffffff85", (int)-123);
// Integers: precision < length
CHECK_FORMAT_1("%.1d", "123", (int)123);
@@ -1258,7 +1283,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%.1lli", "-123", (long long)-123);
CHECK_FORMAT_1("%.1zd", "-123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%.1zi", "-123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%.1x", "ffffff85", (int)-123);
// Integers: precision > length
CHECK_FORMAT_1("%.5d", "00123", (int)123);
@@ -1283,7 +1307,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%.5lli", "-00123", (long long)-123);
CHECK_FORMAT_1("%.5zd", "-00123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%.5zi", "-00123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%.9x", "0ffffff85", (int)-123);
// Integers: width > precision > length
CHECK_FORMAT_1("%7.5d", " 00123", (int)123);
@@ -1308,7 +1331,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%7.5lli", " -00123", (long long)-123);
CHECK_FORMAT_1("%7.5zd", " -00123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%7.5zi", " -00123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%10.9x", " 0ffffff85", (int)-123);
// Integers: width > precision > length, 0-flag
CHECK_FORMAT_1("%07.5d", "0000123", (int)123);
@@ -1333,7 +1355,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%07.5lli", "-000123", (long long)-123);
CHECK_FORMAT_1("%07.5zd", "-000123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%07.5zi", "-000123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%010.9x", "00ffffff85", (int)-123);
// Integers: precision > width > length
CHECK_FORMAT_1("%5.7d", "0000123", (int)123);
@@ -1358,7 +1379,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%5.7lli", "-0000123", (long long)-123);
CHECK_FORMAT_1("%5.7zd", "-0000123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%5.7zi", "-0000123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%9.10x", "00ffffff85", (int)-123);
// Integers: precision > width > length, 0-flag
CHECK_FORMAT_1("%05.7d", "0000123", (int)123);
@@ -1383,7 +1403,6 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
CHECK_FORMAT_1("%05.7lli", "-0000123", (long long)-123);
CHECK_FORMAT_1("%05.7zd", "-0000123", (Py_ssize_t)-123);
CHECK_FORMAT_1("%05.7zi", "-0000123", (Py_ssize_t)-123);
- CHECK_FORMAT_1("%09.10x", "00ffffff85", (int)-123);
// Integers: precision = 0, arg = 0 (empty string in C)
CHECK_FORMAT_1("%.0d", "0", (int)0);
@@ -1402,66 +1421,80 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
// Strings
CHECK_FORMAT_1("%s", "None", "None");
+ CHECK_FORMAT_1("%ls", "None", L"None");
CHECK_FORMAT_1("%U", "None", unicode);
CHECK_FORMAT_1("%A", "None", Py_None);
CHECK_FORMAT_1("%S", "None", Py_None);
CHECK_FORMAT_1("%R", "None", Py_None);
CHECK_FORMAT_2("%V", "None", unicode, "ignored");
CHECK_FORMAT_2("%V", "None", NULL, "None");
+ CHECK_FORMAT_2("%lV", "None", NULL, L"None");
// Strings: width < length
CHECK_FORMAT_1("%1s", "None", "None");
+ CHECK_FORMAT_1("%1ls", "None", L"None");
CHECK_FORMAT_1("%1U", "None", unicode);
CHECK_FORMAT_1("%1A", "None", Py_None);
CHECK_FORMAT_1("%1S", "None", Py_None);
CHECK_FORMAT_1("%1R", "None", Py_None);
CHECK_FORMAT_2("%1V", "None", unicode, "ignored");
CHECK_FORMAT_2("%1V", "None", NULL, "None");
+ CHECK_FORMAT_2("%1lV", "None", NULL, L"None");
// Strings: width > length
CHECK_FORMAT_1("%5s", " None", "None");
+ CHECK_FORMAT_1("%5ls", " None", L"None");
CHECK_FORMAT_1("%5U", " None", unicode);
CHECK_FORMAT_1("%5A", " None", Py_None);
CHECK_FORMAT_1("%5S", " None", Py_None);
CHECK_FORMAT_1("%5R", " None", Py_None);
CHECK_FORMAT_2("%5V", " None", unicode, "ignored");
CHECK_FORMAT_2("%5V", " None", NULL, "None");
+ CHECK_FORMAT_2("%5lV", " None", NULL, L"None");
// Strings: precision < length
CHECK_FORMAT_1("%.1s", "N", "None");
+ CHECK_FORMAT_1("%.1ls", "N", L"None");
CHECK_FORMAT_1("%.1U", "N", unicode);
CHECK_FORMAT_1("%.1A", "N", Py_None);
CHECK_FORMAT_1("%.1S", "N", Py_None);
CHECK_FORMAT_1("%.1R", "N", Py_None);
CHECK_FORMAT_2("%.1V", "N", unicode, "ignored");
CHECK_FORMAT_2("%.1V", "N", NULL, "None");
+ CHECK_FORMAT_2("%.1lV", "N", NULL, L"None");
// Strings: precision > length
CHECK_FORMAT_1("%.5s", "None", "None");
+ CHECK_FORMAT_1("%.5ls", "None", L"None");
CHECK_FORMAT_1("%.5U", "None", unicode);
CHECK_FORMAT_1("%.5A", "None", Py_None);
CHECK_FORMAT_1("%.5S", "None", Py_None);
CHECK_FORMAT_1("%.5R", "None", Py_None);
CHECK_FORMAT_2("%.5V", "None", unicode, "ignored");
CHECK_FORMAT_2("%.5V", "None", NULL, "None");
+ CHECK_FORMAT_2("%.5lV", "None", NULL, L"None");
// Strings: precision < length, width > length
CHECK_FORMAT_1("%5.1s", " N", "None");
+ CHECK_FORMAT_1("%5.1ls"," N", L"None");
CHECK_FORMAT_1("%5.1U", " N", unicode);
CHECK_FORMAT_1("%5.1A", " N", Py_None);
CHECK_FORMAT_1("%5.1S", " N", Py_None);
CHECK_FORMAT_1("%5.1R", " N", Py_None);
CHECK_FORMAT_2("%5.1V", " N", unicode, "ignored");
CHECK_FORMAT_2("%5.1V", " N", NULL, "None");
+ CHECK_FORMAT_2("%5.1lV"," N", NULL, L"None");
// Strings: width < length, precision > length
CHECK_FORMAT_1("%1.5s", "None", "None");
+ CHECK_FORMAT_1("%1.5ls", "None", L"None");
CHECK_FORMAT_1("%1.5U", "None", unicode);
CHECK_FORMAT_1("%1.5A", "None", Py_None);
CHECK_FORMAT_1("%1.5S", "None", Py_None);
CHECK_FORMAT_1("%1.5R", "None", Py_None);
CHECK_FORMAT_2("%1.5V", "None", unicode, "ignored");
CHECK_FORMAT_2("%1.5V", "None", NULL, "None");
+ CHECK_FORMAT_2("%1.5lV", "None", NULL, L"None");
Py_XDECREF(unicode);
Py_RETURN_NONE;
diff --git a/Modules/selectmodule.c b/Modules/selectmodule.c
index 79bd5b5..9a4943c 100644
--- a/Modules/selectmodule.c
+++ b/Modules/selectmodule.c
@@ -1849,14 +1849,11 @@ static PyObject *
kqueue_event_repr(kqueue_event_Object *s)
{
- char buf[1024];
- PyOS_snprintf(
- buf, sizeof(buf),
+ return PyUnicode_FromFormat(
"<select.kevent ident=%zu filter=%d flags=0x%x fflags=0x%x "
"data=0x%llx udata=%p>",
(size_t)(s->e.ident), (int)s->e.filter, (unsigned int)s->e.flags,
(unsigned int)s->e.fflags, (long long)(s->e.data), (void *)s->e.udata);
- return PyUnicode_FromString(buf);
}
static int
diff --git a/Modules/socketmodule.c b/Modules/socketmodule.c
index c11fb44..a86aaed 100644
--- a/Modules/socketmodule.c
+++ b/Modules/socketmodule.c
@@ -1339,8 +1339,6 @@ setbdaddr(const char *name, bdaddr_t *bdaddr)
static PyObject *
makebdaddr(bdaddr_t *bdaddr)
{
- char buf[(6 * 2) + 5 + 1];
-
#ifdef MS_WINDOWS
int i;
unsigned int octets[6];
@@ -1349,16 +1347,14 @@ makebdaddr(bdaddr_t *bdaddr)
octets[i] = ((*bdaddr) >> (8 * i)) & 0xFF;
}
- sprintf(buf, "%02X:%02X:%02X:%02X:%02X:%02X",
+ return PyUnicode_FromFormat("%02X:%02X:%02X:%02X:%02X:%02X",
octets[5], octets[4], octets[3],
octets[2], octets[1], octets[0]);
#else
- sprintf(buf, "%02X:%02X:%02X:%02X:%02X:%02X",
+ return PyUnicode_FromFormat("%02X:%02X:%02X:%02X:%02X:%02X",
bdaddr->b[5], bdaddr->b[4], bdaddr->b[3],
bdaddr->b[2], bdaddr->b[1], bdaddr->b[0]);
#endif
-
- return PyUnicode_FromString(buf);
}
#endif
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7726f2f..ec5684b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -56,6 +56,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
#include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings()
#include "stringlib/eq.h" // unicode_eq()
+#include <stddef.h> // ptrdiff_t
#ifdef MS_WINDOWS
#include <windows.h>
@@ -2285,14 +2286,15 @@ PyUnicode_AsUCS4Copy(PyObject *string)
return as_ucs4(string, NULL, 0, 1);
}
-/* maximum number of characters required for output of %lld or %p.
- We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
- plus 1 for the sign. 53/22 is an upper bound for log10(256). */
-#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
+/* maximum number of characters required for output of %jo or %jd or %p.
+ We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
+ plus 1 for the sign, plus 2 for the 0x prefix (for %p),
+ plus 1 for the terminal NUL. */
+#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
static int
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
- Py_ssize_t width, Py_ssize_t precision)
+ Py_ssize_t width, Py_ssize_t precision, int flags)
{
Py_ssize_t length, fill, arglen;
Py_UCS4 maxchar;
@@ -2314,8 +2316,8 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
return -1;
- if (width > length) {
- fill = width - length;
+ fill = Py_MAX(width - length, 0);
+ if (fill && !(flags & F_LJUST)) {
if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
return -1;
writer->pos += fill;
@@ -2324,12 +2326,19 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
str, 0, length);
writer->pos += length;
+
+ if (fill && (flags & F_LJUST)) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
+ return -1;
+ writer->pos += fill;
+ }
+
return 0;
}
static int
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
- Py_ssize_t width, Py_ssize_t precision)
+ Py_ssize_t width, Py_ssize_t precision, int flags)
{
/* UTF-8 */
Py_ssize_t length;
@@ -2349,24 +2358,58 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
if (unicode == NULL)
return -1;
- res = unicode_fromformat_write_str(writer, unicode, width, -1);
+ res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
+ Py_DECREF(unicode);
+ return res;
+}
+
+static int
+unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
+ Py_ssize_t width, Py_ssize_t precision, int flags)
+{
+ /* wchar_t string (not UTF-8) */
+ Py_ssize_t length;
+ PyObject *unicode;
+ int res;
+
+ if (precision == -1) {
+ length = wcslen(str);
+ }
+ else {
+ length = 0;
+ while (length < precision && str[length]) {
+ length++;
+ }
+ }
+ unicode = PyUnicode_FromWideChar(str, length);
+ if (unicode == NULL)
+ return -1;
+
+ res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
Py_DECREF(unicode);
return res;
}
+#define F_LONG 1
+#define F_LONGLONG 2
+#define F_SIZE 3
+#define F_PTRDIFF 4
+#define F_INTMAX 5
+static const char * const formats[] = {"%d", "%ld", "%lld", "%zd", "%td", "%jd"};
+static const char * const formats_o[] = {"%o", "%lo", "%llo", "%zo", "%to", "%jo"};
+static const char * const formats_u[] = {"%u", "%lu", "%llu", "%zu", "%tu", "%ju"};
+static const char * const formats_x[] = {"%x", "%lx", "%llx", "%zx", "%tx", "%jx"};
+static const char * const formats_X[] = {"%X", "%lX", "%llX", "%zX", "%tX", "%jX"};
+
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
const char *f, va_list *vargs)
{
const char *p;
Py_ssize_t len;
- int zeropad;
+ int flags = 0;
Py_ssize_t width;
Py_ssize_t precision;
- int longflag;
- int longlongflag;
- int size_tflag;
- Py_ssize_t fill;
p = f;
f++;
@@ -2377,15 +2420,31 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
return f;
}
- zeropad = 0;
- if (*f == '0') {
- zeropad = 1;
- f++;
+ /* Parse flags. Example: "%-i" => flags=F_LJUST. */
+ /* Flags '+', ' ' and '#' are not particularly useful.
+ * They are not worth the implementation and maintenance costs.
+ * In addition, '#' should add "0" for "o" conversions for compatibility
+ * with printf, but it would confuse Python users. */
+ while (1) {
+ switch (*f++) {
+ case '-': flags |= F_LJUST; continue;
+ case '0': flags |= F_ZERO; continue;
+ }
+ f--;
+ break;
}
/* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
width = -1;
- if (Py_ISDIGIT((unsigned)*f)) {
+ if (*f == '*') {
+ width = va_arg(*vargs, int);
+ if (width < 0) {
+ flags |= F_LJUST;
+ width = -width;
+ }
+ f++;
+ }
+ else if (Py_ISDIGIT((unsigned)*f)) {
width = *f - '0';
f++;
while (Py_ISDIGIT((unsigned)*f)) {
@@ -2401,7 +2460,14 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
precision = -1;
if (*f == '.') {
f++;
- if (Py_ISDIGIT((unsigned)*f)) {
+ if (*f == '*') {
+ precision = va_arg(*vargs, int);
+ if (precision < 0) {
+ precision = -2;
+ }
+ f++;
+ }
+ else if (Py_ISDIGIT((unsigned)*f)) {
precision = (*f - '0');
f++;
while (Py_ISDIGIT((unsigned)*f)) {
@@ -2416,31 +2482,48 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
}
}
- /* Handle %ld, %lu, %lld and %llu. */
- longflag = 0;
- longlongflag = 0;
- size_tflag = 0;
+ int sizemod = 0;
if (*f == 'l') {
- if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
- longflag = 1;
- ++f;
- }
- else if (f[1] == 'l' &&
- (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
- longlongflag = 1;
+ if (f[1] == 'l') {
+ sizemod = F_LONGLONG;
f += 2;
}
+ else {
+ sizemod = F_LONG;
+ ++f;
+ }
}
- /* handle the size_t flag. */
- else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
- size_tflag = 1;
+ else if (*f == 'z') {
+ sizemod = F_SIZE;
+ ++f;
+ }
+ else if (*f == 't') {
+ sizemod = F_PTRDIFF;
+ ++f;
+ }
+ else if (*f == 'j') {
+ sizemod = F_INTMAX;
++f;
}
-
if (f[0] != '\0' && f[1] == '\0')
writer->overallocate = 0;
switch (*f) {
+ case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
+ break;
+ case 'c': case 'p':
+ if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
+ break;
+ case 's':
+ case 'V':
+ if (sizemod && sizemod != F_LONG) goto invalid_format;
+ break;
+ default:
+ if (sizemod) goto invalid_format;
+ break;
+ }
+
+ switch (*f) {
case 'c':
{
int ordinal = va_arg(*vargs, int);
@@ -2454,91 +2537,98 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
break;
}
- case 'i':
- case 'd':
- case 'u':
- case 'x':
+ case 'd': case 'i':
+ case 'o': case 'u': case 'x': case 'X':
{
/* used by sprintf */
- char buffer[MAX_LONG_LONG_CHARS];
- Py_ssize_t arglen;
-
- if (*f == 'u') {
- if (longflag) {
- len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
- }
- else if (longlongflag) {
- len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
- }
- else if (size_tflag) {
- len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
- }
- else {
- len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
- }
- }
- else if (*f == 'x') {
- len = sprintf(buffer, "%x", va_arg(*vargs, int));
- }
- else {
- if (longflag) {
- len = sprintf(buffer, "%li", va_arg(*vargs, long));
- }
- else if (longlongflag) {
- len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
- }
- else if (size_tflag) {
- len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
- }
- else {
- len = sprintf(buffer, "%i", va_arg(*vargs, int));
- }
+ char buffer[MAX_INTMAX_CHARS];
+ const char *fmt = NULL;
+ switch (*f) {
+ case 'o': fmt = formats_o[sizemod]; break;
+ case 'u': fmt = formats_u[sizemod]; break;
+ case 'x': fmt = formats_x[sizemod]; break;
+ case 'X': fmt = formats_X[sizemod]; break;
+ default: fmt = formats[sizemod]; break;
+ }
+ int issigned = (*f == 'd' || *f == 'i');
+ switch (sizemod) {
+ case F_LONG:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, long)) :
+ sprintf(buffer, fmt, va_arg(*vargs, unsigned long));
+ break;
+ case F_LONGLONG:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, long long)) :
+ sprintf(buffer, fmt, va_arg(*vargs, unsigned long long));
+ break;
+ case F_SIZE:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, Py_ssize_t)) :
+ sprintf(buffer, fmt, va_arg(*vargs, size_t));
+ break;
+ case F_PTRDIFF:
+ len = sprintf(buffer, fmt, va_arg(*vargs, ptrdiff_t));
+ break;
+ case F_INTMAX:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, intmax_t)) :
+ sprintf(buffer, fmt, va_arg(*vargs, uintmax_t));
+ break;
+ default:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, int)) :
+ sprintf(buffer, fmt, va_arg(*vargs, unsigned int));
+ break;
}
assert(len >= 0);
- int negative = (buffer[0] == '-');
- len -= negative;
+ int sign = (buffer[0] == '-');
+ len -= sign;
precision = Py_MAX(precision, len);
- width = Py_MAX(width, precision + negative);
+ width = Py_MAX(width, precision + sign);
+ if ((flags & F_ZERO) && !(flags & F_LJUST)) {
+ precision = width - sign;
+ }
- arglen = Py_MAX(precision, width);
- if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
- return NULL;
+ Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
+ Py_ssize_t zeropad = Py_MAX(precision - len, 0);
- if (width > precision) {
- if (negative && zeropad) {
- if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
- return NULL;
- }
+ if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
+ return NULL;
- Py_UCS4 fillchar = zeropad?'0':' ';
- fill = width - precision - negative;
- if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
+ if (spacepad && !(flags & F_LJUST)) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
return NULL;
- writer->pos += fill;
+ writer->pos += spacepad;
+ }
- if (negative && !zeropad) {
- if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
- return NULL;
- }
+ if (sign) {
+ if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
+ return NULL;
}
- if (precision > len) {
- fill = precision - len;
- if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
+ if (zeropad) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
return NULL;
- writer->pos += fill;
+ writer->pos += zeropad;
}
- if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[negative], len) < 0)
+ if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
return NULL;
+
+ if (spacepad && (flags & F_LJUST)) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
+ return NULL;
+ writer->pos += spacepad;
+ }
break;
}
case 'p':
{
- char number[MAX_LONG_LONG_CHARS];
+ char number[MAX_INTMAX_CHARS];
len = sprintf(number, "%p", va_arg(*vargs, void*));
assert(len >= 0);
@@ -2561,10 +2651,17 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
case 's':
{
- /* UTF-8 */
- const char *s = va_arg(*vargs, const char*);
- if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
- return NULL;
+ if (sizemod) {
+ const wchar_t *s = va_arg(*vargs, const wchar_t*);
+ if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
+ return NULL;
+ }
+ else {
+ /* UTF-8 */
+ const char *s = va_arg(*vargs, const char*);
+ if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
+ return NULL;
+ }
break;
}
@@ -2573,7 +2670,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
PyObject *obj = va_arg(*vargs, PyObject *);
assert(obj && _PyUnicode_CHECK(obj));
- if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
+ if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
return NULL;
break;
}
@@ -2581,15 +2678,27 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
case 'V':
{
PyObject *obj = va_arg(*vargs, PyObject *);
- const char *str = va_arg(*vargs, const char *);
+ const char *str;
+ const wchar_t *wstr;
+ if (sizemod) {
+ wstr = va_arg(*vargs, const wchar_t*);
+ }
+ else {
+ str = va_arg(*vargs, const char *);
+ }
if (obj) {
assert(_PyUnicode_CHECK(obj));
- if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
+ if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
+ return NULL;
+ }
+ else if (sizemod) {
+ assert(wstr != NULL);
+ if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
return NULL;
}
else {
assert(str != NULL);
- if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
+ if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
return NULL;
}
break;
@@ -2603,7 +2712,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
str = PyObject_Str(obj);
if (!str)
return NULL;
- if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
+ if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
Py_DECREF(str);
return NULL;
}
@@ -2619,7 +2728,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
repr = PyObject_Repr(obj);
if (!repr)
return NULL;
- if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
+ if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
Py_DECREF(repr);
return NULL;
}
@@ -2635,7 +2744,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
ascii = PyObject_ASCII(obj);
if (!ascii)
return NULL;
- if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
+ if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
Py_DECREF(ascii);
return NULL;
}
@@ -2644,6 +2753,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
}
default:
+ invalid_format:
PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
return NULL;
}
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index fb94fbe..fc4afcc 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1556,14 +1556,11 @@ verify_identifier(struct tok_state *tok)
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
}
Py_DECREF(s);
- // PyUnicode_FromFormatV() does not support %X
- char hex[9];
- (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
if (Py_UNICODE_ISPRINTABLE(ch)) {
- syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+ syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
}
else {
- syntaxerror(tok, "invalid non-printable character U+%s", hex);
+ syntaxerror(tok, "invalid non-printable character U+%04X", ch);
}
return 0;
}
@@ -2541,9 +2538,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
}
if (!Py_UNICODE_ISPRINTABLE(c)) {
- char hex[9];
- (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
- return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex));
+ return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%04X", c));
}
if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {