From 62f06508e76e023a81861caee6a45e1d639bf530 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 8 Aug 2022 19:21:07 +0300 Subject: gh-95781: More strict format string checking in PyUnicode_FromFormatV() (GH-95784) An unrecognized format character in PyUnicode_FromFormat() and PyUnicode_FromFormatV() now sets a SystemError. In previous versions it caused all the rest of the format string to be copied as-is to the result string, and any extra arguments discarded. --- Doc/c-api/unicode.rst | 8 ++++-- Doc/whatsnew/3.12.rst | 6 ++++ Lib/test/test_unicode.py | 23 +++++++-------- .../2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst | 4 +++ Objects/unicodeobject.c | 33 +++++++--------------- 5 files changed, 35 insertions(+), 39 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 339ee35..99afebd 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -477,9 +477,6 @@ APIs: | | | :c:func:`PyObject_Repr`. | +-------------------+---------------------+----------------------------------+ - An unrecognized format character causes all the rest of the format string to be - copied as-is to the result string, and any extra arguments discarded. - .. note:: The width formatter unit is number of characters rather than bytes. The precision formatter unit is number of bytes for ``"%s"`` and @@ -500,6 +497,11 @@ APIs: Support width and precision formatter for ``"%s"``, ``"%A"``, ``"%U"``, ``"%V"``, ``"%S"``, ``"%R"`` added. + .. versionchanged:: 3.12 + An unrecognized format character now sets a :exc:`SystemError`. + In previous versions it caused all the rest of the format string to be + copied as-is to the result string, and any extra arguments discarded. + .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index f1696cc..6df122a 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -469,6 +469,12 @@ Porting to Python 3.12 :py:meth:`~class.__subclasses__` (using :c:func:`PyObject_CallMethod`, for example). +* An unrecognized format character in :c:func:`PyUnicode_FromFormat` and + :c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`. + In previous versions it caused all the rest of the format string to be + copied as-is to the result string, and any extra arguments discarded. + (Contributed by Serhiy Storchaka in :gh:`95781`.) + Deprecated ---------- diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 9765ed9..63bccb7 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -2642,8 +2642,6 @@ class CAPITest(unittest.TestCase): # test "%" check_format('%', - b'%') - check_format('%', b'%%') check_format('%s', b'%%s') @@ -2819,23 +2817,22 @@ class CAPITest(unittest.TestCase): check_format('repr=abc\ufffd', b'repr=%V', None, b'abc\xff') - # not supported: copy the raw format string. these tests are just here - # to check for crashes and should not be considered as specifications - check_format('%s', - b'%1%s', b'abc') - check_format('%1abc', - b'%1abc') - check_format('%+i', - b'%+i', c_int(10)) - check_format('%.%s', - b'%.%s', b'abc') - # Issue #33817: empty strings check_format('', b'') check_format('', b'%s', b'') + # check for crashes + for fmt in (b'%', b'%0', b'%01', b'%.', b'%.1', + b'%0%s', b'%1%s', b'%.%s', b'%.1%s', b'%1abc', + b'%l', b'%ll', b'%z', b'%ls', b'%lls', b'%zs'): + with self.subTest(fmt=fmt): + self.assertRaisesRegex(SystemError, 'invalid format string', + PyUnicode_FromFormat, fmt, b'abc') + self.assertRaisesRegex(SystemError, 'invalid format string', + PyUnicode_FromFormat, b'%+i', c_int(10)) + # Test PyUnicode_AsWideChar() @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') diff --git a/Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst b/Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst new file mode 100644 index 0000000..eb2fd7e --- /dev/null +++ b/Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst @@ -0,0 +1,4 @@ +An unrecognized format character in :c:func:`PyUnicode_FromFormat` and +:c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`. +In previous versions it caused all the rest of the format string to be +copied as-is to the result string, and any extra arguments discarded. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7ff7995..184a2bf 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2355,6 +2355,13 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, p = f; f++; + if (*f == '%') { + if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) + return NULL; + f++; + return f; + } + zeropad = 0; if (*f == '0') { zeropad = 1; @@ -2392,14 +2399,6 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, f++; } } - if (*f == '%') { - /* "%.3%s" => f points to "3" */ - f--; - } - } - if (*f == '\0') { - /* bogus format "%.123" => go backward, f points to "3" */ - f--; } /* Handle %ld, %lu, %lld and %llu. */ @@ -2423,7 +2422,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, ++f; } - if (f[1] == '\0') + if (f[0] != '\0' && f[1] == '\0') writer->overallocate = 0; switch (*f) { @@ -2616,21 +2615,9 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, break; } - case '%': - if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) - return NULL; - break; - default: - /* if we stumble upon an unknown formatting code, copy the rest - of the format string to the output string. (we cannot just - skip the code, since there's no way to know what's in the - argument list) */ - len = strlen(p); - if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) - return NULL; - f = p+len; - return f; + PyErr_Format(PyExc_SystemError, "invalid format string: %s", p); + return NULL; } f++; -- cgit v0.12