From 3b6e61ee0812359029cac176042d9c835c60f185 Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Mon, 29 Mar 2021 12:36:47 -0700 Subject: bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843) (GH-24906) Python no longer fails at startup with a fatal error if a command line argument contains an invalid Unicode character. The Py_DecodeLocale() function now escapes byte sequences which would be decoded as Unicode characters outside the [U+0000; U+10ffff] range. Use MAX_UNICODE constant in unicodeobject.c. (cherry picked from commit 9976834f807ea63ca51bc4f89be457d734148682) Co-authored-by: Victor Stinner Co-authored-by: Victor Stinner --- Lib/test/test_cmd_line.py | 86 ++++++++++----- .../2021-03-13-13-57-21.bpo-35883.UyGpdG.rst | 4 + Objects/unicodeobject.c | 9 +- Python/fileutils.c | 118 ++++++++++++++------- 4 files changed, 148 insertions(+), 69 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index a5ece9b..871a9c7 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -190,38 +190,72 @@ class CmdLineTest(unittest.TestCase): if not stdout.startswith(pattern): raise AssertionError("%a doesn't start with %a" % (stdout, pattern)) + @unittest.skipIf(sys.platform == 'win32', + 'Windows has a native unicode API') + def test_invalid_utf8_arg(self): + # bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba' + # byte sequence with surrogateescape rather than decoding it as the + # U+7fffbeba character which is outside the [U+0000; U+10ffff] range of + # Python Unicode characters. + # + # Test with default config, in the C locale, in the Python UTF-8 Mode. 
+ code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))' + base_cmd = [sys.executable, '-c', code] + + def run_default(arg): + cmd = [sys.executable, '-c', code, arg] + return subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + + def run_c_locale(arg): + cmd = [sys.executable, '-c', code, arg] + env = dict(os.environ) + env['LC_ALL'] = 'C' + return subprocess.run(cmd, stdout=subprocess.PIPE, + text=True, env=env) + + def run_utf8_mode(arg): + cmd = [sys.executable, '-X', 'utf8', '-c', code, arg] + return subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + + valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8') + # invalid UTF-8 byte sequences with a valid UTF-8 sequence + # in the middle. + invalid_utf8 = ( + b'\xff' # invalid byte + b'\xc3\xff' # invalid byte sequence + b'\xc3\xa9' # valid utf-8: U+00E9 character + b'\xed\xa0\x80' # lone surrogate character (invalid) + b'\xfd\xbf\xbf\xbb\xba\xba' # character outside [U+0000; U+10ffff] + ) + test_args = [valid_utf8, invalid_utf8] + + for run_cmd in (run_default, run_c_locale, run_utf8_mode): + with self.subTest(run_cmd=run_cmd): + for arg in test_args: + proc = run_cmd(arg) + self.assertEqual(proc.stdout.rstrip(), ascii(arg)) + @unittest.skipUnless((sys.platform == 'darwin' or support.is_android), 'test specific to Mac OS X and Android') def test_osx_android_utf8(self): - def check_output(text): - decoded = text.decode('utf-8', 'surrogateescape') - expected = ascii(decoded).encode('ascii') + b'\n' + text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8') + code = "import sys; print(ascii(sys.argv[1]))" - env = os.environ.copy() - # C locale gives ASCII locale encoding, but Python uses UTF-8 - # to parse the command line arguments on Mac OS X and Android. 
- env['LC_ALL'] = 'C' + decoded = text.decode('utf-8', 'surrogateescape') + expected = ascii(decoded).encode('ascii') + b'\n' - p = subprocess.Popen( - (sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text), - stdout=subprocess.PIPE, - env=env) - stdout, stderr = p.communicate() - self.assertEqual(stdout, expected) - self.assertEqual(p.returncode, 0) + env = os.environ.copy() + # C locale gives ASCII locale encoding, but Python uses UTF-8 + # to parse the command line arguments on Mac OS X and Android. + env['LC_ALL'] = 'C' - # test valid utf-8 - text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8') - check_output(text) - - # test invalid utf-8 - text = ( - b'\xff' # invalid byte - b'\xc3\xa9' # valid utf-8 character - b'\xc3\xff' # invalid byte sequence - b'\xed\xa0\x80' # lone surrogate character (invalid) - ) - check_output(text) + p = subprocess.Popen( + (sys.executable, "-c", code, text), + stdout=subprocess.PIPE, + env=env) + stdout, stderr = p.communicate() + self.assertEqual(stdout, expected) + self.assertEqual(p.returncode, 0) def test_unbuffered_output(self): # Test expected operation of the '-u' switch diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst b/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst new file mode 100644 index 0000000..4674242 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst @@ -0,0 +1,4 @@ +Python no longer fails at startup with a fatal error if a command line +argument contains an invalid Unicode character. The +:c:func:`Py_DecodeLocale` function now escapes byte sequences which would be +decoded as Unicode characters outside the [U+0000; U+10ffff] range. 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a038e0d..28f9c9b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -90,7 +90,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently extern "C" { #endif -/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ +// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111). +// The value must be the same in fileutils.c. #define MAX_UNICODE 0x10ffff #ifdef Py_DEBUG @@ -1707,8 +1708,8 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, *maxchar = ch; if (*maxchar > MAX_UNICODE) { PyErr_Format(PyExc_ValueError, - "character U+%x is not in range [U+0000; U+10ffff]", - ch); + "character U+%x is not in range [U+0000; U+%x]", + ch, MAX_UNICODE); return -1; } } @@ -13610,7 +13611,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, { case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; - case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; + case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break; default: Py_UNREACHABLE(); } diff --git a/Python/fileutils.c b/Python/fileutils.c index fd2d5fa..3e1311c 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -33,6 +33,13 @@ extern int winerror_to_errno(int); int _Py_open_cloexec_works = -1; #endif +// The value must be the same in unicodeobject.c. +#define MAX_UNICODE 0x10ffff + +// mbstowcs() and mbrtowc() errors +static const size_t DECODE_ERROR = ((size_t)-1); +static const size_t INCOMPLETE_CHARACTER = (size_t)-2; + static int get_surrogateescape(_Py_error_handler errors, int *surrogateescape) @@ -85,6 +92,57 @@ _Py_device_encoding(int fd) Py_RETURN_NONE; } + +static size_t +is_valid_wide_char(wchar_t ch) +{ + if (Py_UNICODE_IS_SURROGATE(ch)) { + // Reject lone surrogate characters + return 0; + } + if (ch > MAX_UNICODE) { + // bpo-35883: Reject characters outside [U+0000; U+10ffff] range. 
+ // The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629, + // it creates characters outside the [U+0000; U+10ffff] range: + // https://sourceware.org/bugzilla/show_bug.cgi?id=2373 + return 0; + } + return 1; +} + + +static size_t +_Py_mbstowcs(wchar_t *dest, const char *src, size_t n) +{ + size_t count = mbstowcs(dest, src, n); + if (dest != NULL && count != DECODE_ERROR) { + for (size_t i=0; i < count; i++) { + wchar_t ch = dest[i]; + if (!is_valid_wide_char(ch)) { + return DECODE_ERROR; + } + } + } + return count; +} + + +#ifdef HAVE_MBRTOWC +static size_t +_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs) +{ + assert(pwc != NULL); + size_t count = mbrtowc(pwc, str, len, pmbs); + if (count != 0 && count != DECODE_ERROR && count != INCOMPLETE_CHARACTER) { + if (!is_valid_wide_char(*pwc)) { + return DECODE_ERROR; + } + } + return count; +} +#endif + + #if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS) #define USE_FORCE_ASCII @@ -151,8 +209,8 @@ check_force_ascii(void) size_t res; ch = (unsigned char)0xA7; - res = mbstowcs(&wch, (char*)&ch, 1); - if (res != (size_t)-1 && wch == L'\xA7') { + res = _Py_mbstowcs(&wch, (char*)&ch, 1); + if (res != DECODE_ERROR && wch == L'\xA7') { /* On HP-UX withe C locale or the POSIX locale, nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses Latin1 encoding in practice. Force ASCII in this case. 
@@ -199,8 +257,8 @@ check_force_ascii(void) unsigned uch = (unsigned char)i; ch[0] = (char)uch; - res = mbstowcs(wch, ch, 1); - if (res != (size_t)-1) { + res = _Py_mbstowcs(wch, ch, 1); + if (res != DECODE_ERROR) { /* decoding a non-ASCII character from the locale encoding succeed: the locale encoding is not ASCII, force ASCII */ return 1; @@ -390,9 +448,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, */ argsize = strlen(arg); #else - argsize = mbstowcs(NULL, arg, 0); + argsize = _Py_mbstowcs(NULL, arg, 0); #endif - if (argsize != (size_t)-1) { + if (argsize != DECODE_ERROR) { if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { return -1; } @@ -401,21 +459,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, return -1; } - count = mbstowcs(res, arg, argsize + 1); - if (count != (size_t)-1) { - wchar_t *tmp; - /* Only use the result if it contains no - surrogate characters. */ - for (tmp = res; *tmp != 0 && - !Py_UNICODE_IS_SURROGATE(*tmp); tmp++) - ; - if (*tmp == 0) { - if (wlen != NULL) { - *wlen = count; - } - *wstr = res; - return 0; + count = _Py_mbstowcs(res, arg, argsize + 1); + if (count != DECODE_ERROR) { + *wstr = res; + if (wlen != NULL) { + *wlen = count; } + return 0; } PyMem_RawFree(res); } @@ -439,13 +489,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, out = res; memset(&mbs, 0, sizeof mbs); while (argsize) { - size_t converted = mbrtowc(out, (char*)in, argsize, &mbs); + size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs); if (converted == 0) { /* Reached end of string; null char stored. */ break; } - if (converted == (size_t)-2) { + if (converted == INCOMPLETE_CHARACTER) { /* Incomplete character. 
This should never happen, since we provide everything that we have - unless there is a bug in the C library, or I @@ -453,32 +503,22 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, goto decode_error; } - if (converted == (size_t)-1) { + if (converted == DECODE_ERROR) { if (!surrogateescape) { goto decode_error; } - /* Conversion error. Escape as UTF-8b, and start over - in the initial shift state. */ + /* Decoding error. Escape as UTF-8b, and start over in the initial + shift state. */ *out++ = 0xdc00 + *in++; argsize--; memset(&mbs, 0, sizeof mbs); continue; } - if (Py_UNICODE_IS_SURROGATE(*out)) { - if (!surrogateescape) { - goto decode_error; - } + // _Py_mbrtowc() rejects lone surrogate characters + assert(!Py_UNICODE_IS_SURROGATE(*out)); - /* Surrogate character. Escape the original - byte sequence with surrogateescape. */ - argsize -= converted; - while (converted--) { - *out++ = 0xdc00 + *in++; - } - continue; - } /* successfully converted some bytes */ in += converted; argsize -= converted; @@ -655,7 +695,7 @@ encode_current_locale(const wchar_t *text, char **str, else { converted = wcstombs(NULL, buf, 0); } - if (converted == (size_t)-1) { + if (converted == DECODE_ERROR) { goto encode_error; } if (bytes != NULL) { @@ -1371,7 +1411,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode) char cmode[10]; size_t r; r = wcstombs(cmode, mode, 10); - if (r == (size_t)-1 || r >= 10) { + if (r == DECODE_ERROR || r >= 10) { errno = EINVAL; return NULL; } -- cgit v0.12