summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Victor Stinner <vstinner@python.org> 2021-03-17 20:46:53 (GMT)
committer: GitHub <noreply@github.com> 2021-03-17 20:46:53 (GMT)
commit: 9976834f807ea63ca51bc4f89be457d734148682 (patch)
tree: 2ed6ca7dd002f141e1a2ade2e5dd502514ae29f0
parent: 6086ae7fd4aeb4089282189673f9bd0cc33abf9b (diff)
download: cpython-9976834f807ea63ca51bc4f89be457d734148682.zip
cpython-9976834f807ea63ca51bc4f89be457d734148682.tar.gz
cpython-9976834f807ea63ca51bc4f89be457d734148682.tar.bz2
bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843)
Python no longer fails at startup with a fatal error if a command line argument contains an invalid Unicode character. The Py_DecodeLocale() function now escapes byte sequences which would be decoded as Unicode characters outside the [U+0000; U+10ffff] range. Use MAX_UNICODE constant in unicodeobject.c.
-rw-r--r-- Lib/test/test_cmd_line.py 86
-rw-r--r-- Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst 4
-rw-r--r-- Objects/unicodeobject.c 9
-rw-r--r-- Python/fileutils.c 118
4 files changed, 148 insertions, 69 deletions
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index 95ab9d8..e87eede 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -200,38 +200,72 @@ class CmdLineTest(unittest.TestCase):
if not stdout.startswith(pattern):
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
+ @unittest.skipIf(sys.platform == 'win32',
+ 'Windows has a native unicode API')
+ def test_invalid_utf8_arg(self):
+ # bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba'
+ # byte sequence with surrogateescape rather than decoding it as the
+ # U+7fffbeba character which is outside the [U+0000; U+10ffff] range of
+ # Python Unicode characters.
+ #
+ # Test with default config, in the C locale, in the Python UTF-8 Mode.
+ code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))'
+ base_cmd = [sys.executable, '-c', code]
+
+ def run_default(arg):
+ cmd = [sys.executable, '-c', code, arg]
+ return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
+ def run_c_locale(arg):
+ cmd = [sys.executable, '-c', code, arg]
+ env = dict(os.environ)
+ env['LC_ALL'] = 'C'
+ return subprocess.run(cmd, stdout=subprocess.PIPE,
+ text=True, env=env)
+
+ def run_utf8_mode(arg):
+ cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
+ return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
+ valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
+ # invalid UTF-8 byte sequences with a valid UTF-8 sequence
+ # in the middle.
+ invalid_utf8 = (
+ b'\xff' # invalid byte
+ b'\xc3\xff' # invalid byte sequence
+ b'\xc3\xa9' # valid utf-8: U+00E9 character
+ b'\xed\xa0\x80' # lone surrogate character (invalid)
+ b'\xfd\xbf\xbf\xbb\xba\xba' # character outside [U+0000; U+10ffff]
+ )
+ test_args = [valid_utf8, invalid_utf8]
+
+ for run_cmd in (run_default, run_c_locale, run_utf8_mode):
+ with self.subTest(run_cmd=run_cmd):
+ for arg in test_args:
+ proc = run_cmd(arg)
+ self.assertEqual(proc.stdout.rstrip(), ascii(arg))
+
@unittest.skipUnless((sys.platform == 'darwin' or
support.is_android), 'test specific to Mac OS X and Android')
def test_osx_android_utf8(self):
- def check_output(text):
- decoded = text.decode('utf-8', 'surrogateescape')
- expected = ascii(decoded).encode('ascii') + b'\n'
+ text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
+ code = "import sys; print(ascii(sys.argv[1]))"
- env = os.environ.copy()
- # C locale gives ASCII locale encoding, but Python uses UTF-8
- # to parse the command line arguments on Mac OS X and Android.
- env['LC_ALL'] = 'C'
+ decoded = text.decode('utf-8', 'surrogateescape')
+ expected = ascii(decoded).encode('ascii') + b'\n'
- p = subprocess.Popen(
- (sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
- stdout=subprocess.PIPE,
- env=env)
- stdout, stderr = p.communicate()
- self.assertEqual(stdout, expected)
- self.assertEqual(p.returncode, 0)
+ env = os.environ.copy()
+ # C locale gives ASCII locale encoding, but Python uses UTF-8
+ # to parse the command line arguments on Mac OS X and Android.
+ env['LC_ALL'] = 'C'
- # test valid utf-8
- text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
- check_output(text)
-
- # test invalid utf-8
- text = (
- b'\xff' # invalid byte
- b'\xc3\xa9' # valid utf-8 character
- b'\xc3\xff' # invalid byte sequence
- b'\xed\xa0\x80' # lone surrogate character (invalid)
- )
- check_output(text)
+ p = subprocess.Popen(
+ (sys.executable, "-c", code, text),
+ stdout=subprocess.PIPE,
+ env=env)
+ stdout, stderr = p.communicate()
+ self.assertEqual(stdout, expected)
+ self.assertEqual(p.returncode, 0)
def test_non_interactive_output_buffering(self):
code = textwrap.dedent("""
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst b/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst
new file mode 100644
index 0000000..4674242
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst
@@ -0,0 +1,4 @@
+Python no longer fails at startup with a fatal error if a command line
+argument contains an invalid Unicode character. The
+:c:func:`Py_DecodeLocale` function now escapes byte sequences which would be
+decoded as Unicode characters outside the [U+0000; U+10ffff] range.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0b08b0e..a7a3151 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -94,7 +94,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently
extern "C" {
#endif
-/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
+// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
+// The value must be the same in fileutils.c.
#define MAX_UNICODE 0x10ffff
#ifdef Py_DEBUG
@@ -1784,8 +1785,8 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
*maxchar = ch;
if (*maxchar > MAX_UNICODE) {
PyErr_Format(PyExc_ValueError,
- "character U+%x is not in range [U+0000; U+10ffff]",
- ch);
+ "character U+%x is not in range [U+0000; U+%x]",
+ ch, MAX_UNICODE);
return -1;
}
}
@@ -14089,7 +14090,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
{
case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
- case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
+ case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
default:
Py_UNREACHABLE();
}
diff --git a/Python/fileutils.c b/Python/fileutils.c
index f2b4681..4997f92 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -34,6 +34,13 @@ extern int winerror_to_errno(int);
int _Py_open_cloexec_works = -1;
#endif
+// The value must be the same in unicodeobject.c.
+#define MAX_UNICODE 0x10ffff
+
+// mbstowcs() and mbrtowc() errors
+static const size_t DECODE_ERROR = ((size_t)-1);
+static const size_t INCOMPLETE_CHARACTER = (size_t)-2;
+
static int
get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
@@ -82,6 +89,57 @@ _Py_device_encoding(int fd)
#endif
}
+
+static size_t
+is_valid_wide_char(wchar_t ch)
+{
+ if (Py_UNICODE_IS_SURROGATE(ch)) {
+ // Reject lone surrogate characters
+ return 0;
+ }
+ if (ch > MAX_UNICODE) {
+ // bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
+ // The glibc mbstowcs() UTF-8 decoder does not respect RFC 3629;
+ // it creates characters outside the [U+0000; U+10ffff] range:
+ // https://sourceware.org/bugzilla/show_bug.cgi?id=2373
+ return 0;
+ }
+ return 1;
+}
+
+
+static size_t
+_Py_mbstowcs(wchar_t *dest, const char *src, size_t n)
+{
+ size_t count = mbstowcs(dest, src, n);
+ if (dest != NULL && count != DECODE_ERROR) {
+ for (size_t i=0; i < count; i++) {
+ wchar_t ch = dest[i];
+ if (!is_valid_wide_char(ch)) {
+ return DECODE_ERROR;
+ }
+ }
+ }
+ return count;
+}
+
+
+#ifdef HAVE_MBRTOWC
+static size_t
+_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
+{
+ assert(pwc != NULL);
+ size_t count = mbrtowc(pwc, str, len, pmbs);
+ if (count != 0 && count != DECODE_ERROR && count != INCOMPLETE_CHARACTER) {
+ if (!is_valid_wide_char(*pwc)) {
+ return DECODE_ERROR;
+ }
+ }
+ return count;
+}
+#endif
+
+
#if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS)
#define USE_FORCE_ASCII
@@ -148,8 +206,8 @@ check_force_ascii(void)
size_t res;
ch = (unsigned char)0xA7;
- res = mbstowcs(&wch, (char*)&ch, 1);
- if (res != (size_t)-1 && wch == L'\xA7') {
+ res = _Py_mbstowcs(&wch, (char*)&ch, 1);
+ if (res != DECODE_ERROR && wch == L'\xA7') {
/* On HP-UX with the C locale or the POSIX locale,
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
Latin1 encoding in practice. Force ASCII in this case.
@@ -196,8 +254,8 @@ check_force_ascii(void)
unsigned uch = (unsigned char)i;
ch[0] = (char)uch;
- res = mbstowcs(wch, ch, 1);
- if (res != (size_t)-1) {
+ res = _Py_mbstowcs(wch, ch, 1);
+ if (res != DECODE_ERROR) {
/* decoding a non-ASCII character from the locale encoding succeeded:
the locale encoding is not ASCII, force ASCII */
return 1;
@@ -387,9 +445,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
*/
argsize = strlen(arg);
#else
- argsize = mbstowcs(NULL, arg, 0);
+ argsize = _Py_mbstowcs(NULL, arg, 0);
#endif
- if (argsize != (size_t)-1) {
+ if (argsize != DECODE_ERROR) {
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
return -1;
}
@@ -398,21 +456,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
return -1;
}
- count = mbstowcs(res, arg, argsize + 1);
- if (count != (size_t)-1) {
- wchar_t *tmp;
- /* Only use the result if it contains no
- surrogate characters. */
- for (tmp = res; *tmp != 0 &&
- !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
- ;
- if (*tmp == 0) {
- if (wlen != NULL) {
- *wlen = count;
- }
- *wstr = res;
- return 0;
+ count = _Py_mbstowcs(res, arg, argsize + 1);
+ if (count != DECODE_ERROR) {
+ *wstr = res;
+ if (wlen != NULL) {
+ *wlen = count;
}
+ return 0;
}
PyMem_RawFree(res);
}
@@ -436,13 +486,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
out = res;
memset(&mbs, 0, sizeof mbs);
while (argsize) {
- size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
+ size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs);
if (converted == 0) {
/* Reached end of string; null char stored. */
break;
}
- if (converted == (size_t)-2) {
+ if (converted == INCOMPLETE_CHARACTER) {
/* Incomplete character. This should never happen,
since we provide everything that we have -
unless there is a bug in the C library, or I
@@ -450,32 +500,22 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
goto decode_error;
}
- if (converted == (size_t)-1) {
+ if (converted == DECODE_ERROR) {
if (!surrogateescape) {
goto decode_error;
}
- /* Conversion error. Escape as UTF-8b, and start over
- in the initial shift state. */
+ /* Decoding error. Escape as UTF-8b, and start over in the initial
+ shift state. */
*out++ = 0xdc00 + *in++;
argsize--;
memset(&mbs, 0, sizeof mbs);
continue;
}
- if (Py_UNICODE_IS_SURROGATE(*out)) {
- if (!surrogateescape) {
- goto decode_error;
- }
+ // _Py_mbrtowc() rejects lone surrogate characters
+ assert(!Py_UNICODE_IS_SURROGATE(*out));
- /* Surrogate character. Escape the original
- byte sequence with surrogateescape. */
- argsize -= converted;
- while (converted--) {
- *out++ = 0xdc00 + *in++;
- }
- continue;
- }
/* successfully converted some bytes */
in += converted;
argsize -= converted;
@@ -652,7 +692,7 @@ encode_current_locale(const wchar_t *text, char **str,
else {
converted = wcstombs(NULL, buf, 0);
}
- if (converted == (size_t)-1) {
+ if (converted == DECODE_ERROR) {
goto encode_error;
}
if (bytes != NULL) {
@@ -1440,7 +1480,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
char cmode[10];
size_t r;
r = wcstombs(cmode, mode, 10);
- if (r == (size_t)-1 || r >= 10) {
+ if (r == DECODE_ERROR || r >= 10) {
errno = EINVAL;
return NULL;
}