diff options
author | Tamás Hegedűs <sorgloomer@users.noreply.github.com> | 2023-10-20 11:52:31 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-20 11:52:31 (GMT) |
commit | 11312eae6ec3acf51aacafce4cb6d1a5edfd5f2e (patch) | |
tree | 9044c92797052e2597c723d83640b724a3da5b54 /Modules/_io | |
parent | b60f05870816019cfd9b2f7d104364613e66fc78 (diff) | |
download | cpython-11312eae6ec3acf51aacafce4cb6d1a5edfd5f2e.zip cpython-11312eae6ec3acf51aacafce4cb6d1a5edfd5f2e.tar.gz cpython-11312eae6ec3acf51aacafce4cb6d1a5edfd5f2e.tar.bz2 |
gh-110913: Fix WindowsConsoleIO chunking of UTF-8 text (GH-111007)
Diffstat (limited to 'Modules/_io')
-rw-r--r-- | Modules/_io/winconsoleio.c | 36 |
1 files changed, 20 insertions, 16 deletions
diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c index 50b8818..6680488 100644 --- a/Modules/_io/winconsoleio.c +++ b/Modules/_io/winconsoleio.c @@ -134,6 +134,23 @@ char _PyIO_get_console_type(PyObject *path_or_fd) { return m; } +static DWORD +_find_last_utf8_boundary(const char *buf, DWORD len) +{ + /* This function never returns 0, returns the original len instead */ + DWORD count = 1; + if (len == 0 || (buf[len - 1] & 0x80) == 0) { + return len; + } + for (;; count++) { + if (count > 3 || count >= len) { + return len; + } + if ((buf[len - count] & 0xc0) != 0x80) { + return len - count; + } + } +} /*[clinic input] module _io @@ -975,7 +992,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls, { BOOL res = TRUE; wchar_t *wbuf; - DWORD len, wlen, orig_len, n = 0; + DWORD len, wlen, n = 0; HANDLE handle; if (self->fd == -1) @@ -1007,21 +1024,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls, have to reduce and recalculate. */ while (wlen > 32766 / sizeof(wchar_t)) { len /= 2; - orig_len = len; - /* Reduce the length until we hit the final byte of a UTF-8 sequence - * (top bit is unset). Fix for github issue 82052. - */ - while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0) - --len; - /* If we hit a length of 0, something has gone wrong. This shouldn't - * be possible, as valid UTF-8 can have at most 3 non-final bytes - * before a final one, and our buffer is way longer than that. - * But to be on the safe side, if we hit this issue we just restore - * the original length and let the console API sort it out. - */ - if (len == 0) { - len = orig_len; - } + /* Fix for github issues gh-110913 and gh-82052. */ + len = _find_last_utf8_boundary(b->buf, len); wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0); } Py_END_ALLOW_THREADS |