summaryrefslogtreecommitdiffstats
path: root/Modules
diff options
context:
space:
mode:
author: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> 2023-10-20 12:19:04 (GMT)
committer: GitHub <noreply@github.com> 2023-10-20 12:19:04 (GMT)
commit: 69bcaf7e0e68f5bdead833a5aa47f67f256c6440 (patch)
tree: 3c3428c44a08f02a8afa8cd1e92244d7447bf079 /Modules
parent: 6df935c2128f59c327fe29ff8c114a2bc644471a (diff)
downloadcpython-69bcaf7e0e68f5bdead833a5aa47f67f256c6440.zip
cpython-69bcaf7e0e68f5bdead833a5aa47f67f256c6440.tar.gz
cpython-69bcaf7e0e68f5bdead833a5aa47f67f256c6440.tar.bz2
gh-110913: Fix WindowsConsoleIO chunking of UTF-8 text (GH-111007)
(cherry picked from commit 11312eae6ec3acf51aacafce4cb6d1a5edfd5f2e) Co-authored-by: Tamás Hegedűs <sorgloomer@users.noreply.github.com>
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/_io/winconsoleio.c 36
1 files changed, 20 insertions, 16 deletions
diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c
index 89431b1..dcb7d32 100644
--- a/Modules/_io/winconsoleio.c
+++ b/Modules/_io/winconsoleio.c
@@ -132,6 +132,23 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
return m;
}
+/* Return the longest prefix length of buf[0..len) that does not end in the
+ * middle of a UTF-8 multi-byte sequence.
+ *
+ * At most the last three bytes are inspected.  If they contain a lead byte
+ * whose sequence is not yet complete at buf[len], the returned length
+ * excludes that partial character.  A buffer ending in ASCII, in a complete
+ * multi-byte character, or in bytes that are not valid UTF-8 is returned
+ * with its length unchanged.
+ *
+ * The result is 0 only when len is 0: if excluding the partial character
+ * would leave nothing, the original len is returned instead.
+ */
+static DWORD
+_find_last_utf8_boundary(const char *buf, DWORD len)
+{
+    DWORD count;
+    for (count = 1; count <= 3 && count <= len; count++) {
+        /* Work on unsigned bytes so the range comparisons below are exact. */
+        unsigned char c = (unsigned char)buf[len - count];
+        DWORD need;
+        if (c < 0x80) {
+            /* ASCII byte: no valid sequence can be pending after it. */
+            return len;
+        }
+        if ((c & 0xc0) == 0x80) {
+            /* Continuation byte: keep scanning back for its lead byte. */
+            continue;
+        }
+        /* Lead byte: its high bits give the total sequence length
+         * (0 marks a byte that cannot start a sequence). */
+        need = c < 0xe0 ? 2 : c < 0xf0 ? 3 : c < 0xf8 ? 4 : 0;
+        if (count < need && count < len) {
+            /* The sequence is cut short; leave it for the next chunk. */
+            return len - count;
+        }
+        /* Complete or invalid sequence, or dropping it would return 0. */
+        return len;
+    }
+    /* Only continuation bytes seen: the tail of a complete 4-byte
+     * sequence, invalid input, or an empty buffer. */
+    return len;
+}
/*[clinic input]
module _io
@@ -954,7 +971,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
{
BOOL res = TRUE;
wchar_t *wbuf;
- DWORD len, wlen, orig_len, n = 0;
+ DWORD len, wlen, n = 0;
HANDLE handle;
if (self->fd == -1)
@@ -984,21 +1001,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
have to reduce and recalculate. */
while (wlen > 32766 / sizeof(wchar_t)) {
len /= 2;
- orig_len = len;
- /* Reduce the length until we hit the final byte of a UTF-8 sequence
- * (top bit is unset). Fix for github issue 82052.
- */
- while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
- --len;
- /* If we hit a length of 0, something has gone wrong. This shouldn't
- * be possible, as valid UTF-8 can have at most 3 non-final bytes
- * before a final one, and our buffer is way longer than that.
- * But to be on the safe side, if we hit this issue we just restore
- * the original length and let the console API sort it out.
- */
- if (len == 0) {
- len = orig_len;
- }
+ /* Fix for github issues gh-110913 and gh-82052. */
+ len = _find_last_utf8_boundary(b->buf, len);
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
}
Py_END_ALLOW_THREADS