From 2f7461f649fe3b5d78645f8efea56d24693f1bef Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 7 May 2020 19:08:24 +0000 Subject: Fix. Note that just because we get one positive detection of an incomplete character, we cannot conclude that the next byte also will be, or can be taken as a single byte. At least we cannot when TCL_UTF_MAX > 3 so that we have room for valid two-byte sequences after incomplete sequence detection. No need for conditional code, just use an algorithm that always works. --- generic/tclUtf.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 4147bbc..80f3be8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -548,17 +548,18 @@ Tcl_NumUtfChars( i++; } /* Loop over the remaining string where call must happen */ - while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src += TclUtfToUniChar(src, &ch); + while (src < endPtr) { + if (Tcl_UtfCharComplete(src, endPtr - src)) { + src += TclUtfToUniChar(src, &ch); + } else { + /* + * src points to incomplete UTF-8 sequence + * Treat first byte as character and count it + */ + src++; + } i++; } - if (src < endPtr) { - /* - * String ends in an incomplete UTF-8 sequence. - * Count every byte in it. - */ - i += endPtr - src; - } } return i; } -- cgit v0.12