From 0187afa965d2276476598016ea28d8fcd96d48ea Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 7 May 2020 18:56:37 +0000 Subject: Test demonstrating bug in ticket [b2816a3afe]. --- tests/utf.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 8aa3757..f48299d 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -175,6 +175,9 @@ test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars test test utf-4.12.2 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring utf16} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end } 2 +test utf-4.13 {Tcl_NumUtfChars: end of string} {testnumutfchars testbytestring} { + testnumutfchars foobar[testbytestring \xF2\xC2\xA0] end +} 8 test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { testfindfirst [testbytestring abcbc] 98 -- cgit v0.12 From 2f7461f649fe3b5d78645f8efea56d24693f1bef Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 7 May 2020 19:08:24 +0000 Subject: Fix. Note that just because we get one positive detection of an incomplete character, we cannot conclude that the next byte also will be, or can by taken as a single byte. At least we cannot when TCL_UTF_MAX > 3 so that we have room for valid two-byte sequences after incomplete sequence detection. No need for conditional code, just use an algorithm that always works. --- generic/tclUtf.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 4147bbc..80f3be8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -548,17 +548,18 @@ Tcl_NumUtfChars( i++; } /* Loop over the remaining string where call must happen */ - while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src += TclUtfToUniChar(src, &ch); + while (src < endPtr) { + if (Tcl_UtfCharComplete(src, endPtr - src)) { + src += TclUtfToUniChar(src, &ch); + } else { + /* + * src points to incomplete UTF-8 sequence + * Treat first byte as character and count it + */ + src++; + } i++; } - if (src < endPtr) { - /* - * String ends in an incomplete UTF-8 sequence. - * Count every byte in it. - */ - i += endPtr - src; - } } return i; } -- cgit v0.12