From 0187afa965d2276476598016ea28d8fcd96d48ea Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 7 May 2020 18:56:37 +0000 Subject: Test demonstrating bug in ticket [b2816a3afe]. --- tests/utf.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 8aa3757..f48299d 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -175,6 +175,9 @@ test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars test test utf-4.12.2 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring utf16} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end } 2 +test utf-4.13 {Tcl_NumUtfChars: end of string} {testnumutfchars testbytestring} { + testnumutfchars foobar[testbytestring \xF2\xC2\xA0] end +} 8 test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { testfindfirst [testbytestring abcbc] 98 -- cgit v0.12 From 2f7461f649fe3b5d78645f8efea56d24693f1bef Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 7 May 2020 19:08:24 +0000 Subject: Fix. Note that just because we get one positive detection of an incomplete character, we cannot conclude that the next byte also will be, or can be taken as a single byte. At least we cannot when TCL_UTF_MAX > 3 so that we have room for valid two-byte sequences after incomplete sequence detection. No need for conditional code, just use an algorithm that always works. 
--- generic/tclUtf.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 4147bbc..80f3be8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -548,17 +548,18 @@ Tcl_NumUtfChars( i++; } /* Loop over the remaining string where call must happen */ - while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src += TclUtfToUniChar(src, &ch); + while (src < endPtr) { + if (Tcl_UtfCharComplete(src, endPtr - src)) { + src += TclUtfToUniChar(src, &ch); + } else { + /* + * src points to incomplete UTF-8 sequence + * Treat first byte as character and count it + */ + src++; + } i++; } - if (src < endPtr) { - /* - * String ends in an incomplete UTF-8 sequence. - * Count every byte in it. - */ - i += endPtr - src; - } } return i; } -- cgit v0.12 From 9c8670e7341c883b6179637d0903cb87c5e90f26 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 7 May 2020 20:24:58 +0000 Subject: Simplify test-cases which don't need the "testbytestring" command to run without it. e.g. [testbytestring \xC2\xA2] is the same as just \xA2. 
--- tests/utf.test | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index f48299d..a25fde2 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -88,8 +88,8 @@ test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring { test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} testbytestring { string length [testbytestring \xC2] } 1 -test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} testbytestring { - string length [testbytestring \xC2\xA2] +test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} { + string length \xA2 } 1 test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} testbytestring { string length [testbytestring \xE2] @@ -135,32 +135,32 @@ test utf-3.1 {Tcl_UtfCharComplete} { test utf-4.1 {Tcl_NumUtfChars: zero length} testnumutfchars { testnumutfchars "" } 0 -test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xC2\xA2] +test utf-4.2 {Tcl_NumUtfChars: length 1} testnumutfchars { + testnumutfchars \xA2 } 1 test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E] + testnumutfchars abc\xA2[testbytestring \xE4\xB9\x8E\xA2\x4E] } 7 -test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xC0\x80] +test utf-4.4 {Tcl_NumUtfChars: #x00} testnumutfchars { + testnumutfchars \x00 } 1 test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars { testnumutfchars "" 0 } 0 test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xC2\xA2] end + testnumutfchars \xA2 end } 1 test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E] end + testnumutfchars 
abc\xA2[testbytestring \xE4\xB9\x8E\xA2\x4E] end } 7 -test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xC0\x80] end +test utf-4.8 {Tcl_NumUtfChars: #x00, calc len} testnumutfchars { + testnumutfchars \x00 end } 1 # Bug [2738427]: Tcl_NumUtfChars(...) no overflow check test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {testnumutfchars testbytestring} { testnumutfchars [testbytestring \xE2\x82\xAC] end-1 } 2 -test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testbytestring} { +test utf-4.10 {Tcl_NumUtfChars: #x00, calc len, overcomplete} {testnumutfchars testbytestring} { testnumutfchars [testbytestring \x00] end+1 } 2 test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { @@ -446,8 +446,8 @@ test utf-6.79.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { test utf-6.79.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0G\xF8] } 4 -test utf-6.80 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} { - testutfnext [testbytestring \xC0\x80] +test utf-6.80 {Tcl_UtfNext - overlong sequences} {testutfnext} { + testutfnext \x00 } 2 test utf-6.81 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} { testutfnext [testbytestring \xC0\x81] @@ -873,8 +873,8 @@ test utf-7.31 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { test utf-7.32 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xF0\x80\x80\x80] 2 } 1 -test utf-7.33 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { - testutfprev A[testbytestring \xC0\x80] +test utf-7.33 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { + testutfprev A\x00 } 1 test utf-7.34 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xC1\x80] -- cgit v0.12 From 
3ec501f29bdd63cb4b75f5b369d2c24505b3a6ee Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 7 May 2020 20:31:48 +0000 Subject: Same trouble with Tcl_UtfToUniCharDString. Test and fix. --- generic/tclUtf.c | 9 +++++---- tests/utf.test | 11 ++++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 80f3be8..7309208 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -453,11 +453,12 @@ Tcl_UtfToUniCharDString( p += TclUtfToUniChar(p, w); w++; } - while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { - p += TclUtfToUniChar(p, w++); - } while (p < endPtr) { - *w++ = UCHAR(*p++); + if (Tcl_UtfCharComplete(p, endPtr-p)) { + p += TclUtfToUniChar(p, w++); + } else { + *w++ = UCHAR(*p++); + } } *w = '\0'; Tcl_DStringSetLength(dsPtr, diff --git a/tests/utf.test b/tests/utf.test index f48299d..5300328 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -1473,7 +1473,16 @@ UniCharCaseCmpTest > [format %c 0x10000] \uFFFF ucs4 UniCharCaseCmpTest > \U10000 \uFFFF {Uesc ucs4} - +test utf-26.1 {Tcl_UniCharDString} -setup { + testobj freeallvars +} -constraints {teststringobj} -cleanup { + testobj freeallvars +} -body { + teststringobj set 1 foo + teststringobj getunicode 1 + teststringobj append 1 [testbytestring barsoom\xF2\xC2\x80] 10 + scan [string index [teststringobj get 1] 11] %c +} -result 128 unset count -- cgit v0.12