From df716cead90670de72a6ac52f7e9375eca9038ef Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 6 Apr 2020 09:22:18 +0000 Subject: Make Tcl_UtfCharComplete() usable for both Tcl_UtfToUniChar() and Tcl_UtfToChar16(). Defect noticed by Don Porter. Thanks! Add test-cases, assuring correct handling of 4-byte UTF-8 sequences. Use "end-1", "end" and "end+1" in testcases related to Tcl_NumUtfChars(), that's more readable/maintainable than integers. --- generic/tclUtf.c | 7 ++----- tests/utf.test | 16 +++++++++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 6526645..72fefa4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,13 +64,10 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -#if TCL_UTF_MAX > 3 - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ +/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#endif +/* End of "continuation byte section" */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; diff --git a/tests/utf.test b/tests/utf.test index f830110..507c6f9 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -123,20 +123,26 @@ test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars { testnumutfchars "" 0 } {0} test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "\xC2\xA2"] 2 + testnumutfchars [testbytestring 
"\xC2\xA2"] end } {1} test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] 10 + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] end } {7} test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "\xC0\x80"] 2 + testnumutfchars [testbytestring "\xC0\x80"] end } {1} # Bug [2738427]: Tcl_NumUtfChars(...) no overflow check test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "\xE2\x82\xAC"] 2 + testnumutfchars [testbytestring "\xE2\x82\xAC"] end-1 } {2} test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "\x00"] 2 + testnumutfchars [testbytestring "\x00"] end+1 +} {2} +test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 character} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring \xf0\x9f\x92\xa9] end-1 +} {3} +test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring tip389} { + testnumutfchars [testbytestring \xf0\x9f\x92\xa9] end } {2} test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { -- cgit v0.12