From 0733f232745ac3cc9a3bd4913bd5ffb8b58378a5 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 10 May 2020 19:28:08 +0000 Subject: Tweak Invalid() function: No need for "return 0" twice in the function. For start bytes F0-F4, case TCL_UTF_MAX=4, Tcl_UtfToUniChar() reads 3 bytes but only advances 1 byte. So Tcl_UtfCharComplete() must make sure 3 bytes are available, not 1. Adapt Tcl_UtfCharComplete() accordingly. No change for TCL_UTF_MAX=[3|6] --- generic/tclUtf.c | 39 +++++++++++++++++++++++++++++++-------- tests/utf.test | 14 +++++++------- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index c0de80a..5e0b2e0 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -81,6 +81,30 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1 }; +static const unsigned char complete[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +#if TCL_UTF_MAX < 4 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#endif + 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#if TCL_UTF_MAX > 4 + 4,4,4,4,4, +#elif TCL_UTF_MAX < 4 + 1,1,1,1,1, +#else + 3,3,3,3,3, +#endif + 1,1,1,1,1,1,1,1,1,1,1 +}; + /* * Functions used only in this module. */ @@ -174,14 +198,13 @@ Invalid( unsigned char byte = UCHAR(*src); int index; - if ((byte & 0xC3) != 0xC0) { + if ((byte & 0xC3) == 0xC0) { /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */ - return 0; - } - index = (byte - 0xC0) >> 1; - if (UCHAR(src[1]) < bounds[index] || UCHAR(src[1]) > bounds[index+1]) { - /* Out of bounds - report invalid. */ - return 1; + index = (byte - 0xC0) >> 1; + if (UCHAR(src[1]) < bounds[index] || UCHAR(src[1]) > bounds[index+1]) { + /* Out of bounds - report invalid. */ + return 1; + } } return 0; } @@ -568,7 +591,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= totalBytes[UCHAR(*src)]; + return length >= complete[UCHAR(*src)]; } /* diff --git a/tests/utf.test b/tests/utf.test index 1a4b157..8745385 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -570,13 +570,13 @@ test utf-6.108 {Tcl_UtfNext, read limits} {testutfnext testbytestring} { test utf-6.109 {Tcl_UtfNext, read limits} {testutfnext testbytestring} { testutfnext \u8820[testbytestring \xA0] 3 } 3 -test utf-6.110.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2_utf16} { +test utf-6.110.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0]G 1 } 1 -test utf-6.110.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs4} { +test utf-6.110.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0]G 1 } 0 -test utf-6.111.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2_utf16} { +test utf-6.111.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0]G 2 } 1 test utf-6.111.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs4} { @@ -594,16 +594,16 @@ test utf-6.113.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2_utf test utf-6.113.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs4} { testutfnext [testbytestring \xF2\xA0\xA0\xA0]G 4 } 4 -test utf-6.114.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2_utf16} { +test utf-6.114.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] 1 } 1 -test utf-6.114.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs4} { +test utf-6.114.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] 1 } 0 -test utf-6.115.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2_utf16} { +test utf-6.115.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] 2 } 1 -test utf-6.115.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs4} { +test utf-6.115.1 {Tcl_UtfNext, read limits} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] 2 } 0 test utf-6.116.0 {Tcl_UtfNext, read limits} {testutfnext testbytestring ucs2_utf16} { -- cgit v0.12