From e3c58bc54a39c2911fb59460045b16c4e61c491c Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 8 Jun 2017 11:48:13 +0000 Subject: tclUtil.c: Use TclUtfToUniChar() in stead of handling ASCII characters separately: This macro already does that. Add new test-case for Tcl_NumUtfChars(), for a knownBug still to be fixed. --- generic/tclTest.c | 2 +- generic/tclUtil.c | 47 ++++++++++++----------------------------------- tests/utf.test | 11 +++++++---- 3 files changed, 20 insertions(+), 40 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index f2dbfc9..e8539e8 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -6672,7 +6672,7 @@ TestNumUtfCharsCmd( int len = -1; if (objc > 2) { - (void) Tcl_GetStringFromObj(objv[1], &len); + (void) Tcl_GetIntFromObj(interp, objv[2], &len); } len = Tcl_NumUtfChars(Tcl_GetString(objv[1]), len); Tcl_SetObjResult(interp, Tcl_NewIntObj(len)); diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 553593c..3fdf54b 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -2162,14 +2162,9 @@ Tcl_StringCaseMatch( * This is a special case optimization for single-byte utf. */ - if (UCHAR(*pattern) < 0x80) { - ch2 = (Tcl_UniChar) - (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); - } else { - Tcl_UtfToUniChar(pattern, &ch2); - if (nocase) { - ch2 = Tcl_UniCharToLower(ch2); - } + TclUtfToUniChar(pattern, &ch2); + if (nocase) { + ch2 = Tcl_UniCharToLower(ch2); } while (1) { @@ -2235,44 +2230,26 @@ Tcl_StringCaseMatch( Tcl_UniChar startChar, endChar; pattern++; - if (UCHAR(*str) < 0x80) { - ch1 = (Tcl_UniChar) - (nocase ? tolower(UCHAR(*str)) : UCHAR(*str)); - str++; - } else { - str += Tcl_UtfToUniChar(str, &ch1); - if (nocase) { - ch1 = Tcl_UniCharToLower(ch1); - } + str += TclUtfToUniChar(str, &ch1); + if (nocase) { + ch1 = Tcl_UniCharToLower(ch1); } while (1) { if ((*pattern == ']') || (*pattern == '\0')) { return 0; } - if (UCHAR(*pattern) < 0x80) { - startChar = (Tcl_UniChar) (nocase - ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); - pattern++; - } else { - pattern += Tcl_UtfToUniChar(pattern, &startChar); - if (nocase) { - startChar = Tcl_UniCharToLower(startChar); - } + pattern += TclUtfToUniChar(pattern, &startChar); + if (nocase) { + startChar = Tcl_UniCharToLower(startChar); } if (*pattern == '-') { pattern++; if (*pattern == '\0') { return 0; } - if (UCHAR(*pattern) < 0x80) { - endChar = (Tcl_UniChar) (nocase - ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); - pattern++; - } else { - pattern += Tcl_UtfToUniChar(pattern, &endChar); - if (nocase) { - endChar = Tcl_UniCharToLower(endChar); - } + pattern += TclUtfToUniChar(pattern, &endChar); + if (nocase) { + endChar = Tcl_UniCharToLower(endChar); } if (((startChar <= ch1) && (ch1 <= endChar)) || ((endChar <= ch1) && (ch1 <= startChar))) { diff --git a/tests/utf.test b/tests/utf.test index 28981d6..f677438 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -99,17 +99,20 @@ test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { testnumutfchars [testbytestring "\xC0\x80"] } {1} test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars { - testnumutfchars "" 1 + testnumutfchars "" 0 } {0} test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "\xC2\xA2"] 1 + testnumutfchars [testbytestring "\xC2\xA2"] 2 } {1} test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 1 + testnumutfchars [testbytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 10 } {7} test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "\xC0\x80"] 1 + testnumutfchars [testbytestring "\xC0\x80"] 2 } {1} +test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {knownBug testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xE2\x82\xAC"] 2 +} {2} test utf-5.1 {Tcl_UtfFindFirsts} { } {} -- cgit v0.12 From 8cb64e1074f47fa62a4f2461569272a27a57f9d6 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 8 Jun 2017 12:34:08 +0000 Subject: Fix [2738427]: Tcl_NumUtfChars(...) no overflow check. --- generic/tclUtf.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 3937141..a405367 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -464,7 +464,6 @@ Tcl_NumUtfChars( * for strlen(string). */ { Tcl_UniChar ch; - register Tcl_UniChar *chPtr = &ch; register int i; /* @@ -477,23 +476,25 @@ Tcl_NumUtfChars( i = 0; if (length < 0) { while (*src != '\0') { - src += TclUtfToUniChar(src, chPtr); + src += TclUtfToUniChar(src, &ch); i++; } + if (i < 0) i = INT_MAX; /* Bug [2738427] */ } else { - register int n; - - while (length > 0) { - if (UCHAR(*src) < 0xC0) { - length--; - src++; - } else { - n = Tcl_UtfToUniChar(src, chPtr); - length -= n; - src += n; - } + register const char *endPtr = src + length - TCL_UTF_MAX; + + while (src < endPtr) { + src += TclUtfToUniChar(src, &ch); i++; } + endPtr += TCL_UTF_MAX; + while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { + src += TclUtfToUniChar(src, &ch); + i++; + } + if (src < endPtr) { + i += endPtr - src; + } } return i; } -- cgit v0.12 From 7bf7c6e7d90d4b7913115508c91115db89868d48 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 8 Jun 2017 12:50:00 +0000 Subject: Revert part of [95d096e0378b460c6c5168bb55bb2ca8b2fd799e|95d096e037]: Missed the fact that tolower() was optimized for the ASCII case as well, so this was a mistake! --- generic/tclUtil.c | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 3fdf54b..553593c 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -2162,9 +2162,14 @@ Tcl_StringCaseMatch( * This is a special case optimization for single-byte utf. */ - TclUtfToUniChar(pattern, &ch2); - if (nocase) { - ch2 = Tcl_UniCharToLower(ch2); + if (UCHAR(*pattern) < 0x80) { + ch2 = (Tcl_UniChar) + (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); + } else { + Tcl_UtfToUniChar(pattern, &ch2); + if (nocase) { + ch2 = Tcl_UniCharToLower(ch2); + } } while (1) { @@ -2230,26 +2235,44 @@ Tcl_StringCaseMatch( Tcl_UniChar startChar, endChar; pattern++; - str += TclUtfToUniChar(str, &ch1); - if (nocase) { - ch1 = Tcl_UniCharToLower(ch1); + if (UCHAR(*str) < 0x80) { + ch1 = (Tcl_UniChar) + (nocase ? tolower(UCHAR(*str)) : UCHAR(*str)); + str++; + } else { + str += Tcl_UtfToUniChar(str, &ch1); + if (nocase) { + ch1 = Tcl_UniCharToLower(ch1); + } } while (1) { if ((*pattern == ']') || (*pattern == '\0')) { return 0; } - pattern += TclUtfToUniChar(pattern, &startChar); - if (nocase) { - startChar = Tcl_UniCharToLower(startChar); + if (UCHAR(*pattern) < 0x80) { + startChar = (Tcl_UniChar) (nocase + ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); + pattern++; + } else { + pattern += Tcl_UtfToUniChar(pattern, &startChar); + if (nocase) { + startChar = Tcl_UniCharToLower(startChar); + } } if (*pattern == '-') { pattern++; if (*pattern == '\0') { return 0; } - pattern += TclUtfToUniChar(pattern, &endChar); - if (nocase) { - endChar = Tcl_UniCharToLower(endChar); + if (UCHAR(*pattern) < 0x80) { + endChar = (Tcl_UniChar) (nocase + ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); + pattern++; + } else { + pattern += Tcl_UtfToUniChar(pattern, &endChar); + if (nocase) { + endChar = Tcl_UniCharToLower(endChar); + } } if (((startChar <= ch1) && (ch1 <= endChar)) || ((endChar <= ch1) && (ch1 <= startChar))) { -- cgit v0.12