From 1aea640959f4dd7ae9922e1e80099f08d62c6684 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 7 Apr 2020 18:27:20 +0000 Subject: closes regression in string trimright [c61818e4c9] without modifying Tcl_UtfPrev (so a certain inconsistency between Tcl_UtfPrev/TclUtfToUniChar still remains) --- generic/tclUtil.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 874e2a5..cf0bdaf 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1573,8 +1573,7 @@ TrimRight( const char *trim, /* String of trim characters... */ int numTrim) /* ...and its length in bytes */ { - const char *p = bytes + numBytes; - int pInc; + const char *pp, *p = bytes + numBytes; /* Outer loop: iterate over string to be trimmed */ do { @@ -1582,8 +1581,8 @@ TrimRight( const char *q = trim; int bytesLeft = numTrim; - p = Tcl_UtfPrev(p, bytes); - pInc = TclUtfToUniChar(p, &ch1); + pp = Tcl_UtfPrev(p, bytes); + (void)TclUtfToUniChar(pp, &ch1); /* Inner loop: scan trim string for match to current character */ do { @@ -1600,9 +1599,9 @@ TrimRight( if (bytesLeft == 0) { /* No match; trim task done; *p is last non-trimmed char */ - p += pInc; break; } + p = pp; } while (p > bytes); return numBytes - (p - bytes); -- cgit v0.12 From 8bdc1b8e328ecf025cade82185e8d44fdf35a559 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 7 Apr 2020 20:04:25 +0000 Subject: added test case covering [c61818e4c9] - string trim on an invalid utf-8 sequence (mistakenly considers the NTS zero byte as a continuation of a utf-8 pair) --- tests/string.test | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/string.test b/tests/string.test index 9a5e0c0..343ccb5 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1459,6 +1459,23 @@ test string-20.4 {string trimright} { test string-20.5 {string trimright} { string trimright "" } {} +test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as continuation char), bug [c61818e4c9]} 
-setup { + interp alias {} bytes {} encoding convertfrom identity +} -body { + set result {} + set a [bytes \xc0\x80\x88] + set b foo$a + set m [list \u0000 U \x88 V [bytes \x88] W] + lappend result [string map $m $b] + lappend result [string map $m [string trimright $b x]] + lappend result [string map $m [string trimright $b \u0000]] + lappend result [string map $m [string trimleft $b fox]] + lappend result [string map $m [string trimleft $b fo\u0000]] + lappend result [string map $m [string trim $b fox]] + lappend result [string map $m [string trim $b fo\u0000]] +} -result [list {*}[lrepeat 3 fooUV] {*}[lrepeat 2 UV V]] -cleanup { + interp alias {} bytes {} +} test string-21.1 {string wordend} { list [catch {string wordend a} msg] $msg -- cgit v0.12 From 2227dd53ffef41928d6beedcde35df43cb31bf82 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 7 Apr 2020 20:05:24 +0000 Subject: fixes [c61818e4c9] for all variants of string trim --- generic/tclUtil.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index cf0bdaf..cb5072b 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1582,14 +1582,17 @@ TrimRight( int bytesLeft = numTrim; pp = Tcl_UtfPrev(p, bytes); - (void)TclUtfToUniChar(pp, &ch1); + (void)TclUtfToUniChar(pp, &ch1); /* Inner loop: scan trim string for match to current character */ do { Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - if (ch1 == ch2) { + /* compare chars and real length of char, e.g. if TclUtfToUniChar + * mistakenly considers NTS 0-byte as a continuation of invalid utf-8 + * sequence, bug [c61818e4c9] */ + if (ch1 == ch2 && p - pp == qInc) { break; } @@ -1671,12 +1674,17 @@ TrimLeft( const char *q = trim; int bytesLeft = numTrim; + /* take care about real length of char, e.g. 
if TclUtfToUniChar would + * mistakenly consider NTS 0-byte as a continuation of invalid utf-8 + * sequence, bug [c61818e4c9] */ + if (pInc > numBytes) {pInc = numBytes;} + /* Inner loop: scan trim string for match to current character */ do { Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - if (ch1 == ch2) { + if (ch1 == ch2 && pInc == qInc) { break; } -- cgit v0.12