From 1aea640959f4dd7ae9922e1e80099f08d62c6684 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 7 Apr 2020 18:27:20 +0000 Subject: closes regression in string trimright [c61818e4c9] without modifying of Tcl_UtfPrev (so certain inconsistency by Tcl_UtfPrev/TclUtfToUniChar still remains) --- generic/tclUtil.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 874e2a5..cf0bdaf 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1573,8 +1573,7 @@ TrimRight( const char *trim, /* String of trim characters... */ int numTrim) /* ...and its length in bytes */ { - const char *p = bytes + numBytes; - int pInc; + const char *pp, *p = bytes + numBytes; /* Outer loop: iterate over string to be trimmed */ do { @@ -1582,8 +1581,8 @@ TrimRight( const char *q = trim; int bytesLeft = numTrim; - p = Tcl_UtfPrev(p, bytes); - pInc = TclUtfToUniChar(p, &ch1); + pp = Tcl_UtfPrev(p, bytes); + (void)TclUtfToUniChar(pp, &ch1); /* Inner loop: scan trim string for match to current character */ do { @@ -1600,9 +1599,9 @@ TrimRight( if (bytesLeft == 0) { /* No match; trim task done; *p is last non-trimmed char */ - p += pInc; break; } + p = pp; } while (p > bytes); return numBytes - (p - bytes); -- cgit v0.12 From 38179215a43e8ba972f4f6baebf2aef347682b53 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 7 Apr 2020 19:36:54 +0000 Subject: Set of tests demonstrating flaws in Tcl_UtfPrev (as viewed through a fragile implementation of [string trimright]). See ticket [c61818e4c9]. --- tests/utf.test | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index e8ee374..5d67b36 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -95,7 +95,74 @@ test utf-6.1 {Tcl_UtfNext} { } {} test utf-7.1 {Tcl_UtfPrev} { + string trimright {} X } {} +test utf-7.2 {Tcl_UtfPrev} { + string trimright A X +} A +test utf-7.3 {Tcl_UtfPrev} { + string trimright AA X +} AA +test utf-7.4 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8] X +} [bytestring A\xF8] +test utf-7.5 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4] X +} [bytestring A\xF4] +test utf-7.6 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8] X +} [bytestring A\xE8] +test utf-7.7 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0] X +} [bytestring A\xD0] +test utf-7.8 {Tcl_UtfPrev} { + string trimright [bytestring A\xA0] X +} [bytestring A\xA0] +test utf-7.9 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8\xA0] X +} [bytestring A\xF8\xA0] +test utf-7.10 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0] X +} [bytestring A\xF4\xA0] +test utf-7.11 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8\xA0] X +} [bytestring A\xE8\xA0] +test utf-7.12 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0\xA0] X +} [bytestring A\xD0\xA0] +test utf-7.13 {Tcl_UtfPrev} { + string trimright [bytestring A\xA0\xA0] X +} [bytestring A\xA0\xA0] +test utf-7.14 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8\xA0\xA0] X +} [bytestring A\xF8\xA0\xA0] +test utf-7.15 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0\xA0] X +} [bytestring A\xF4\xA0\xA0] +test utf-7.16 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8\xA0\xA0] X +} [bytestring A\xE8\xA0\xA0] +test utf-7.17 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0\xA0\xA0] X +} [bytestring A\xD0\xA0\xA0] +test utf-7.18 {Tcl_UtfPrev} { + string trimright [bytestring A\xA0\xA0\xA0] X +} [bytestring A\xA0\xA0\xA0] +test utf-7.19 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8\xA0\xA0\xA0] X +} [bytestring A\xF8\xA0\xA0\xA0] +test utf-7.20 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0\xA0\xA0] X +} [bytestring A\xF4\xA0\xA0\xA0] +test utf-7.21 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8\xA0\xA0\xA0] X +} [bytestring A\xE8\xA0\xA0\xA0] +test utf-7.22 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0\xA0\xA0\xA0] X +} [bytestring A\xD0\xA0\xA0\xA0] +test utf-7.23 {Tcl_UtfPrev} { + string trimright [bytestring A\xA0\xA0\xA0\xA0] X +} [bytestring A\xA0\xA0\xA0\xA0] test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From e3ec61b93b66246ae5cf63706bdd4a89fc9f0876 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 7 Apr 2020 19:52:15 +0000 Subject: More tests that should continue to demo faults in Tcl_UtfPrev after [string trimright] implementation is improved. --- tests/utf.test | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 5d67b36..de529a6 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -164,6 +164,29 @@ test utf-7.23 {Tcl_UtfPrev} { string trimright [bytestring A\xA0\xA0\xA0\xA0] X } [bytestring A\xA0\xA0\xA0\xA0] +test utf-7.24 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8\xA0] [bytestring \xF8] +} [bytestring A\xF8\xA0] +test utf-7.25 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0] [bytestring \xF4] +} [bytestring A\xF4\xA0] +test utf-7.26 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8\xA0] [bytestring \xE8] +} [bytestring A\xE8\xA0] +test utf-7.27 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8\xA0\xA0] [bytestring \xF8] +} [bytestring A\xF8\xA0\xA0] +test utf-7.28 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0\xA0] [bytestring \xF4] +} [bytestring A\xF4\xA0\xA0] +test utf-7.29 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0\xA0\xA0] [bytestring \xD0] +} [bytestring A\xD0\xA0\xA0] + +test utf-7.30 {Tcl_UtfPrev} { + string trimright [bytestring A\xC0\x80\xA0] \u0000 +} [bytestring A\xC0\x80\xA0] + test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 } {a} -- cgit v0.12 From 8bdc1b8e328ecf025cade82185e8d44fdf35a559 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 7 Apr 2020 20:04:25 +0000 Subject: added test case covering [c61818e4c9] - string trim for not valid utf-8 sequence (mistakenly considers NTS-zero char as a continuation of utf-8 pair) --- tests/string.test | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/string.test b/tests/string.test index 9a5e0c0..343ccb5 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1459,6 +1459,23 @@ test string-20.4 {string trimright} { test string-20.5 {string trimright} { string trimright "" } {} +test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as continuation char), bug [c61818e4c9]} -setup { + interp alias {} bytes {} encoding convertfrom identity +} -body { + set result {} + set a [bytes \xc0\x80\x88] + set b foo$a + set m [list \u0000 U \x88 V [bytes \x88] W] + lappend result [string map $m $b] + lappend result [string map $m [string trimright $b x]] + lappend result [string map $m [string trimright $b \u0000]] + lappend result [string map $m [string trimleft $b fox]] + lappend result [string map $m [string trimleft $b fo\u0000]] + lappend result [string map $m [string trim $b fox]] + lappend result [string map $m [string trim $b fo\u0000]] +} -result [list {*}[lrepeat 3 fooUV] {*}[lrepeat 2 UV V]] -cleanup { + interp alias {} bytes {} +} test string-21.1 {string wordend} { list [catch {string wordend a} msg] $msg -- cgit v0.12 From 2227dd53ffef41928d6beedcde35df43cb31bf82 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 7 Apr 2020 20:05:24 +0000 Subject: fixes [c61818e4c9] for all variants of string trim --- generic/tclUtil.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index cf0bdaf..cb5072b 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1582,14 +1582,17 @@ TrimRight( int bytesLeft = numTrim; pp = Tcl_UtfPrev(p, bytes); - (void)TclUtfToUniChar(pp, &ch1); + (void)TclUtfToUniChar(pp, &ch1); /* Inner loop: scan trim string for match to current character */ do { Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - if (ch1 == ch2) { + /* compare chars and real length of char, e.g. if TclUtfToUniChar + * mistakenly considers NTS 0-byte as a continuation of invalid utf-8 + * sequence, bug [c61818e4c9] */ + if (ch1 == ch2 && p - pp == qInc) { break; } @@ -1671,12 +1674,17 @@ TrimLeft( const char *q = trim; int bytesLeft = numTrim; + /* take care about real length of char, e.g. if TclUtfToUniChar would + * mistakenly consider NTS 0-byte as a continuation of invalid utf-8 + * sequence, bug [c61818e4c9] */ + if (pInc > numBytes) {pInc = numBytes;} + /* Inner loop: scan trim string for match to current character */ do { Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - if (ch1 == ch2) { + if (ch1 == ch2 && pInc == qInc) { break; } -- cgit v0.12 From fe515177dd3f500c04c593db04baa6a8735ecd3b Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 7 Apr 2020 21:06:58 +0000 Subject: New testing command so we can directly demonstrate flaws. --- generic/tclTest.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/generic/tclTest.c b/generic/tclTest.c index 66b2233..bfed72e 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -433,6 +433,7 @@ static int SimpleMatchInDirectory( Tcl_Interp *interp, Tcl_Obj *resultPtr, Tcl_Obj *dirPtr, const char *pattern, Tcl_GlobTypeData *types); +static Tcl_ObjCmdProc TestUtfPrevCmd; static int TestNumUtfCharsCmd(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); @@ -690,6 +691,8 @@ Tcltest_Init( (ClientData) 0, NULL); Tcl_CreateObjCommand(interp, "testsetobjerrorcode", TestsetobjerrorcodeCmd, (ClientData) 0, NULL); + Tcl_CreateObjCommand(interp, "testutfprev", + TestUtfPrevCmd, (ClientData) 0, NULL); Tcl_CreateObjCommand(interp, "testnumutfchars", TestNumUtfCharsCmd, (ClientData) 0, NULL); Tcl_CreateCommand(interp, "testsetplatform", TestsetplatformCmd, @@ -7094,6 +7097,51 @@ SimpleListVolumes(void) } /* + * Used to check operations of Tcl_UtfPrev. + * + * Usage: testutfprev $bytes $offset + */ + +static int +TestUtfPrevCmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *const objv[]) +{ + int numBytes, offset; + char *bytes; + const char *result; + Tcl_Obj *copy; + + if (objc != 3) { + Tcl_WrongNumArgs(interp, 1, objv, "bytes offset"); + return TCL_ERROR; + } + + bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); + + if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { + return TCL_ERROR; + } + if (offset < 0) { + offset = 0; + } + if (offset > numBytes) { + offset = numBytes; + } + copy = Tcl_DuplicateObj(objv[1]); + bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); + bytes[numBytes] = '\0'; + + result = Tcl_UtfPrev(bytes + offset, bytes); + + Tcl_DecrRefCount(copy); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); + return TCL_OK; +} + +/* * Used to check correct string-length determining in Tcl_NumUtfChars */ -- cgit v0.12 From bf7064c9adf77f3184dc5efcaa49e8d05da728cf Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 7 Apr 2020 21:49:02 +0000 Subject: Convert the tests to use the testing command. --- generic/tclTest.c | 22 +++--- tests/utf.test | 208 ++++++++++++++++++++++++++++++------------------------ 2 files changed, 129 insertions(+), 101 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index bfed72e..506cef9 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -7114,20 +7114,24 @@ TestUtfPrevCmd( const char *result; Tcl_Obj *copy; - if (objc != 3) { - Tcl_WrongNumArgs(interp, 1, objv, "bytes offset"); + if (objc < 2 || objc > 3) { + Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?"); return TCL_ERROR; } bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); - if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { - return TCL_ERROR; - } - if (offset < 0) { - offset = 0; - } - if (offset > numBytes) { + if (objc == 3) { + if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { + return TCL_ERROR; + } + if (offset < 0) { + offset = 0; + } + if (offset > numBytes) { + offset = numBytes; + } + } else { offset = numBytes; } copy = Tcl_DuplicateObj(objv[1]); diff --git a/tests/utf.test b/tests/utf.test index de529a6..7fe0b4e 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -94,98 +94,122 @@ test utf-5.1 {Tcl_UtfFindFirsts} { test utf-6.1 {Tcl_UtfNext} { } {} -test utf-7.1 {Tcl_UtfPrev} { - string trimright {} X -} {} -test utf-7.2 {Tcl_UtfPrev} { - string trimright A X -} A -test utf-7.3 {Tcl_UtfPrev} { - string trimright AA X -} AA -test utf-7.4 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8] X -} [bytestring A\xF8] -test utf-7.5 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4] X -} [bytestring A\xF4] -test utf-7.6 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8] X -} [bytestring A\xE8] -test utf-7.7 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0] X -} [bytestring A\xD0] -test utf-7.8 {Tcl_UtfPrev} { - string trimright [bytestring A\xA0] X -} [bytestring A\xA0] -test utf-7.9 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0] X -} [bytestring A\xF8\xA0] -test utf-7.10 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4\xA0] X -} [bytestring A\xF4\xA0] -test utf-7.11 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8\xA0] X -} [bytestring A\xE8\xA0] -test utf-7.12 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0\xA0] X -} [bytestring A\xD0\xA0] -test utf-7.13 {Tcl_UtfPrev} { - string trimright [bytestring A\xA0\xA0] X -} [bytestring A\xA0\xA0] -test utf-7.14 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0\xA0] X -} [bytestring A\xF8\xA0\xA0] -test utf-7.15 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4\xA0\xA0] X -} [bytestring A\xF4\xA0\xA0] -test utf-7.16 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8\xA0\xA0] X -} [bytestring A\xE8\xA0\xA0] -test utf-7.17 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0\xA0\xA0] X -} [bytestring A\xD0\xA0\xA0] -test utf-7.18 {Tcl_UtfPrev} { - string trimright [bytestring A\xA0\xA0\xA0] X -} [bytestring A\xA0\xA0\xA0] -test utf-7.19 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0\xA0\xA0] X -} [bytestring A\xF8\xA0\xA0\xA0] -test utf-7.20 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4\xA0\xA0\xA0] X -} [bytestring A\xF4\xA0\xA0\xA0] -test utf-7.21 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8\xA0\xA0\xA0] X -} [bytestring A\xE8\xA0\xA0\xA0] -test utf-7.22 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0\xA0\xA0\xA0] X -} [bytestring A\xD0\xA0\xA0\xA0] -test utf-7.23 {Tcl_UtfPrev} { - string trimright [bytestring A\xA0\xA0\xA0\xA0] X -} [bytestring A\xA0\xA0\xA0\xA0] - -test utf-7.24 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0] [bytestring \xF8] -} [bytestring A\xF8\xA0] -test utf-7.25 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4\xA0] [bytestring \xF4] -} [bytestring A\xF4\xA0] -test utf-7.26 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8\xA0] [bytestring \xE8] -} [bytestring A\xE8\xA0] -test utf-7.27 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0\xA0] [bytestring \xF8] -} [bytestring A\xF8\xA0\xA0] -test utf-7.28 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4\xA0\xA0] [bytestring \xF4] -} [bytestring A\xF4\xA0\xA0] -test utf-7.29 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0\xA0\xA0] [bytestring \xD0] -} [bytestring A\xD0\xA0\xA0] - -test utf-7.30 {Tcl_UtfPrev} { - string trimright [bytestring A\xC0\x80\xA0] \u0000 -} [bytestring A\xC0\x80\xA0] +testConstraint testutfprev [llength [info commands testutfprev]] + +test utf-7.1 {Tcl_UtfPrev} testutfprev { + testutfprev {} +} 0 +test utf-7.2 {Tcl_UtfPrev} testutfprev { + testutfprev A +} 0 +test utf-7.3 {Tcl_UtfPrev} testutfprev { + testutfprev AA +} 1 +test utf-7.4 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8 +} 1 +test utf-7.4.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 2 +} 1 +test utf-7.5 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4 +} 1 +test utf-7.5.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 2 +} 1 +test utf-7.6 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8 +} 1 +test utf-7.6.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 2 +} 1 +test utf-7.7 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0 +} 1 +test utf-7.7.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 2 +} 1 +test utf-7.8 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0 +} 1 +test utf-7.8.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 2 +} 1 +test utf-7.9 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0 +} 2 +test utf-7.9.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 3 +} 2 +test utf-7.10 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0 +} 2 +test utf-7.10.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 3 +} 2 +test utf-7.11 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0 +} 2 +test utf-7.11.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 3 +} 1 +test utf-7.12 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0 +} 1 +test utf-7.12.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 3 +} 1 +test utf-7.13 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0 +} 2 +test utf-7.13.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 3 +} 2 +test utf-7.14 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0 +} 3 +test utf-7.14.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 4 +} 3 +test utf-7.15 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0 +} 3 +test utf-7.15.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 4 +} 3 +test utf-7.16 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0 +} 1 +test utf-7.16.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 4 +} 1 +test utf-7.17 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0 +} 3 +test utf-7.17.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 4 +} 3 +test utf-7.18 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0 +} 3 +test utf-7.18.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 4 +} 3 +test utf-7.19 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 +} 4 +test utf-7.20 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 +} 4 +test utf-7.21 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 +} 4 +test utf-7.22 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 +} 4 +test utf-7.23 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 +} 4 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From ac03c44f432374514af20b60a1aac369b9147c10 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 8 Apr 2020 14:09:48 +0000 Subject: more tests --- tests/utf.test | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 7fe0b4e..9ce2b64 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -111,90 +111,135 @@ test utf-7.4 {Tcl_UtfPrev} testutfprev { test utf-7.4.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 2 } 1 +test utf-7.4.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xF8\xA0\xA0 2 +} 1 test utf-7.5 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4 } 1 test utf-7.5.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0\xA0\xA0 2 } 1 +test utf-7.5.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xF8\xA0\xA0 2 +} 1 test utf-7.6 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8 } 1 test utf-7.6.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 2 } 1 +test utf-7.6.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xF8\xA0\xA0 2 +} 1 test utf-7.7 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0 } 1 test utf-7.7.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0\xA0\xA0 2 } 1 +test utf-7.7.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xF8\xA0\xA0 2 +} 1 test utf-7.8 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0 } 1 test utf-7.8.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0\xA0\xA0 2 } 1 +test utf-7.8.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xF8\xA0\xA0 2 +} 1 test utf-7.9 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0 } 2 test utf-7.9.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 3 } 2 +test utf-7.9.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xF8\xA0 3 +} 2 test utf-7.10 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0 } 2 test utf-7.10.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0\xA0\xA0 3 } 2 +test utf-7.10.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xF8\xA0 3 +} 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0 } 2 test utf-7.11.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 3 } 1 +test utf-7.11.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xF8\xA0 3 +} 2 test utf-7.12 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0 } 1 test utf-7.12.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0\xA0\xA0 3 } 1 +test utf-7.12.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xF8\xA0 3 +} 1 test utf-7.13 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0 } 2 test utf-7.13.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0\xA0\xA0 3 } 2 +test utf-7.13.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xF8\xA0 3 +} 2 test utf-7.14 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0 } 3 test utf-7.14.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 4 } 3 +test utf-7.14.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xF8 4 +} 3 test utf-7.15 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0\xA0 } 3 test utf-7.15.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0\xA0\xA0 4 } 3 +test utf-7.15.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xF8 4 +} 3 test utf-7.16 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0 } 1 test utf-7.16.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 4 } 1 +test utf-7.16.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xF8 4 +} 1 test utf-7.17 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0\xA0 } 3 test utf-7.17.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0\xA0\xA0 4 } 3 +test utf-7.17.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xF8 4 +} 3 test utf-7.18 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0\xA0 } 3 test utf-7.18.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0\xA0\xA0 4 } 3 +test utf-7.18.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xF8 4 +} 3 test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 -- cgit v0.12 From ac0e8526f0e0d8b60502c6a92ed6e4b06c9ebd02 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 8 Apr 2020 16:43:09 +0000 Subject: Restore the original Tcl_UtfPrev routine. Fails a different set of tests. Many fewer. --- generic/tclUtf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 7d3db57..b66a2eb 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -693,6 +693,9 @@ Tcl_UtfPrev( break; } if (byte >= 0xC0) { + if (totalBytes[byte] != i + 1) { + break; + } return look; } look--; -- cgit v0.12 From a0961e94367316cf621eec486300b53b8411bd47 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 8 Apr 2020 17:31:58 +0000 Subject: Apply better bug fix that does not create new bugs this time. --- generic/tclUtf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b66a2eb..b7e8f5e 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -693,7 +693,7 @@ Tcl_UtfPrev( break; } if (byte >= 0xC0) { - if (totalBytes[byte] != i + 1) { + if (totalBytes[byte] <= i) { break; } return look; -- cgit v0.12 From 41a89c36a8f7b088c02c032fa5f61056dfd1f383 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 8 Apr 2020 18:07:07 +0000 Subject: Cherry pick the [string trim] changes. --- generic/tclUtil.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index cb5072b..f4879a1 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1589,10 +1589,7 @@ TrimRight( Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - /* compare chars and real length of char, e.g. if TclUtfToUniChar - * mistakenly considers NTS 0-byte as a continuation of invalid utf-8 - * sequence, bug [c61818e4c9] */ - if (ch1 == ch2 && p - pp == qInc) { + if (ch1 == ch2) { break; } @@ -1604,8 +1601,7 @@ TrimRight( /* No match; trim task done; *p is last non-trimmed char */ break; } - p = pp; - } while (p > bytes); + } while ((p = pp) > bytes); return numBytes - (p - bytes); } @@ -1684,7 +1680,7 @@ TrimLeft( Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - if (ch1 == ch2 && pInc == qInc) { + if (ch1 == ch2) { break; } -- cgit v0.12 From 94a4f5be20584689a3ab5fe061fbc304c7cd18b3 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 9 Apr 2020 15:56:26 +0000 Subject: New tests demonstrating bug in TclNeedSpace(): improper handling escaped space. --- tests/util.test | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/util.test b/tests/util.test index 61a1790..294d883 100644 --- a/tests/util.test +++ b/tests/util.test @@ -522,6 +522,51 @@ test util-8.6 {TclNeedSpace - correct UTF8 handling} testdstring { testdstring append \} -1 list [llength [testdstring get]] [string length [testdstring get]] } {2 9} +test util-8.7 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\ } -1 + testdstring start + testdstring end + + # Should make {\ {}} + list [llength [testdstring get]] [string index [testdstring get] 3] +} {2 \{} +test util-8.8 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\\ } -1 + testdstring start + testdstring end + + # Should make {\\ {}} + list [llength [testdstring get]] [string index [testdstring get] 3] +} {2 \{} +test util-8.9 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\\\ } -1 + testdstring start + testdstring end + + # Should make {\\\ {}} + list [llength [testdstring get]] [string index [testdstring get] 5] +} {2 \{} +test util-8.10 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\\\\\\\ } -1 + testdstring start + testdstring end + + # Should make {\\\\\\\ {}} + list [llength [testdstring get]] [string index [testdstring get] 9] +} {2 \{} +test util-8.11 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\\\\\\\\ } -1 + testdstring start + testdstring end + + # Should make {\\\\\\\\ {}} + list [llength [testdstring get]] [string index [testdstring get] 9] +} {2 \{} test util-9.0.0 {TclGetIntForIndex} { string index abcd 0 -- cgit v0.12 From 77dbe8dc087788e733edf13dedd31202b18fded4 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 9 Apr 2020 16:24:34 +0000 Subject: [afa4b28153] Correct TclNeedSpace handling of trailing escaped space. --- generic/tclUtil.c | 44 ++++++++++++++++++++++++++++++++++++-------- tests/util.test | 10 ++-------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 874e2a5..7ec224e 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -3225,25 +3225,49 @@ TclNeedSpace( /* * A space is needed unless either: * (a) we're at the start of the string, or - */ + * + * (NOTE: This check is now absorbed into the loop below.) + * if (end == start) { return 0; } + * + */ + /* * (b) we're at the start of a nested list-element, quoted with an open * curly brace; we can be nested arbitrarily deep, so long as the * first curly brace starts an element, so backtrack over open curly * braces that are trailing characters of the string; and - */ + * + * (NOTE: Every character our parser is looking for is a proper + * single-byte encoding of an ASCII value. It does not accept + * overlong encodings. Given that, there's no benefit using + * Tcl_UtfPrev. If it would find what we seek, so would byte-by-byte + * backward scan. Save routine call overhead and risk of wrong + * results should the behavior of Tcl_UtfPrev change in unexpected ways. + * Reconsider this if we ever start treating non-ASCII Unicode + * characters as meaningful list syntax, expanded Unicode spaces as + * element separators, for example.) + * end = Tcl_UtfPrev(end, start); while (*end == '{') { - if (end == start) { - return 0; - } - end = Tcl_UtfPrev(end, start); + if (end == start) { + return 0; + } + end = Tcl_UtfPrev(end, start); + } + + * + */ + + while ((--end >= start) && (*end == '{')) { + } + if (end < start) { + return 0; } /* @@ -3278,8 +3302,12 @@ TclNeedSpace( case '\r': case '\v': case '\f': - if ((end == start) || (end[-1] != '\\')) { - return 0; + { + int result = 0; + while ((--end >= start) && (*end == '\\')) { + result = !result; + } + return result; } } return 1; diff --git a/tests/util.test b/tests/util.test index 294d883..46d9152 100644 --- a/tests/util.test +++ b/tests/util.test @@ -503,25 +503,19 @@ test util-8.4 {TclNeedSpace - correct UTF8 handling} testdstring { llength [testdstring get] } 2 test util-8.5 {TclNeedSpace - correct UTF8 handling} testdstring { - # Note that in this test TclNeedSpace actually gets it wrong, - # claiming we need a space when we really do not. Extra space - # between list elements is harmless though, and better to have - # extra space in really weird string reps of lists, than to - # invest the effort required to make TclNeedSpace foolproof. testdstring free testdstring append {\\ } -1 testdstring element foo list [llength [testdstring get]] [string length [testdstring get]] -} {2 7} +} {2 6} test util-8.6 {TclNeedSpace - correct UTF8 handling} testdstring { - # Another example of TclNeedSpace harmlessly getting it wrong. testdstring free testdstring append {\\ } -1 testdstring append \{ -1 testdstring element foo testdstring append \} -1 list [llength [testdstring get]] [string length [testdstring get]] -} {2 9} +} {2 8} test util-8.7 {TclNeedSpace - watch out for escaped space} { testdstring free testdstring append {\ } -1 -- cgit v0.12 From e2b3b1edf02e0c1ab06cfe784d6a7bd76e8050d8 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 9 Apr 2020 17:30:06 +0000 Subject: Guarantee TclNeedSpace and TclFindElement have common definition of whitespace by having both call the same routine. Create a macro form to contain performance costs and adapt callers. --- generic/tclCmdAH.c | 2 +- generic/tclCmdMZ.c | 2 +- generic/tclDate.c | 2 +- generic/tclInt.h | 11 ++++- generic/tclParse.c | 2 +- generic/tclStrToD.c | 6 +-- generic/tclUtf.c | 2 +- generic/tclUtil.c | 120 ++++++++++++++++++++++------------------------------ unix/tclUnixFile.c | 2 +- 9 files changed, 69 insertions(+), 80 deletions(-) diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index 06743d6..f30396b 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -145,7 +145,7 @@ Tcl_CaseObjCmd( pat = TclGetString(caseObjv[i]); for (p = pat; *p != '\0'; p++) { - if (TclIsSpaceProc(*p) || (*p == '\\')) { + if (TclIsSpaceProcM(*p) || (*p == '\\')) { break; } } diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 255fca1..d4fa4e9 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1648,7 +1648,7 @@ StringIsCmd( * if it is the first "element" that has the failure. */ - while (TclIsSpaceProc(*p)) { + while (TclIsSpaceProcM(*p)) { p++; } TclNewStringObj(tmpStr, string1, p-string1); diff --git a/generic/tclDate.c b/generic/tclDate.c index 2cf20d6..8d37f3d 100644 --- a/generic/tclDate.c +++ b/generic/tclDate.c @@ -2684,7 +2684,7 @@ TclDatelex( location->first_column = yyInput - info->dateStart; for ( ; ; ) { - while (TclIsSpaceProc(*yyInput)) { + while (TclIsSpaceProcM(*yyInput)) { yyInput++; } diff --git a/generic/tclInt.h b/generic/tclInt.h index 06cff60..15bc000 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2608,7 +2608,6 @@ MODULE_SCOPE void TclInitNotifier(void); MODULE_SCOPE void TclInitObjSubsystem(void); MODULE_SCOPE void TclInitSubsystems(void); MODULE_SCOPE int TclInterpReady(Tcl_Interp *interp); -MODULE_SCOPE int TclIsSpaceProc(char byte); MODULE_SCOPE int TclIsBareword(char byte); MODULE_SCOPE int TclJoinThread(Tcl_ThreadId id, int *result); MODULE_SCOPE void TclLimitRemoveAllHandlers(Tcl_Interp *interp); @@ -2800,6 +2799,16 @@ MODULE_SCOPE Tcl_Obj * TclDisassembleByteCodeObj(Tcl_Obj *objPtr); MODULE_SCOPE int TclUtfCasecmp(CONST char *cs, CONST char *ct); /* + * Many parsing tasks need a common definition of whitespace. + * Use this routine and macro to achieve that and place + * optimization (fragile on changes) in one place. + */ + +MODULE_SCOPE int TclIsSpaceProc(char byte); +# define TclIsSpaceProcM(byte) \ + (((byte) > 0x20) ? 0 : TclIsSpaceProc(byte)) + +/* *---------------------------------------------------------------- * Command procedures in the generic core: *---------------------------------------------------------------- diff --git a/generic/tclParse.c b/generic/tclParse.c index 6f989d9..7bead99 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -1809,7 +1809,7 @@ Tcl_ParseBraces( openBrace = 0; break; case '#' : - if (openBrace && TclIsSpaceProc(src[-1])) { + if (openBrace && TclIsSpaceProcM(src[-1])) { Tcl_AppendResult(parsePtr->interp, ": possible unbalanced brace in comment", NULL); goto error; diff --git a/generic/tclStrToD.c b/generic/tclStrToD.c index 4359829..3776521 100644 --- a/generic/tclStrToD.c +++ b/generic/tclStrToD.c @@ -533,7 +533,7 @@ TclParseNumber( * I, N, and whitespace. */ - if (TclIsSpaceProc(c)) { + if (TclIsSpaceProcM(c)) { if (flags & TCL_PARSE_NO_WHITESPACE) { goto endgame; } @@ -1053,7 +1053,7 @@ TclParseNumber( } /* FALLTHROUGH */ case sNANPAREN: - if (TclIsSpaceProc(c)) { + if (TclIsSpaceProcM(c)) { break; } if (numSigDigs < 13) { @@ -1107,7 +1107,7 @@ TclParseNumber( * Accept trailing whitespace. */ - while (len != 0 && TclIsSpaceProc(*p)) { + while (len != 0 && TclIsSpaceProcM(*p)) { p++; len--; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 7d3db57..9aaf506 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1610,7 +1610,7 @@ Tcl_UniCharIsSpace( */ if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { - return TclIsSpaceProc((char) ch); + return TclIsSpaceProcM((char) ch); } else if ((Tcl_UniChar) ch == 0x180E || (Tcl_UniChar) ch == 0x202F) { return 1; } else { diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 7ec224e..0b8ec2d 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -396,20 +396,20 @@ TclMaxListLength( } /* No list element before leading white space */ - count += 1 - TclIsSpaceProc(*bytes); + count += 1 - TclIsSpaceProcM(*bytes); /* Count white space runs as potential element separators */ while (numBytes) { if ((numBytes == -1) && (*bytes == '\0')) { break; } - if (TclIsSpaceProc(*bytes)) { + if (TclIsSpaceProcM(*bytes)) { /* Space run started; bump count */ count++; do { bytes++; numBytes -= (numBytes != -1); - } while (numBytes && TclIsSpaceProc(*bytes)); + } while (numBytes && TclIsSpaceProcM(*bytes)); if ((numBytes == 0) || ((numBytes == -1) && (*bytes == '\0'))) { break; } @@ -420,7 +420,7 @@ TclMaxListLength( } /* No list element following trailing white space */ - count -= TclIsSpaceProc(bytes[-1]); + count -= TclIsSpaceProcM(bytes[-1]); done: if (endPtr) { @@ -508,7 +508,7 @@ TclFindElement( */ limit = (list + listLength); - while ((p < limit) && (TclIsSpaceProc(*p))) { + while ((p < limit) && (TclIsSpaceProcM(*p))) { p++; } if (p == limit) { /* no element found */ @@ -553,7 +553,7 @@ TclFindElement( } else if (openBraces == 1) { size = (p - elemStart); p++; - if ((p >= limit) || TclIsSpaceProc(*p)) { + if ((p >= limit) || TclIsSpaceProcM(*p)) { goto done; } @@ -563,7 +563,7 @@ TclFindElement( if (interp != NULL) { p2 = p; - while ((p2 < limit) && (!TclIsSpaceProc(*p2)) + while ((p2 < limit) && (!TclIsSpaceProcM(*p2)) && (p2 < p+20)) { p2++; } @@ -595,23 +595,6 @@ TclFindElement( break; /* - * Space: ignore if element is in braces or quotes; otherwise - * terminate element. - */ - - case ' ': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': - if ((openBraces == 0) && !inQuotes) { - size = (p - elemStart); - goto done; - } - break; - - /* * Double-quote: if element is in quotes then terminate it. */ @@ -619,7 +602,7 @@ TclFindElement( if (inQuotes) { size = (p - elemStart); p++; - if ((p >= limit) || TclIsSpaceProc(*p)) { + if ((p >= limit) || TclIsSpaceProcM(*p)) { goto done; } @@ -629,7 +612,7 @@ TclFindElement( if (interp != NULL) { p2 = p; - while ((p2 < limit) && (!TclIsSpaceProc(*p2)) + while ((p2 < limit) && (!TclIsSpaceProcM(*p2)) && (p2 < p+20)) { p2++; } @@ -640,6 +623,20 @@ TclFindElement( return TCL_ERROR; } break; + + default: + if (TclIsSpaceProcM(*p)) { + /* + * Space: ignore if element is in braces or quotes; + * otherwise terminate element. + */ + if ((openBraces == 0) && !inQuotes) { + size = (p - elemStart); + goto done; + } + } + break; + } p++; } @@ -666,7 +663,7 @@ TclFindElement( } done: - while ((p < limit) && (TclIsSpaceProc(*p))) { + while ((p < limit) && (TclIsSpaceProcM(*p))) { p++; } *elementPtr = elemStart; @@ -1013,12 +1010,6 @@ TclScanElement( case '[': case '$': case ';': - case ' ': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': forbidNone = 1; extra++; /* Escape sequences all one byte longer. */ #if COMPAT @@ -1056,6 +1047,15 @@ TclScanElement( } /* TODO: Panic on improper encoding? */ break; + default: + if (TclIsSpaceProcM(*p)) { + forbidNone = 1; + extra++; /* Escape sequences all one byte longer. */ +#if COMPAT + preferBrace = 1; +#endif + } + break; } length -= (length > 0); p++; @@ -1806,6 +1806,7 @@ TclTrim( */ /* The whitespace characters trimmed during [concat] operations */ +/* TODO: Find a reasonable way to guarantee in sync with TclIsSpaceProc() */ #define CONCAT_WS " \f\v\r\t\n" #define CONCAT_WS_SIZE (int) (sizeof(CONCAT_WS "") - 1) @@ -3272,43 +3273,22 @@ TclNeedSpace( /* * (c) the trailing character of the string is already a list-element - * separator (according to TclFindElement); that is, one of these - * characters: - * \u0009 \t TAB - * \u000A \n NEWLINE - * \u000B \v VERTICAL TAB - * \u000C \f FORM FEED - * \u000D \r CARRIAGE RETURN - * \u0020 SPACE - * with the condition that the penultimate character is not a - * backslash. + * separator, Use the same testing routine as TclFindElement to + * enforce consistency. */ - if (*end > 0x20) { + if (TclIsSpaceProcM(*end)) { + int result = 0; + /* - * Performance tweak. All ASCII spaces are <= 0x20. So get a quick - * answer for most characters before comparing against all spaces in - * the switch below. - * - * NOTE: Remove this if other Unicode spaces ever get accepted as - * list-element separators. + * Trailing whitespace might be part of a backslash escape + * sequence. Handle that possibility. */ - return 1; - } - switch (*end) { - case ' ': - case '\t': - case '\n': - case '\r': - case '\v': - case '\f': - { - int result = 0; - while ((--end >= start) && (*end == '\\')) { - result = !result; - } - return result; + + while ((--end >= start) && (*end == '\\')) { + result = !result; } + return result; } return 1; } @@ -3448,7 +3428,7 @@ TclGetIntForIndex( * Leading whitespace is acceptable in an index. */ - while (length && TclIsSpaceProc(*bytes)) { + while (length && TclIsSpaceProcM(*bytes)) { bytes++; length--; } @@ -3461,7 +3441,7 @@ TclGetIntForIndex( if ((savedOp != '+') && (savedOp != '-')) { goto parseError; } - if (TclIsSpaceProc(opPtr[1])) { + if (TclIsSpaceProcM(opPtr[1])) { goto parseError; } *opPtr = '\0'; @@ -3607,7 +3587,7 @@ SetEndOffsetFromAny( * after "end-" to Tcl_GetInt, then reverse for offset. */ - if (TclIsSpaceProc(bytes[4])) { + if (TclIsSpaceProcM(bytes[4])) { return TCL_ERROR; } if (Tcl_GetInt(interp, bytes+4, &offset) != TCL_OK) { @@ -3672,7 +3652,7 @@ TclCheckBadOctal( * zero. Try to generate a meaningful error message. */ - while (TclIsSpaceProc(*p)) { + while (TclIsSpaceProcM(*p)) { p++; } if (*p == '+' || *p == '-') { @@ -3685,7 +3665,7 @@ TclCheckBadOctal( while (isdigit(UCHAR(*p))) { /* INTL: digit. */ p++; } - while (TclIsSpaceProc(*p)) { + while (TclIsSpaceProcM(*p)) { p++; } if (*p == '\0') { diff --git a/unix/tclUnixFile.c b/unix/tclUnixFile.c index 65e144d..038cbf8 100644 --- a/unix/tclUnixFile.c +++ b/unix/tclUnixFile.c @@ -98,7 +98,7 @@ TclpFindExecutable( */ while (1) { - while (TclIsSpaceProc(*p)) { + while (TclIsSpaceProcM(*p)) { p++; } name = p; -- cgit v0.12 From 0834a2b225f7f2b09abce5ae03af5ff936f2a947 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 9 Apr 2020 17:43:16 +0000 Subject: Revise two tests that were detecting and forgiving a bug. --- tests/dstring.test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/dstring.test b/tests/dstring.test index 95321ec..5151d27 100644 --- a/tests/dstring.test +++ b/tests/dstring.test @@ -130,13 +130,13 @@ test dstring-2.12 {appending list elements} testdstring { testdstring get } {x #} test dstring-2.13 {appending list elements} testdstring { - # This test shows lack of sophistication in Tcl_DStringAppendElement's + # This test checks the sophistication in Tcl_DStringAppendElement's # decision about whether #-quoting can be disabled. testdstring free testdstring append "x " -1 testdstring element # testdstring get -} {x {#}} +} {x #} test dstring-3.1 {nested sublists} testdstring { testdstring free @@ -227,7 +227,7 @@ test dstring-3.9 {appending list elements} testdstring { testdstring get } {x {x #}} test dstring-3.10 {appending list elements} testdstring { - # This test shows lack of sophistication in Tcl_DStringAppendElement's + # This test checks the sophistication in Tcl_DStringAppendElement's # decision about whether #-quoting can be disabled. testdstring free testdstring append x -1 @@ -236,7 +236,7 @@ test dstring-3.10 {appending list elements} testdstring { testdstring element # testdstring end testdstring get -} {x {x {#}}} +} {x {x #}} test dstring-4.1 {truncation} testdstring { testdstring free -- cgit v0.12 From 54867c8febc798f6f9002a74120d8c9c115bf6bb Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 9 Apr 2020 18:41:12 +0000 Subject: More tests. --- tests/dstring.test | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/dstring.test b/tests/dstring.test index 5151d27..f2d8656 100644 --- a/tests/dstring.test +++ b/tests/dstring.test @@ -137,6 +137,20 @@ test dstring-2.13 {appending list elements} testdstring { testdstring element # testdstring get } {x #} +test dstring-2.14 {appending list elements} testdstring { + testdstring free + testdstring append " " -1 + testdstring element # + testdstring get +} { {#}} +test dstring-2.15 {appending list elements} testdstring { + # This test checks the sophistication in Tcl_DStringAppendElement's + # decision about whether #-quoting can be disabled. + testdstring free + testdstring append "x " -1 + testdstring element # + testdstring get +} {x #} test dstring-3.1 {nested sublists} testdstring { testdstring free @@ -237,6 +251,26 @@ test dstring-3.10 {appending list elements} testdstring { testdstring end testdstring get } {x {x #}} +test dstring-3.11 {appending list elements} testdstring { + testdstring free + testdstring append x -1 + testdstring start + testdstring append " " -1 + testdstring element # + testdstring end + testdstring get +} {x { {#}}} +test dstring-3.12 {appending list elements} testdstring { + # This test checks the sophistication in Tcl_DStringAppendElement's + # decision about whether #-quoting can be disabled. + testdstring free + testdstring append x -1 + testdstring start + testdstring append "x " -1 + testdstring element # + testdstring end + testdstring get +} {x {x #}} test dstring-4.1 {truncation} testdstring { testdstring free -- cgit v0.12 From 6b3b5aaca5dae90f4def710e2f9d88684b039505 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 9 Apr 2020 19:04:04 +0000 Subject: [085913c760] Fix Tcl_DStringAppendElement quoting of #. --- generic/tclUtil.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 0b8ec2d..be80610 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -2630,9 +2630,36 @@ Tcl_DStringAppendElement( { char *dst = dsPtr->string + dsPtr->length; int needSpace = TclNeedSpace(dsPtr->string, dst); - int flags = needSpace ? TCL_DONT_QUOTE_HASH : 0; - int newSize = dsPtr->length + needSpace - + TclScanElement(element, -1, &flags); + int flags = 0, quoteHash = 1, newSize; + + if (needSpace) { + /* + * If we need a space to separate the new element from something + * already ending the string, we're not appending the first element + * of any list, so we need not quote any leading hash character. + */ + quoteHash = 0; + } else { + /* + * We don't need a space, maybe because there's some already there. + * Checking whether we might be appending a first element is a bit + * more involved. + * + * Backtrack over all whitespace. + */ + while ((--dst >= dsPtr->string) && TclIsSpaceProcM(*dst)) { + } + + /* Call again without whitespace to confound things. */ + quoteHash = !TclNeedSpace(dsPtr->string, dst+1); + } + if (!quoteHash) { + flags |= TCL_DONT_QUOTE_HASH; + } + newSize = dsPtr->length + needSpace + TclScanElement(element, -1, &flags); + if (!quoteHash) { + flags |= TCL_DONT_QUOTE_HASH; + } /* * Allocate a larger buffer for the string if the current one isn't large @@ -2665,8 +2692,8 @@ Tcl_DStringAppendElement( element = dsPtr->string + offset; } } - dst = dsPtr->string + dsPtr->length; } + dst = dsPtr->string + dsPtr->length; /* * Convert the new string to a list element and copy it into the buffer at @@ -2677,15 +2704,8 @@ Tcl_DStringAppendElement( *dst = ' '; dst++; dsPtr->length++; - - /* - * If we need a space to separate this element from preceding stuff, - * then this element will not lead a list, and need not have it's - * leading '#' quoted. - */ - - flags |= TCL_DONT_QUOTE_HASH; } + dsPtr->length += TclConvertElement(element, -1, dst, flags); dsPtr->string[dsPtr->length] = '\0'; return dsPtr->string; -- cgit v0.12 From 18e7960200db745bdfb8936ed33e4cc7cafad557 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 9 Apr 2020 19:13:48 +0000 Subject: Same trouble fixed the same way in Tcl_AppendElement(). --- generic/tclResult.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/generic/tclResult.c b/generic/tclResult.c index 7b58d44..1b9d5c9 100644 --- a/generic/tclResult.c +++ b/generic/tclResult.c @@ -704,6 +704,7 @@ Tcl_AppendElement( char *dst; int size; int flags; + int quoteHash = 1; /* * If the string result is empty, move the object result to the string @@ -740,9 +741,17 @@ Tcl_AppendElement( * then this element will not lead a list, and need not have it's * leading '#' quoted. */ - + quoteHash = 0; + } else { + while ((--dst >= iPtr->appendResult) && TclIsSpaceProcM(*dst)) { + } + quoteHash = !TclNeedSpace(iPtr->appendResult, dst+1); + } + dst = iPtr->appendResult + iPtr->appendUsed; + if (!quoteHash) { flags |= TCL_DONT_QUOTE_HASH; } + iPtr->appendUsed += Tcl_ConvertElement(element, dst, flags); } -- cgit v0.12 From ab39084d894caaf7e9a5e362169fd1ddcdb0460c Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 9 Apr 2020 22:27:37 +0000 Subject: Bulletproof the calls to Tcl_UtfPrev in Tcl_AppendLimitedToObj. --- generic/tclStringObj.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index aeb4285..c3c85dc 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1119,12 +1119,7 @@ Tcl_AppendLimitedToObj( { String *stringPtr; int toCopy = 0; - - if (Tcl_IsShared(objPtr)) { - Tcl_Panic("%s called with shared object", "Tcl_AppendLimitedToObj"); - } - - SetStringFromAny(NULL, objPtr); + int eLen = 0; if (length < 0) { length = (bytes ? strlen(bytes) : 0); @@ -1132,6 +1127,9 @@ Tcl_AppendLimitedToObj( if (length == 0) { return; } + if (limit <= 0) { + return; + } if (length <= limit) { toCopy = length; @@ -1139,8 +1137,12 @@ Tcl_AppendLimitedToObj( if (ellipsis == NULL) { ellipsis = "..."; } - toCopy = (bytes == NULL) ? limit - : Tcl_UtfPrev(bytes+limit+1-strlen(ellipsis), bytes) - bytes; + eLen = strlen(ellipsis); + while (eLen > limit) { + eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis; + } + + toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes; } /* @@ -1149,6 +1151,11 @@ Tcl_AppendLimitedToObj( * objPtr's string rep. */ + if (Tcl_IsShared(objPtr)) { + Tcl_Panic("%s called with shared object", "Tcl_AppendLimitedToObj"); + } + + SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); if (stringPtr->hasUnicode != 0) { AppendUtfToUnicodeRep(objPtr, bytes, toCopy); @@ -1162,9 +1169,9 @@ Tcl_AppendLimitedToObj( stringPtr = GET_STRING(objPtr); if (stringPtr->hasUnicode != 0) { - AppendUtfToUnicodeRep(objPtr, ellipsis, -1); + AppendUtfToUnicodeRep(objPtr, ellipsis, eLen); } else { - AppendUtfToUtfRep(objPtr, ellipsis, -1); + AppendUtfToUtfRep(objPtr, ellipsis, eLen); } } -- cgit v0.12 From 7f995b755bfead2dfc4865e0d2fc5fb15ea7a946 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 10 Apr 2020 00:01:52 +0000 Subject: Add (disabled) test to demo the inability of [string wordstart] to handle malformed UTF-8 sequences. --- tests/string.test | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/string.test b/tests/string.test index 9a5e0c0..e1ae63a 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1542,6 +1542,11 @@ test string-22.12 {string wordstart, unicode} { test string-22.13 {string wordstart, unicode} { string wordstart "\uc700\uc700 abc" 8 } 3 +test string-22.14 {string wordstart, invalid UTF-8} knownBug { + # See Bug c61818e4c9 + set demo [bytestring "abc def\xE0\xA9ghi"] + string index $demo [string wordstart $demo 10] +} g test string-23.0 {string is boolean, Bug 1187123} testindexobj { set x 5 -- cgit v0.12 From 1aa65236ca8cf733dd41fb38b4e8ae49601d8c2a Mon Sep 17 00:00:00 2001 From: dgp Date: Sat, 11 Apr 2020 22:26:30 +0000 Subject: Repair invalid utf-8 in subst.test. --- tests/subst.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/subst.test b/tests/subst.test index 933b1c6..a809f28 100644 --- a/tests/subst.test +++ b/tests/subst.test @@ -41,7 +41,7 @@ test subst-3.1 {backslash substitutions} { subst {\x\$x\[foo bar]\\} } "x\$x\[foo bar]\\" test subst-3.2 {backslash substitutions with utf chars} { - # 'j' is just a char that doesn't mean anything, and \344 is 'ä' + # 'j' is just a char that doesn't mean anything, and \344 is 'ä' # that also doesn't mean anything, but is multi-byte in UTF-8. list [subst \j] [subst \\j] [subst \\344] [subst \\\344] } "j j \344 \344" -- cgit v0.12 From c878e211c9ae8106d3afb570e8019243b50b5554 Mon Sep 17 00:00:00 2001 From: dgp Date: Sun, 12 Apr 2020 23:47:02 +0000 Subject: [2006888] Backport conversion of test file to strict ASCII. ISO-8859-1 assumption is nonportable and increasingly invalid. Fossil does not like working with files that contain invalid UTF-8 byte sequences. --- tests/stringObj.test | 75 ++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/tests/stringObj.test b/tests/stringObj.test index 3b25592..1ac8b84 100644 --- a/tests/stringObj.test +++ b/tests/stringObj.test @@ -1,16 +1,16 @@ # Commands covered: none # -# This file contains tests for the procedures in tclStringObj.c -# that implement the Tcl type manager for the string type. +# This file contains tests for the procedures in tclStringObj.c that implement +# the Tcl type manager for the string type. # -# Sourcing this file into Tcl runs the tests and generates output for -# errors. No output means no errors were found. +# Sourcing this file into Tcl runs the tests and generates output for errors. +# No output means no errors were found. # # Copyright (c) 1995-1997 Sun Microsystems, Inc. # Copyright (c) 1998-1999 by Scriptics Corporation. # -# See the file "license.terms" for information on usage and redistribution -# of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# See the file "license.terms" for information on usage and redistribution of +# this file, and for a DISCLAIMER OF ALL WARRANTIES. if {[lsearch [namespace children] ::tcltest] == -1} { package require tcltest @@ -19,7 +19,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { testConstraint testobj [llength [info commands testobj]] testConstraint testdstring [llength [info commands testdstring]] - + test stringObj-1.1 {string type registration} testobj { set t [testobj types] set first [string first "string" $t] @@ -38,7 +38,7 @@ test stringObj-3.1 {Tcl_SetStringObj, existing "empty string" object} testobj { set result "" lappend result [testobj freeallvars] lappend result [testobj newobj 1] - lappend result [teststringobj set 1 xyz] ;# makes existing obj a string + lappend result [teststringobj set 1 xyz] ;# makes existing obj a string lappend result [testobj type 1] lappend result [testobj refcount 1] } {{} {} xyz string 2} @@ -46,7 +46,7 @@ test stringObj-3.2 {Tcl_SetStringObj, existing non-"empty string" object} testob set result "" lappend result [testobj freeallvars] lappend result [testintobj set 1 512] - lappend result [teststringobj set 1 foo] ;# makes existing obj a string + lappend result [teststringobj set 1 foo] ;# makes existing obj a string lappend result [testobj type 1] lappend result [testobj refcount 1] } {{} 512 foo string 2} @@ -202,19 +202,19 @@ test stringObj-8.1 {DupStringInternalRep procedure} testobj { [teststringobj ualloc 2] [teststringobj get 2] } {5 10 0 abcde 5 5 0 abcde} test stringObj-8.2 {DupUnicodeInternalRep, mixed width chars} testobj { - set x abcï¿®ghi + set x abc\u00ef\u00bf\u00aeghi string length $x set y $x - list [testobj objtype $x] [testobj objtype $y] [append x "®¿ï"] \ + list [testobj objtype $x] [testobj objtype $y] [append x "\u00ae\u00bf\u00ef"] \ [set y] [testobj objtype $x] [testobj objtype $y] -} {string string abcï¿®ghi®¿ï abcï¿®ghi string string} +} "string string abc\u00ef\u00bf\u00aeghi\u00ae\u00bf\u00ef abc\u00ef\u00bf\u00aeghi string string" test stringObj-8.3 {DupUnicodeInternalRep, mixed width chars} testobj { - set x abcï¿®ghi + set x abc\u00ef\u00bf\u00aeghi set y $x string length $x - list [testobj objtype $x] [testobj objtype $y] [append x "®¿ï"] \ + list [testobj objtype $x] [testobj objtype $y] [append x "\u00ae\u00bf\u00ef"] \ [set y] [testobj objtype $x] [testobj objtype $y] -} {string string abcï¿®ghi®¿ï abcï¿®ghi string string} +} "string string abc\u00ef\u00bf\u00aeghi\u00ae\u00bf\u00ef abc\u00ef\u00bf\u00aeghi string string" test stringObj-8.4 {DupUnicodeInternalRep, all byte-size chars} testobj { set x abcdefghi string length $x @@ -240,13 +240,13 @@ test stringObj-9.1 {TclAppendObjToObj, mixed src & dest} {testobj testdstring} { [set y] [testobj objtype $x] [testobj objtype $y] } "string none abc\u00ef\u00bf\u00aeghi\u00ae\u00bf\u00ef \u00ae\u00bf\u00ef string none" test stringObj-9.2 {TclAppendObjToObj, mixed src & dest} testobj { - set x abcï¿®ghi + set x abc\u00ef\u00bf\u00aeghi string length $x list [testobj objtype $x] [append x $x] [testobj objtype $x] \ [append x $x] [testobj objtype $x] -} {string abcï¿®ghiabcï¿®ghi string\ -abcï¿®ghiabcï¿®ghiabcï¿®ghiabcï¿®ghi\ -string} +} "string abc\u00ef\u00bf\u00aeghiabc\u00ef\u00bf\u00aeghi string\ +abc\u00ef\u00bf\u00aeghiabc\u00ef\u00bf\u00aeghiabc\u00ef\u00bf\u00aeghiabc\u00ef\u00bf\u00aeghi\ +string" test stringObj-9.3 {TclAppendObjToObj, mixed src & 1-byte dest} {testobj testdstring} { set x abcdefghi testdstring free @@ -301,20 +301,19 @@ test stringObj-9.9 {TclAppendObjToObj, integer src & 1-byte dest} testobj { [set y] [testobj objtype $x] [testobj objtype $y] } {string int abcdefghi9 9 string int} test stringObj-9.10 {TclAppendObjToObj, integer src & mixed dest} testobj { - set x abcï¿®ghi + set x abc\u00ef\u00bf\u00aeghi set y [expr {4 + 5}] string length $x list [testobj objtype $x] [testobj objtype $y] [append x $y] \ [set y] [testobj objtype $x] [testobj objtype $y] -} {string int abcï¿®ghi9 9 string int} +} "string int abc\u00ef\u00bf\u00aeghi9 9 string int" test stringObj-9.11 {TclAppendObjToObj, mixed src & 1-byte dest index check} testobj { # bug 2678, in <=8.2.0, the second obj (the one to append) in - # Tcl_AppendObjToObj was not correctly checked to see if it was - # all one byte chars, so a unicode string would be added as one - # byte chars. + # Tcl_AppendObjToObj was not correctly checked to see if it was all one + # byte chars, so a unicode string would be added as one byte chars. set x abcdef set len [string length $x] - set y aübåcï + set y a\u00fcb\u00e5c\u00ef set len [string length $y] append x $y string length $x @@ -323,7 +322,7 @@ test stringObj-9.11 {TclAppendObjToObj, mixed src & 1-byte dest index check} tes lappend q [string index $x $i] } set q -} {a b c d e f a ü b å c ï} +} "a b c d e f a \u00fc b \u00e5 c \u00ef" test stringObj-10.1 {Tcl_GetRange with all byte-size chars} {testobj testdstring} { testdstring free @@ -336,7 +335,7 @@ test stringObj-10.2 {Tcl_GetRange with some mixed width chars} {testobj testdstr # Because this test does not use \uXXXX notation below instead of # hardcoding the values, it may fail in multibyte locales. However, we # need to test that the parser produces untyped objects even when there - # are high-ASCII characters in the input (like "ï"). I don't know what + # are high-ASCII characters in the input (like "ï"). I don't know what # else to do but inline those characters here. testdstring free testdstring append "abc\u00ef\u00efdef" -1 @@ -345,7 +344,7 @@ test stringObj-10.2 {Tcl_GetRange with some mixed width chars} {testobj testdstr [testobj objtype $x] [testobj objtype $y] } [list none "bc\u00EF\u00EFde" string string] test stringObj-10.3 {Tcl_GetRange with some mixed width chars} testobj { - # set x "abcïïdef" + # set x "abcïïdef" # Use \uXXXX notation below instead of hardcoding the values, otherwise # the test will fail in multibyte locales. set x "abc\u00EF\u00EFdef" @@ -354,7 +353,7 @@ test stringObj-10.3 {Tcl_GetRange with some mixed width chars} testobj { [testobj objtype $x] [testobj objtype $y] } [list string "bc\u00EF\u00EFde" string string] test stringObj-10.4 {Tcl_GetRange with some mixed width chars} testobj { - # set a "ïa¿b®cï¿d®" + # set a "ïa¿b®cï¿d®" # Use \uXXXX notation below instead of hardcoding the values, otherwise # the test will fail in multibyte locales. set a "\u00EFa\u00BFb\u00AEc\u00EF\u00BFd\u00AE" @@ -389,15 +388,15 @@ test stringObj-12.3 {Tcl_GetUniChar with byte-size chars} testobj { list [string index $x end] [string index $x end-1] } {i h} test stringObj-12.4 {Tcl_GetUniChar with mixed width chars} testobj { - string index "ïa¿b®c®¿dï" 0 -} "ï" + string index "\u00efa\u00bfb\u00aec\u00ae\u00bfd\u00ef" 0 +} "\u00ef" test stringObj-12.5 {Tcl_GetUniChar} testobj { - set x "ïa¿b®c®¿dï" + set x "\u00efa\u00bfb\u00aec\u00ae\u00bfd\u00ef" list [string index $x 4] [string index $x 0] -} {® ï} +} "\u00ae \u00ef" test stringObj-12.6 {Tcl_GetUniChar} testobj { - string index "ïa¿b®cï¿d®" end -} "®" + string index "\u00efa\u00bfb\u00aec\u00ef\u00bfd\u00ae" end +} "\u00ae" test stringObj-13.1 {Tcl_GetCharLength with byte-size chars} testobj { set a "" @@ -411,16 +410,16 @@ test stringObj-13.3 {Tcl_GetCharLength with byte-size chars} testobj { list [string length $a] [string length $a] } {6 6} test stringObj-13.4 {Tcl_GetCharLength with mixed width chars} testobj { - string length "®" + string length "\u00ae" } 1 test stringObj-13.5 {Tcl_GetCharLength with mixed width chars} testobj { - # string length "○○" + # string length "○○" # Use \uXXXX notation below instead of hardcoding the values, otherwise # the test will fail in multibyte locales. string length "\u00EF\u00BF\u00AE\u00EF\u00BF\u00AE" } 6 test stringObj-13.6 {Tcl_GetCharLength with mixed width chars} testobj { - # set a "ïa¿b®cï¿d®" + # set a "ïa¿b®cï¿d®" # Use \uXXXX notation below instead of hardcoding the values, otherwise # the test will fail in multibyte locales. set a "\u00EFa\u00BFb\u00AEc\u00EF\u00BFd\u00AE" -- cgit v0.12 From 7d90cc50f22d633813d2ed620542d15a55755352 Mon Sep 17 00:00:00 2001 From: dgp Date: Sun, 12 Apr 2020 23:53:27 +0000 Subject: Stop direct use of the identity encoding in tests. --- tests/encoding.test | 4 ++-- tests/parse.test | 2 +- tests/stringObj.test | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/encoding.test b/tests/encoding.test index 498e176..8722a93 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -304,13 +304,13 @@ test encoding-15.1 {UtfToUtfProc} { test encoding-15.2 {UtfToUtfProc null character output} { set x \u0000 set y [encoding convertto utf-8 \u0000] - set y [encoding convertfrom identity $y] + set y [bytestring $y] binary scan $y H* z list [string bytelength $x] [string bytelength $y] $z } {2 1 00} test encoding-15.3 {UtfToUtfProc null character input} { - set x [encoding convertfrom identity \x00] + set x [bytestring \x00] set y [encoding convertfrom utf-8 $x] binary scan [encoding convertto identity $y] H* z list [string bytelength $x] [string bytelength $y] $z diff --git a/tests/parse.test b/tests/parse.test index cd02386..13e9839 100644 --- a/tests/parse.test +++ b/tests/parse.test @@ -916,7 +916,7 @@ test parse-15.58 {CommandComplete procedure, memory leaks} { } 1 test parse-15.59 {CommandComplete procedure} { # Test for Tcl Bug 684744 - info complete [encoding convertfrom identity "\x00;if 1 \{"] + info complete [bytestring "\x00;if 1 \{"] } 0 test parse-15.60 {CommandComplete procedure} { # Test for Tcl Bug 1968882 diff --git a/tests/stringObj.test b/tests/stringObj.test index 1ac8b84..b62b768 100644 --- a/tests/stringObj.test +++ b/tests/stringObj.test @@ -427,10 +427,10 @@ test stringObj-13.6 {Tcl_GetCharLength with mixed width chars} testobj { } {10 10} test stringObj-13.7 {Tcl_GetCharLength with identity nulls} testobj { # SF bug #684699 - string length [encoding convertfrom identity \x00] + string length [bytestring \x00] } 1 test stringObj-13.8 {Tcl_GetCharLength with identity nulls} testobj { - string length [encoding convertfrom identity \x01\x00\x02] + string length [bytestring \x01\x00\x02] } 3 test stringObj-14.1 {Tcl_SetObjLength on pure unicode object} testobj { -- cgit v0.12 From 901525301c284507e65f39f5c68785ab9ec1eb16 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 00:09:43 +0000 Subject: added test case covering [c61818e4c9] - string trim for not valid utf-8 sequence (mistakenly considers NTS-zero char as a continuation of utf-8 pair) --- tests/string.test | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/string.test b/tests/string.test index e1ae63a..f6eaaf0 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1459,6 +1459,23 @@ test string-20.4 {string trimright} { test string-20.5 {string trimright} { string trimright "" } {} +test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as continuation char), bug [c61818e4c9]} -setup { + interp alias {} bytes {} encoding convertfrom identity +} -body { + set result {} + set a [bytes \xc0\x80\x88] + set b foo$a + set m [list \u0000 U \x88 V [bytes \x88] W] + lappend result [string map $m $b] + lappend result [string map $m [string trimright $b x]] + lappend result [string map $m [string trimright $b \u0000]] + lappend result [string map $m [string trimleft $b fox]] + lappend result [string map $m [string trimleft $b fo\u0000]] + lappend result [string map $m [string trim $b fox]] + lappend result [string map $m [string trim $b fo\u0000]] +} -result [list {*}[lrepeat 3 fooUV] {*}[lrepeat 2 UV V]] -cleanup { + interp alias {} bytes {} +} test string-21.1 {string wordend} { list [catch {string wordend a} msg] $msg -- cgit v0.12 From e4426ffac822281e598f797f1a787a4bd05c090b Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 00:12:26 +0000 Subject: Convert test to not directly use identity encoding. --- tests/string.test | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/string.test b/tests/string.test index f6eaaf0..72b2a49 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1459,13 +1459,11 @@ test string-20.4 {string trimright} { test string-20.5 {string trimright} { string trimright "" } {} -test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as continuation char), bug [c61818e4c9]} -setup { - interp alias {} bytes {} encoding convertfrom identity -} -body { +test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as continuation char), bug [c61818e4c9]} { set result {} - set a [bytes \xc0\x80\x88] + set a [bytestring \xc0\x80\x88] set b foo$a - set m [list \u0000 U \x88 V [bytes \x88] W] + set m [list \u0000 U \x88 V [bytestring \x88] W] lappend result [string map $m $b] lappend result [string map $m [string trimright $b x]] lappend result [string map $m [string trimright $b \u0000]] @@ -1473,9 +1471,7 @@ test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as conti lappend result [string map $m [string trimleft $b fo\u0000]] lappend result [string map $m [string trim $b fox]] lappend result [string map $m [string trim $b fo\u0000]] -} -result [list {*}[lrepeat 3 fooUV] {*}[lrepeat 2 UV V]] -cleanup { - interp alias {} bytes {} -} +} [list {*}[lrepeat 3 fooUV] {*}[lrepeat 2 UV V]] test string-21.1 {string wordend} { list [catch {string wordend a} msg] $msg -- cgit v0.12 From a8641c2da18720d07e3a0235703a9587b46d12ca Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 01:40:18 +0000 Subject: Another test for [string trimright] that demonstrates its own failures, not those of Tcl_UtfPrev. --- tests/string.test | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/string.test b/tests/string.test index 72b2a49..05a0623 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1472,6 +1472,21 @@ test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as conti lappend result [string map $m [string trim $b fox]] lappend result [string map $m [string trim $b fo\u0000]] } [list {*}[lrepeat 3 fooUV] {*}[lrepeat 2 UV V]] +test string-20.7 {[c61818e4c9] [string trimright] fails when UtfPrev is ok} { + set result {} + set a [bytestring \xE8\x80] + set b foo$a + set m [list \xE8 U \x80 V [bytestring \xE8] W [bytestring \x80] X]] + lappend result [string map $m $b] + lappend result [string map $m [string trimright $b x]] + lappend result [string map $m [string trimright $b \xE8]] + lappend result [string map $m [string trimright $b [bytestring \xE8]]] + lappend result [string map $m [string trimright $b \x80]] + lappend result [string map $m [string trimright $b [bytestring \x80]]] + lappend result [string map $m [string trimright $b \xE8\x80]] + lappend result [string map $m [string trimright $b [bytestring \xE8\x80]]] + lappend result [string map $m [string trimright $b \u0000]] +} [list {*}[lrepeat 4 fooUV] {*}[lrepeat 2 fooU] {*}[lrepeat 2 foo] fooUV] test string-21.1 {string wordend} { list [catch {string wordend a} msg] $msg -- cgit v0.12 From 40effa0f8aaae738487ed2384800e626995d776e Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 02:10:45 +0000 Subject: Cherrypick partial fix. --- generic/tclUtil.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index be80610..b3742f7 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1573,8 +1573,7 @@ TrimRight( const char *trim, /* String of trim characters... */ int numTrim) /* ...and its length in bytes */ { - const char *p = bytes + numBytes; - int pInc; + const char *pp, *p = bytes + numBytes; /* Outer loop: iterate over string to be trimmed */ do { @@ -1582,8 +1581,8 @@ TrimRight( const char *q = trim; int bytesLeft = numTrim; - p = Tcl_UtfPrev(p, bytes); - pInc = TclUtfToUniChar(p, &ch1); + pp = Tcl_UtfPrev(p, bytes); + (void)TclUtfToUniChar(pp, &ch1); /* Inner loop: scan trim string for match to current character */ do { @@ -1600,9 +1599,9 @@ TrimRight( if (bytesLeft == 0) { /* No match; trim task done; *p is last non-trimmed char */ - p += pInc; break; } + p = pp; } while (p > bytes); return numBytes - (p - bytes); -- cgit v0.12 From d1a1538c301096e5be710310329fa391a61a3b33 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 02:28:24 +0000 Subject: [c61818e4c9] [string trimright] robustly handle backing up over incomplete or malformed byte sequences. --- generic/tclUtil.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index b3742f7..d40cbeb 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1570,8 +1570,12 @@ static inline int TrimRight( const char *bytes, /* String to be trimmed... */ int numBytes, /* ...and its length in bytes */ + /* Calls to TclUtfToUniChar() in this routine + * rely on (bytes[numBytes] == '\0'). */ const char *trim, /* String of trim characters... */ int numTrim) /* ...and its length in bytes */ + /* Calls to TclUtfToUniChar() in this routine + * rely on (trim[numTrim] == '\0'). */ { const char *pp, *p = bytes + numBytes; @@ -1579,10 +1583,13 @@ TrimRight( do { Tcl_UniChar ch1; const char *q = trim; - int bytesLeft = numTrim; + int pInc = 0, bytesLeft = numTrim; pp = Tcl_UtfPrev(p, bytes); - (void)TclUtfToUniChar(pp, &ch1); + do { + pp += pInc; + pInc = TclUtfToUniChar(pp, &ch1); + } while (pp + pInc < p); /* Inner loop: scan trim string for match to current character */ do { -- cgit v0.12 From 787517aa7fc113fcd554b0da726b0ba74585812b Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 13:51:50 +0000 Subject: TclTrimLeft and TclTrimRight are internal routines. They demand NUL-terminated arguments. That's a reasonable burden to put on internal callers, and all existing callers already meet it. --- generic/tclUtil.c | 94 +++++++++++++++---------------------------------------- 1 file changed, 25 insertions(+), 69 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index d40cbeb..b6eebdc 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1566,8 +1566,8 @@ UtfWellFormedEnd( *---------------------------------------------------------------------- */ -static inline int -TrimRight( +int +TclTrimRight( const char *bytes, /* String to be trimmed... */ int numBytes, /* ...and its length in bytes */ /* Calls to TclUtfToUniChar() in this routine @@ -1579,6 +1579,11 @@ TrimRight( { const char *pp, *p = bytes + numBytes; + /* Empty strings -> nothing to do */ + if ((numBytes == 0) || (numTrim == 0)) { + return 0; + } + /* Outer loop: iterate over string to be trimmed */ do { Tcl_UniChar ch1; @@ -1613,37 +1618,6 @@ TrimRight( return numBytes - (p - bytes); } - -int -TclTrimRight( - const char *bytes, /* String to be trimmed... */ - int numBytes, /* ...and its length in bytes */ - const char *trim, /* String of trim characters... */ - int numTrim) /* ...and its length in bytes */ -{ - int res; - Tcl_DString bytesBuf, trimBuf; - - /* Empty strings -> nothing to do */ - if ((numBytes == 0) || (numTrim == 0)) { - return 0; - } - - Tcl_DStringInit(&bytesBuf); - Tcl_DStringInit(&trimBuf); - bytes = UtfWellFormedEnd(&bytesBuf, bytes, numBytes); - trim = UtfWellFormedEnd(&trimBuf, trim, numTrim); - - res = TrimRight(bytes, numBytes, trim, numTrim); - if (res > numBytes) { - res = numBytes; - } - - Tcl_DStringFree(&bytesBuf); - Tcl_DStringFree(&trimBuf); - - return res; -} /* *---------------------------------------------------------------------- @@ -1662,15 +1636,24 @@ TclTrimRight( *---------------------------------------------------------------------- */ -static inline int -TrimLeft( +int +TclTrimLeft( const char *bytes, /* String to be trimmed... */ int numBytes, /* ...and its length in bytes */ + /* Calls to TclUtfToUniChar() in this routine + * rely on (bytes[numBytes] == '\0'). */ const char *trim, /* String of trim characters... */ int numTrim) /* ...and its length in bytes */ + /* Calls to TclUtfToUniChar() in this routine + * rely on (trim[numTrim] == '\0'). */ { const char *p = bytes; + /* Empty strings -> nothing to do */ + if ((numBytes == 0) || (numTrim == 0)) { + return 0; + } + /* Outer loop: iterate over string to be trimmed */ do { Tcl_UniChar ch1; @@ -1702,37 +1685,6 @@ TrimLeft( return p - bytes; } - -int -TclTrimLeft( - const char *bytes, /* String to be trimmed... */ - int numBytes, /* ...and its length in bytes */ - const char *trim, /* String of trim characters... */ - int numTrim) /* ...and its length in bytes */ -{ - int res; - Tcl_DString bytesBuf, trimBuf; - - /* Empty strings -> nothing to do */ - if ((numBytes == 0) || (numTrim == 0)) { - return 0; - } - - Tcl_DStringInit(&bytesBuf); - Tcl_DStringInit(&trimBuf); - bytes = UtfWellFormedEnd(&bytesBuf, bytes, numBytes); - trim = UtfWellFormedEnd(&trimBuf, trim, numTrim); - - res = TrimLeft(bytes, numBytes, trim, numTrim); - if (res > numBytes) { - res = numBytes; - } - - Tcl_DStringFree(&bytesBuf); - Tcl_DStringFree(&trimBuf); - - return res; -} /* *---------------------------------------------------------------------- @@ -1754,9 +1706,13 @@ int TclTrim( const char *bytes, /* String to be trimmed... */ int numBytes, /* ...and its length in bytes */ + /* Calls in this routine + * rely on (bytes[numBytes] == '\0'). */ const char *trim, /* String of trim characters... */ int numTrim, /* ...and its length in bytes */ - int *trimRight) /* Offset from the end of the string. */ + /* Calls in this routine + * rely on (trim[numTrim] == '\0'). */ + int *trimRight) /* Offset from the end of the string. */ { int trimLeft; Tcl_DString bytesBuf, trimBuf; @@ -1772,7 +1728,7 @@ TclTrim( bytes = UtfWellFormedEnd(&bytesBuf, bytes, numBytes); trim = UtfWellFormedEnd(&trimBuf, trim, numTrim); - trimLeft = TrimLeft(bytes, numBytes, trim, numTrim); + trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim); if (trimLeft > numBytes) { trimLeft = numBytes; } @@ -1780,7 +1736,7 @@ TclTrim( /* have to trim yet (first char was already verified within TrimLeft) */ if (numBytes > 1) { bytes += trimLeft; - *trimRight = TrimRight(bytes, numBytes, trim, numTrim); + *trimRight = TclTrimRight(bytes, numBytes, trim, numTrim); if (*trimRight > numBytes) { *trimRight = numBytes; } -- cgit v0.12 From 5ca0fff784edcbd2b7eabe15c8aa7ed414ca12f2 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 14:21:09 +0000 Subject: TclTrim() can also demand NUL-terminated arguments, and be simplified. --- generic/tclUtil.c | 85 ++++++++++++++----------------------------------------- 1 file changed, 21 insertions(+), 64 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index b6eebdc..82ef9b7 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1516,42 +1516,6 @@ Tcl_Backslash( /* *---------------------------------------------------------------------- * - * UtfWellFormedEnd -- - * Checks the end of utf string is malformed, if yes - wraps bytes - * to the given buffer (as well-formed NTS string). The buffer - * argument should be initialized by the caller and ready to use. - * - * Results: - * The bytes with well-formed end of the string. - * - * Side effects: - * Buffer (DString) may be allocated, so must be released. - * - *---------------------------------------------------------------------- - */ - -static inline const char* -UtfWellFormedEnd( - Tcl_DString *buffer, /* Buffer used to hold well-formed string. */ - CONST char *bytes, /* Pointer to the beginning of the string. */ - int length) /* Length of the string. */ -{ - CONST char *l = bytes + length; - CONST char *p = Tcl_UtfPrev(l, bytes); - - if (Tcl_UtfCharComplete(p, l - p)) { - return bytes; - } - /* - * Malformed utf-8 end, be sure we've NTS to safe compare of end-character, - * avoid segfault by access violation out of range. - */ - Tcl_DStringAppend(buffer, bytes, length); - return Tcl_DStringValue(buffer); -} -/* - *---------------------------------------------------------------------- - * * TclTrimRight -- * Takes two counted strings in the Tcl encoding. Conceptually * finds the sub string (offset) to trim from the right side of the @@ -1712,39 +1676,32 @@ TclTrim( int numTrim, /* ...and its length in bytes */ /* Calls in this routine * rely on (trim[numTrim] == '\0'). */ - int *trimRight) /* Offset from the end of the string. */ + int *trimRightPtr) /* Offset from the end of the string. */ { - int trimLeft; - Tcl_DString bytesBuf, trimBuf; + int trimLeft = 0, trimRight = 0; - *trimRight = 0; /* Empty strings -> nothing to do */ - if ((numBytes == 0) || (numTrim == 0)) { - return 0; - } - - Tcl_DStringInit(&bytesBuf); - Tcl_DStringInit(&trimBuf); - bytes = UtfWellFormedEnd(&bytesBuf, bytes, numBytes); - trim = UtfWellFormedEnd(&trimBuf, trim, numTrim); - - trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim); - if (trimLeft > numBytes) { - trimLeft = numBytes; - } - numBytes -= trimLeft; - /* have to trim yet (first char was already verified within TrimLeft) */ - if (numBytes > 1) { - bytes += trimLeft; - *trimRight = TclTrimRight(bytes, numBytes, trim, numTrim); - if (*trimRight > numBytes) { - *trimRight = numBytes; + if ((numBytes > 0) && (numTrim > 0)) { + + /* When bytes is NUL-terminated, returns 0 <= trimLeft <= numBytes */ + trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim); + numBytes -= trimLeft; + + /* If we did not trim the whole string, it starts with a character + * that we will not trim. Skip over it. */ + if (numBytes > 0) { + const char *first = bytes + trimLeft; + bytes = Tcl_UtfNext(first); + numBytes -= (bytes - first); + + if (numBytes > 0) { + /* When bytes is NUL-terminated, returns + * 0 <= trimRight <= numBytes */ + trimRight = TclTrimRight(bytes, numBytes, trim, numTrim); + } } } - - Tcl_DStringFree(&bytesBuf); - Tcl_DStringFree(&trimBuf); - + *trimRightPtr = trimRight; return trimLeft; } -- cgit v0.12 From ea5d755488be5c353d266e5ef9666e9e13457f17 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 14:30:49 +0000 Subject: A NUL byte cannot be mistaken for a trail byte. --- generic/tclUtil.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 15b67b9..82ef9b7 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1625,11 +1625,6 @@ TclTrimLeft( const char *q = trim; int bytesLeft = numTrim; - /* take care about real length of char, e.g. if TclUtfToUniChar would - * mistakenly consider NTS 0-byte as a continuation of invalid utf-8 - * sequence, bug [c61818e4c9] */ - if (pInc > numBytes) {pInc = numBytes;} - /* Inner loop: scan trim string for match to current character */ do { Tcl_UniChar ch2; -- cgit v0.12 From 9b529e05399279d3631a9740e3b61bb19a1af5d8 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 15:01:38 +0000 Subject: Known bug test string-22.14 is not so hard to fix. --- generic/tclCmdMZ.c | 14 ++++++++++++-- tests/string.test | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d4fa4e9..6515d98 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2396,12 +2396,22 @@ StringStartCmd( cur = 0; if (index > 0) { p = Tcl_UtfAtIndex(string, index); + + TclUtfToUniChar(p, &ch); for (cur = index; cur >= 0; cur--) { - TclUtfToUniChar(p, &ch); + int delta = 0; + const char *next; + if (!Tcl_UniCharIsWordChar(ch)) { break; } - p = Tcl_UtfPrev(p, string); + + next = Tcl_UtfPrev(p, string); + do { + next += delta; + delta = TclUtfToUniChar(next, &ch); + } while (next + delta < p); + p = next; } if (cur != index) { cur += 1; diff --git a/tests/string.test b/tests/string.test index 05a0623..54b9c95 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1570,7 +1570,7 @@ test string-22.12 {string wordstart, unicode} { test string-22.13 {string wordstart, unicode} { string wordstart "\uc700\uc700 abc" 8 } 3 -test string-22.14 {string wordstart, invalid UTF-8} knownBug { +test string-22.14 {string wordstart, invalid UTF-8} { # See Bug c61818e4c9 set demo [bytestring "abc def\xE0\xA9ghi"] string index $demo [string wordstart $demo 10] -- cgit v0.12 From 7110b5a18c09fdf57b9da18e5a8cf60ee43b1430 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 16:44:07 +0000 Subject: New test demonstrating Tcl_StringCaseMatch is botched in its use of Tcl_UtfPrev. It doesn't consider multi-byte characters at all, let alone anything malformed. Good thing the call isn't needed at all. --- tests/util.test | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/util.test b/tests/util.test index 46d9152..85c06dd 100644 --- a/tests/util.test +++ b/tests/util.test @@ -378,6 +378,10 @@ test util-5.50 {Tcl_StringMatch} { test util-5.51 {Tcl_StringMatch} { Wrapper_Tcl_StringMatch "" "" } 1 +test util-5.52 {Tcl_StringMatch} { + Wrapper_Tcl_StringMatch \[a\u0000 a\x80 +} 0 + test util-6.1 {Tcl_PrintDouble - using tcl_precision} -setup { set old_precision $::tcl_precision -- cgit v0.12 From ff424481242afe72a43040e49520fcfebe00e6c1 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 16:51:34 +0000 Subject: [a7f685a181] Eliminate botched call of Tcl_UtfPrev. --- generic/tclUtil.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 82ef9b7..3dd9a32 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1980,7 +1980,6 @@ Tcl_StringCaseMatch( int nocase) /* 0 for case sensitive, 1 for insensitive */ { int p, charLen; - CONST char *pstart = pattern; Tcl_UniChar ch1, ch2; while (1) { @@ -2145,10 +2144,13 @@ Tcl_StringCaseMatch( break; } } + /* If we reach here, we matched. Need to move past closing ] */ while (*pattern != ']') { if (*pattern == '\0') { - pattern = Tcl_UtfPrev(pattern, pstart); - break; + /* We ran out of pattern after matching something in + * (unclosed!) brackets. So long as we ran out of string + * at the same time, we have a match. Otherwise, not. */ + return (*str == '\0'); } pattern++; } -- cgit v0.12 From 01fa998afeaf983e50cf0ab93936a53250a0fa4c Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 16:57:15 +0000 Subject: Repair tests to expect the right thing. --- tests/utf.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 9ce2b64..c2191c2 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -170,13 +170,13 @@ test utf-7.10.2 {Tcl_UtfPrev} testutfprev { } 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0 -} 2 +} 1 test utf-7.11.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 3 } 1 test utf-7.11.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xF8\xA0 3 -} 2 +} 1 test utf-7.12 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0 } 1 -- cgit v0.12 From 4214a568d4ce47e17c79050848b51f51b2ffb8df Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 18:42:09 +0000 Subject: Make the comments describing Tcl_UtfPrev more complete and precise. --- generic/tclUtf.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 1a8d515..fbdba4c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -654,15 +654,43 @@ Tcl_UtfNext( * * Tcl_UtfPrev -- * - * Given a pointer to some current location in a UTF-8 string, move - * backwards one character. This works correctly when the pointer is in - * the middle of a UTF-8 character. + * The aim of this routine is to provide a way to move backward + * through a UTF-8 string. The caller is expected to pass non-NULL + * pointer arguments start and src. start points to the beginning + * of a string, and src >= start points to a location within (or just + * past the end) of the string. This routine always returns a + * pointer within the string (>= start). When (src == start), it + * returns start. When (src > start), it returns a pointer (< src) + * and (>= src - TCL_UTF_MAX). Subject to these constraints, the + * routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at start and + * that character might include the byte src[-1]. The routine will + * examine only those bytes in the range that might be returned. + * It will not examine the byte *src, and because of that cannot + * determine for certain in all circumstances whether the character + * that begins with the returned pointer will or will not include + * the byte src[-1]. In the scenario, where src points to the end of + * a buffer being filled, the returned pointer point to either the + * final complete character in the string or to the earliest byte + * that might start an incomplete character waiting for more bytes to + * complete. + * + * Because this routine always returns a value < src until the point + * it is forced to return start, it is useful as a backward iterator + * through a string that will always make progress and always be + * prevented from running past the beginning of the string. + * + * In a string where all characters are complete and properly formed, + * and the value of src points to the first byte of a character, + * repeated Tcl_UtfPrev calls will step to the starting bytes of + * characters, one character at a time. Within those limitations, + * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot + * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and + * the caller will have to take greater care. * * Results: - * The return value is a pointer to the previous character in the UTF-8 - * string. If the current location was already at the beginning of the - * string, the return value will also be a pointer to the beginning of - * the string. + * A pointer to the start of a character in the string as described + * above. * * Side effects: * None. @@ -672,9 +700,8 @@ Tcl_UtfNext( CONST char * Tcl_UtfPrev( - CONST char *src, /* The current location in the string. */ - CONST char *start) /* Pointer to the beginning of the string, to - * avoid going backwards too far. */ + CONST char *src, /* A location in a UTF-8 string. */ + CONST char *start) /* Pointer to the beginning of the string */ { CONST char *look; int i, byte; -- cgit v0.12 From d7f2c3aaa409a6493a80b5ae7cfdc391babcd6c5 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 19:39:39 +0000 Subject: Improve the precision of the Tcl_UtfPrev documentation. --- doc/Utf.3 | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index 5361f32..87d1318 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -223,13 +223,27 @@ string. The caller must not ask for the next character after the last character in the string if the string is not terminated by a null character. .PP -Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a -null byte immediately following such a string), \fBTcl_UtfPrev\fR -returns a pointer to the closest preceding byte that starts a UTF-8 -character. -This function will not back up to a position before \fIstart\fR, -the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the -return value will be \fIstart\fR. +\fBTcl_UtfPrev\fR is used to step backward through but not beyond the +UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made +up entirely of complete and well-formed characters, and \fIsrc\fR points +to the lead byte of one of those characters (or to the location one byte +past the end of the string), then repeated calls of \fBTcl_UtfPrev\fR will +return pointers to the lead bytes of each character in the string, one +character at a time, terminating when it returns \fIstart\fR. +.PP +When the conditions of completeness and well-formedness may not be satisfied, +a more precise description of the function of \fBTcl_UtfPrev\fR is necessary. +It always returns a pointer greater than or equal to \fIstart\fR; that is, +always a pointer to a location in the string. It always returns a pointer to +a byte that begins a character when scanning for characters beginning +from \fIstart\fR. When \fIsrc\fR is greater than \fIstart\fR, it +always returns a pointer less than \fIsrc\fR and greater than or +equal to (\fIsrc\fR - \fBTCL_UTF_MAX\fR). The character that begins +at the returned pointer is the first one that either includes the +byte \fIsrc[-1]\fR, or might include it if the right trail bytes are +present at \fIsrc\fR and greater. \fBTcl_UtfPrev\fR never reads the +byte \fIsrc[0]\fR nor the byte \fIstart[-1]\fR nor the byte +\fIsrc[-\fBTCL_UTF_MAX\fI-1]\fR. .PP \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the Pascal Ord() function. It returns the Tcl_UniChar represented at the -- cgit v0.12 From b7b6ab2719c37927b6585d21880c791860df7b46 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 21:18:21 +0000 Subject: test numbering --- tests/string.test | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/string.test b/tests/string.test index 54b9c95..2fc719b 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1459,7 +1459,10 @@ test string-20.4 {string trimright} { test string-20.5 {string trimright} { string trimright "" } {} -test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as continuation char), bug [c61818e4c9]} { +test string-20.6 {string trimright, unicode default} { + # Reserve test number for Tcl 8.6 (TIP 413) +} {} +test string-20.7 {string trim on not valid utf-8 sequence (consider NTS as continuation char), bug [c61818e4c9]} { set result {} set a [bytestring \xc0\x80\x88] set b foo$a @@ -1472,7 +1475,7 @@ test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as conti lappend result [string map $m [string trim $b fox]] lappend result [string map $m [string trim $b fo\u0000]] } [list {*}[lrepeat 3 fooUV] {*}[lrepeat 2 UV V]] -test string-20.7 {[c61818e4c9] [string trimright] fails when UtfPrev is ok} { +test string-20.8 {[c61818e4c9] [string trimright] fails when UtfPrev is ok} { set result {} set a [bytestring \xE8\x80] set b foo$a -- cgit v0.12