From 1aea640959f4dd7ae9922e1e80099f08d62c6684 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 7 Apr 2020 18:27:20 +0000 Subject: closes regression in string trimright [c61818e4c9] without modifying of Tcl_UtfPrev (so certain inconsistency by Tcl_UtfPrev/TclUtfToUniChar still remains) --- generic/tclUtil.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 874e2a5..cf0bdaf 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1573,8 +1573,7 @@ TrimRight( const char *trim, /* String of trim characters... */ int numTrim) /* ...and its length in bytes */ { - const char *p = bytes + numBytes; - int pInc; + const char *pp, *p = bytes + numBytes; /* Outer loop: iterate over string to be trimmed */ do { @@ -1582,8 +1581,8 @@ TrimRight( const char *q = trim; int bytesLeft = numTrim; - p = Tcl_UtfPrev(p, bytes); - pInc = TclUtfToUniChar(p, &ch1); + pp = Tcl_UtfPrev(p, bytes); + (void)TclUtfToUniChar(pp, &ch1); /* Inner loop: scan trim string for match to current character */ do { @@ -1600,9 +1599,9 @@ TrimRight( if (bytesLeft == 0) { /* No match; trim task done; *p is last non-trimmed char */ - p += pInc; break; } + p = pp; } while (p > bytes); return numBytes - (p - bytes); -- cgit v0.12 From 38179215a43e8ba972f4f6baebf2aef347682b53 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 7 Apr 2020 19:36:54 +0000 Subject: Set of tests demonstrating flaws in Tcl_UtfPrev (as viewed through a fragile implementation of [string trimright]). See ticket [c61818e4c9]. 
--- tests/utf.test | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index e8ee374..5d67b36 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -95,7 +95,74 @@ test utf-6.1 {Tcl_UtfNext} { } {} test utf-7.1 {Tcl_UtfPrev} { + string trimright {} X } {} +test utf-7.2 {Tcl_UtfPrev} { + string trimright A X +} A +test utf-7.3 {Tcl_UtfPrev} { + string trimright AA X +} AA +test utf-7.4 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8] X +} [bytestring A\xF8] +test utf-7.5 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4] X +} [bytestring A\xF4] +test utf-7.6 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8] X +} [bytestring A\xE8] +test utf-7.7 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0] X +} [bytestring A\xD0] +test utf-7.8 {Tcl_UtfPrev} { + string trimright [bytestring A\xA0] X +} [bytestring A\xA0] +test utf-7.9 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8\xA0] X +} [bytestring A\xF8\xA0] +test utf-7.10 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0] X +} [bytestring A\xF4\xA0] +test utf-7.11 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8\xA0] X +} [bytestring A\xE8\xA0] +test utf-7.12 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0\xA0] X +} [bytestring A\xD0\xA0] +test utf-7.13 {Tcl_UtfPrev} { + string trimright [bytestring A\xA0\xA0] X +} [bytestring A\xA0\xA0] +test utf-7.14 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8\xA0\xA0] X +} [bytestring A\xF8\xA0\xA0] +test utf-7.15 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0\xA0] X +} [bytestring A\xF4\xA0\xA0] +test utf-7.16 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8\xA0\xA0] X +} [bytestring A\xE8\xA0\xA0] +test utf-7.17 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0\xA0\xA0] X +} [bytestring A\xD0\xA0\xA0] +test utf-7.18 {Tcl_UtfPrev} { + string trimright [bytestring A\xA0\xA0\xA0] X +} [bytestring A\xA0\xA0\xA0] +test utf-7.19 {Tcl_UtfPrev} 
{ + string trimright [bytestring A\xF8\xA0\xA0\xA0] X +} [bytestring A\xF8\xA0\xA0\xA0] +test utf-7.20 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0\xA0\xA0] X +} [bytestring A\xF4\xA0\xA0\xA0] +test utf-7.21 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8\xA0\xA0\xA0] X +} [bytestring A\xE8\xA0\xA0\xA0] +test utf-7.22 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0\xA0\xA0\xA0] X +} [bytestring A\xD0\xA0\xA0\xA0] +test utf-7.23 {Tcl_UtfPrev} { + string trimright [bytestring A\xA0\xA0\xA0\xA0] X +} [bytestring A\xA0\xA0\xA0\xA0] test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From e3ec61b93b66246ae5cf63706bdd4a89fc9f0876 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 7 Apr 2020 19:52:15 +0000 Subject: More tests that should continue to demo faults in Tcl_UtfPrev after [string trimright] implementation is improved. --- tests/utf.test | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 5d67b36..de529a6 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -164,6 +164,29 @@ test utf-7.23 {Tcl_UtfPrev} { string trimright [bytestring A\xA0\xA0\xA0\xA0] X } [bytestring A\xA0\xA0\xA0\xA0] +test utf-7.24 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8\xA0] [bytestring \xF8] +} [bytestring A\xF8\xA0] +test utf-7.25 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0] [bytestring \xF4] +} [bytestring A\xF4\xA0] +test utf-7.26 {Tcl_UtfPrev} { + string trimright [bytestring A\xE8\xA0] [bytestring \xE8] +} [bytestring A\xE8\xA0] +test utf-7.27 {Tcl_UtfPrev} { + string trimright [bytestring A\xF8\xA0\xA0] [bytestring \xF8] +} [bytestring A\xF8\xA0\xA0] +test utf-7.28 {Tcl_UtfPrev} { + string trimright [bytestring A\xF4\xA0\xA0] [bytestring \xF4] +} [bytestring A\xF4\xA0\xA0] +test utf-7.29 {Tcl_UtfPrev} { + string trimright [bytestring A\xD0\xA0\xA0] [bytestring \xD0] +} [bytestring A\xD0\xA0\xA0] + +test utf-7.30 {Tcl_UtfPrev} { + string trimright 
[bytestring A\xC0\x80\xA0] \u0000 +} [bytestring A\xC0\x80\xA0] + test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 } {a} -- cgit v0.12 From 8bdc1b8e328ecf025cade82185e8d44fdf35a559 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 7 Apr 2020 20:04:25 +0000 Subject: added test case covering [c61818e4c9] - string trim for not valid utf-8 sequence (mistakenly considers NTS-zero char as a continuation of utf-8 pair) --- tests/string.test | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/string.test b/tests/string.test index 9a5e0c0..343ccb5 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1459,6 +1459,23 @@ test string-20.4 {string trimright} { test string-20.5 {string trimright} { string trimright "" } {} +test string-20.6 {string trim on not valid utf-8 sequence (consider NTS as continuation char), bug [c61818e4c9]} -setup { + interp alias {} bytes {} encoding convertfrom identity +} -body { + set result {} + set a [bytes \xc0\x80\x88] + set b foo$a + set m [list \u0000 U \x88 V [bytes \x88] W] + lappend result [string map $m $b] + lappend result [string map $m [string trimright $b x]] + lappend result [string map $m [string trimright $b \u0000]] + lappend result [string map $m [string trimleft $b fox]] + lappend result [string map $m [string trimleft $b fo\u0000]] + lappend result [string map $m [string trim $b fox]] + lappend result [string map $m [string trim $b fo\u0000]] +} -result [list {*}[lrepeat 3 fooUV] {*}[lrepeat 2 UV V]] -cleanup { + interp alias {} bytes {} +} test string-21.1 {string wordend} { list [catch {string wordend a} msg] $msg -- cgit v0.12 From 2227dd53ffef41928d6beedcde35df43cb31bf82 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 7 Apr 2020 20:05:24 +0000 Subject: fixes [c61818e4c9] for all variants of string trim --- generic/tclUtil.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index cf0bdaf..cb5072b 100644 --- 
a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1582,14 +1582,17 @@ TrimRight( int bytesLeft = numTrim; pp = Tcl_UtfPrev(p, bytes); - (void)TclUtfToUniChar(pp, &ch1); + (void)TclUtfToUniChar(pp, &ch1); /* Inner loop: scan trim string for match to current character */ do { Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - if (ch1 == ch2) { + /* compare chars and real length of char, e.g. if TclUtfToUniChar + * mistakenly considers NTS 0-byte as a continuation of invalid utf-8 + * sequence, bug [c61818e4c9] */ + if (ch1 == ch2 && p - pp == qInc) { break; } @@ -1671,12 +1674,17 @@ TrimLeft( const char *q = trim; int bytesLeft = numTrim; + /* take care about real length of char, e.g. if TclUtfToUniChar would + * mistakenly consider NTS 0-byte as a continuation of invalid utf-8 + * sequence, bug [c61818e4c9] */ + if (pInc > numBytes) {pInc = numBytes;} + /* Inner loop: scan trim string for match to current character */ do { Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - if (ch1 == ch2) { + if (ch1 == ch2 && pInc == qInc) { break; } -- cgit v0.12 From fe515177dd3f500c04c593db04baa6a8735ecd3b Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 7 Apr 2020 21:06:58 +0000 Subject: New testing command so we can directly demonstrate flaws. 
--- generic/tclTest.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/generic/tclTest.c b/generic/tclTest.c index 66b2233..bfed72e 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -433,6 +433,7 @@ static int SimpleMatchInDirectory( Tcl_Interp *interp, Tcl_Obj *resultPtr, Tcl_Obj *dirPtr, const char *pattern, Tcl_GlobTypeData *types); +static Tcl_ObjCmdProc TestUtfPrevCmd; static int TestNumUtfCharsCmd(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); @@ -690,6 +691,8 @@ Tcltest_Init( (ClientData) 0, NULL); Tcl_CreateObjCommand(interp, "testsetobjerrorcode", TestsetobjerrorcodeCmd, (ClientData) 0, NULL); + Tcl_CreateObjCommand(interp, "testutfprev", + TestUtfPrevCmd, (ClientData) 0, NULL); Tcl_CreateObjCommand(interp, "testnumutfchars", TestNumUtfCharsCmd, (ClientData) 0, NULL); Tcl_CreateCommand(interp, "testsetplatform", TestsetplatformCmd, @@ -7094,6 +7097,51 @@ SimpleListVolumes(void) } /* + * Used to check operations of Tcl_UtfPrev. 
+ * + * Usage: testutfprev $bytes $offset + */ + +static int +TestUtfPrevCmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *const objv[]) +{ + int numBytes, offset; + char *bytes; + const char *result; + Tcl_Obj *copy; + + if (objc != 3) { + Tcl_WrongNumArgs(interp, 1, objv, "bytes offset"); + return TCL_ERROR; + } + + bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); + + if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { + return TCL_ERROR; + } + if (offset < 0) { + offset = 0; + } + if (offset > numBytes) { + offset = numBytes; + } + copy = Tcl_DuplicateObj(objv[1]); + bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); + bytes[numBytes] = '\0'; + + result = Tcl_UtfPrev(bytes + offset, bytes); + + Tcl_DecrRefCount(copy); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); + return TCL_OK; +} + +/* * Used to check correct string-length determining in Tcl_NumUtfChars */ -- cgit v0.12 From bf7064c9adf77f3184dc5efcaa49e8d05da728cf Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 7 Apr 2020 21:49:02 +0000 Subject: Convert the tests to use the testing command. 
--- generic/tclTest.c | 22 +++--- tests/utf.test | 208 ++++++++++++++++++++++++++++++------------------------ 2 files changed, 129 insertions(+), 101 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index bfed72e..506cef9 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -7114,20 +7114,24 @@ TestUtfPrevCmd( const char *result; Tcl_Obj *copy; - if (objc != 3) { - Tcl_WrongNumArgs(interp, 1, objv, "bytes offset"); + if (objc < 2 || objc > 3) { + Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?"); return TCL_ERROR; } bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); - if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { - return TCL_ERROR; - } - if (offset < 0) { - offset = 0; - } - if (offset > numBytes) { + if (objc == 3) { + if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { + return TCL_ERROR; + } + if (offset < 0) { + offset = 0; + } + if (offset > numBytes) { + offset = numBytes; + } + } else { offset = numBytes; } copy = Tcl_DuplicateObj(objv[1]); diff --git a/tests/utf.test b/tests/utf.test index de529a6..7fe0b4e 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -94,98 +94,122 @@ test utf-5.1 {Tcl_UtfFindFirsts} { test utf-6.1 {Tcl_UtfNext} { } {} -test utf-7.1 {Tcl_UtfPrev} { - string trimright {} X -} {} -test utf-7.2 {Tcl_UtfPrev} { - string trimright A X -} A -test utf-7.3 {Tcl_UtfPrev} { - string trimright AA X -} AA -test utf-7.4 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8] X -} [bytestring A\xF8] -test utf-7.5 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4] X -} [bytestring A\xF4] -test utf-7.6 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8] X -} [bytestring A\xE8] -test utf-7.7 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0] X -} [bytestring A\xD0] -test utf-7.8 {Tcl_UtfPrev} { - string trimright [bytestring A\xA0] X -} [bytestring A\xA0] -test utf-7.9 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0] X -} [bytestring A\xF8\xA0] -test utf-7.10 {Tcl_UtfPrev} { - 
string trimright [bytestring A\xF4\xA0] X -} [bytestring A\xF4\xA0] -test utf-7.11 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8\xA0] X -} [bytestring A\xE8\xA0] -test utf-7.12 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0\xA0] X -} [bytestring A\xD0\xA0] -test utf-7.13 {Tcl_UtfPrev} { - string trimright [bytestring A\xA0\xA0] X -} [bytestring A\xA0\xA0] -test utf-7.14 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0\xA0] X -} [bytestring A\xF8\xA0\xA0] -test utf-7.15 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4\xA0\xA0] X -} [bytestring A\xF4\xA0\xA0] -test utf-7.16 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8\xA0\xA0] X -} [bytestring A\xE8\xA0\xA0] -test utf-7.17 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0\xA0\xA0] X -} [bytestring A\xD0\xA0\xA0] -test utf-7.18 {Tcl_UtfPrev} { - string trimright [bytestring A\xA0\xA0\xA0] X -} [bytestring A\xA0\xA0\xA0] -test utf-7.19 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0\xA0\xA0] X -} [bytestring A\xF8\xA0\xA0\xA0] -test utf-7.20 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4\xA0\xA0\xA0] X -} [bytestring A\xF4\xA0\xA0\xA0] -test utf-7.21 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8\xA0\xA0\xA0] X -} [bytestring A\xE8\xA0\xA0\xA0] -test utf-7.22 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0\xA0\xA0\xA0] X -} [bytestring A\xD0\xA0\xA0\xA0] -test utf-7.23 {Tcl_UtfPrev} { - string trimright [bytestring A\xA0\xA0\xA0\xA0] X -} [bytestring A\xA0\xA0\xA0\xA0] - -test utf-7.24 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0] [bytestring \xF8] -} [bytestring A\xF8\xA0] -test utf-7.25 {Tcl_UtfPrev} { - string trimright [bytestring A\xF4\xA0] [bytestring \xF4] -} [bytestring A\xF4\xA0] -test utf-7.26 {Tcl_UtfPrev} { - string trimright [bytestring A\xE8\xA0] [bytestring \xE8] -} [bytestring A\xE8\xA0] -test utf-7.27 {Tcl_UtfPrev} { - string trimright [bytestring A\xF8\xA0\xA0] [bytestring \xF8] -} [bytestring A\xF8\xA0\xA0] -test utf-7.28 
{Tcl_UtfPrev} { - string trimright [bytestring A\xF4\xA0\xA0] [bytestring \xF4] -} [bytestring A\xF4\xA0\xA0] -test utf-7.29 {Tcl_UtfPrev} { - string trimright [bytestring A\xD0\xA0\xA0] [bytestring \xD0] -} [bytestring A\xD0\xA0\xA0] - -test utf-7.30 {Tcl_UtfPrev} { - string trimright [bytestring A\xC0\x80\xA0] \u0000 -} [bytestring A\xC0\x80\xA0] +testConstraint testutfprev [llength [info commands testutfprev]] + +test utf-7.1 {Tcl_UtfPrev} testutfprev { + testutfprev {} +} 0 +test utf-7.2 {Tcl_UtfPrev} testutfprev { + testutfprev A +} 0 +test utf-7.3 {Tcl_UtfPrev} testutfprev { + testutfprev AA +} 1 +test utf-7.4 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8 +} 1 +test utf-7.4.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 2 +} 1 +test utf-7.5 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4 +} 1 +test utf-7.5.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 2 +} 1 +test utf-7.6 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8 +} 1 +test utf-7.6.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 2 +} 1 +test utf-7.7 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0 +} 1 +test utf-7.7.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 2 +} 1 +test utf-7.8 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0 +} 1 +test utf-7.8.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 2 +} 1 +test utf-7.9 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0 +} 2 +test utf-7.9.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 3 +} 2 +test utf-7.10 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0 +} 2 +test utf-7.10.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 3 +} 2 +test utf-7.11 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0 +} 2 +test utf-7.11.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 3 +} 1 +test utf-7.12 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0 +} 1 +test utf-7.12.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 3 +} 1 +test utf-7.13 
{Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0 +} 2 +test utf-7.13.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 3 +} 2 +test utf-7.14 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0 +} 3 +test utf-7.14.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 4 +} 3 +test utf-7.15 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0 +} 3 +test utf-7.15.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 4 +} 3 +test utf-7.16 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0 +} 1 +test utf-7.16.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 4 +} 1 +test utf-7.17 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0 +} 3 +test utf-7.17.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 4 +} 3 +test utf-7.18 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0 +} 3 +test utf-7.18.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 4 +} 3 +test utf-7.19 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 +} 4 +test utf-7.20 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 +} 4 +test utf-7.21 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 +} 4 +test utf-7.22 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 +} 4 +test utf-7.23 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 +} 4 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From ac03c44f432374514af20b60a1aac369b9147c10 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 8 Apr 2020 14:09:48 +0000 Subject: more tests --- tests/utf.test | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 7fe0b4e..9ce2b64 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -111,90 +111,135 @@ test utf-7.4 {Tcl_UtfPrev} testutfprev { test utf-7.4.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 2 } 1 +test utf-7.4.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xF8\xA0\xA0 
2 +} 1 test utf-7.5 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4 } 1 test utf-7.5.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0\xA0\xA0 2 } 1 +test utf-7.5.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xF8\xA0\xA0 2 +} 1 test utf-7.6 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8 } 1 test utf-7.6.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 2 } 1 +test utf-7.6.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xF8\xA0\xA0 2 +} 1 test utf-7.7 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0 } 1 test utf-7.7.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0\xA0\xA0 2 } 1 +test utf-7.7.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xF8\xA0\xA0 2 +} 1 test utf-7.8 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0 } 1 test utf-7.8.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0\xA0\xA0 2 } 1 +test utf-7.8.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xF8\xA0\xA0 2 +} 1 test utf-7.9 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0 } 2 test utf-7.9.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 3 } 2 +test utf-7.9.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xF8\xA0 3 +} 2 test utf-7.10 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0 } 2 test utf-7.10.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0\xA0\xA0 3 } 2 +test utf-7.10.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xF8\xA0 3 +} 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0 } 2 test utf-7.11.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 3 } 1 +test utf-7.11.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xF8\xA0 3 +} 2 test utf-7.12 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0 } 1 test utf-7.12.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0\xA0\xA0 3 } 1 +test utf-7.12.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xF8\xA0 3 +} 1 test utf-7.13 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0 } 2 test utf-7.13.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0\xA0\xA0 3 } 2 +test utf-7.13.2 
{Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xF8\xA0 3 +} 2 test utf-7.14 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0 } 3 test utf-7.14.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 4 } 3 +test utf-7.14.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xF8 4 +} 3 test utf-7.15 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0\xA0 } 3 test utf-7.15.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xF4\xA0\xA0\xA0 4 } 3 +test utf-7.15.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xF8 4 +} 3 test utf-7.16 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0 } 1 test utf-7.16.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 4 } 1 +test utf-7.16.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xF8 4 +} 1 test utf-7.17 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0\xA0 } 3 test utf-7.17.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0\xA0\xA0 4 } 3 +test utf-7.17.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xF8 4 +} 3 test utf-7.18 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0\xA0 } 3 test utf-7.18.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0\xA0\xA0 4 } 3 +test utf-7.18.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xF8 4 +} 3 test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 -- cgit v0.12 From ac0e8526f0e0d8b60502c6a92ed6e4b06c9ebd02 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 8 Apr 2020 16:43:09 +0000 Subject: Restore the original Tcl_UtfPrev routine. Fails a different set of tests. Many fewer. 
--- generic/tclUtf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 7d3db57..b66a2eb 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -693,6 +693,9 @@ Tcl_UtfPrev( break; } if (byte >= 0xC0) { + if (totalBytes[byte] != i + 1) { + break; + } return look; } look--; -- cgit v0.12 From a0961e94367316cf621eec486300b53b8411bd47 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 8 Apr 2020 17:31:58 +0000 Subject: Apply better bug fix that does not create new bugs this time. --- generic/tclUtf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b66a2eb..b7e8f5e 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -693,7 +693,7 @@ Tcl_UtfPrev( break; } if (byte >= 0xC0) { - if (totalBytes[byte] != i + 1) { + if (totalBytes[byte] <= i) { break; } return look; -- cgit v0.12 From 41a89c36a8f7b088c02c032fa5f61056dfd1f383 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 8 Apr 2020 18:07:07 +0000 Subject: Cherry pick the [string trim] changes. --- generic/tclUtil.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index cb5072b..f4879a1 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1589,10 +1589,7 @@ TrimRight( Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - /* compare chars and real length of char, e.g. 
if TclUtfToUniChar - * mistakenly considers NTS 0-byte as a continuation of invalid utf-8 - * sequence, bug [c61818e4c9] */ - if (ch1 == ch2 && p - pp == qInc) { + if (ch1 == ch2) { break; } @@ -1604,8 +1601,7 @@ TrimRight( /* No match; trim task done; *p is last non-trimmed char */ break; } - p = pp; - } while (p > bytes); + } while ((p = pp) > bytes); return numBytes - (p - bytes); } @@ -1684,7 +1680,7 @@ TrimLeft( Tcl_UniChar ch2; int qInc = TclUtfToUniChar(q, &ch2); - if (ch1 == ch2 && pInc == qInc) { + if (ch1 == ch2) { break; } -- cgit v0.12 From ea5d755488be5c353d266e5ef9666e9e13457f17 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 14:30:49 +0000 Subject: A NUL byte cannot be mistaken for a trail byte. --- generic/tclUtil.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 15b67b9..82ef9b7 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1625,11 +1625,6 @@ TclTrimLeft( const char *q = trim; int bytesLeft = numTrim; - /* take care about real length of char, e.g. if TclUtfToUniChar would - * mistakenly consider NTS 0-byte as a continuation of invalid utf-8 - * sequence, bug [c61818e4c9] */ - if (pInc > numBytes) {pInc = numBytes;} - /* Inner loop: scan trim string for match to current character */ do { Tcl_UniChar ch2; -- cgit v0.12 From 01fa998afeaf983e50cf0ab93936a53250a0fa4c Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 16:57:15 +0000 Subject: Repair tests to expect the right thing. 
--- tests/utf.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 9ce2b64..c2191c2 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -170,13 +170,13 @@ test utf-7.10.2 {Tcl_UtfPrev} testutfprev { } 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0 -} 2 +} 1 test utf-7.11.1 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 3 } 1 test utf-7.11.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xF8\xA0 3 -} 2 +} 1 test utf-7.12 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0 } 1 -- cgit v0.12 From 4214a568d4ce47e17c79050848b51f51b2ffb8df Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 18:42:09 +0000 Subject: Make the comments describing Tcl_UtfPrev more complete and precise. --- generic/tclUtf.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 1a8d515..fbdba4c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -654,15 +654,43 @@ Tcl_UtfNext( * * Tcl_UtfPrev -- * - * Given a pointer to some current location in a UTF-8 string, move - * backwards one character. This works correctly when the pointer is in - * the middle of a UTF-8 character. + * The aim of this routine is to provide a way to move backward + * through a UTF-8 string. The caller is expected to pass non-NULL + * pointer arguments start and src. start points to the beginning + * of a string, and src >= start points to a location within (or just + * past the end) of the string. This routine always returns a + * pointer within the string (>= start). When (src == start), it + * returns start. When (src > start), it returns a pointer (< src) + * and (>= src - TCL_UTF_MAX). Subject to these constraints, the + * routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at start and + * that character might include the byte src[-1]. 
The routine will + * examine only those bytes in the range that might be returned. + * It will not examine the byte *src, and because of that cannot + * determine for certain in all circumstances whether the character + * that begins with the returned pointer will or will not include + * the byte src[-1]. In the scenario where src points to the end of + * a buffer being filled, the returned pointer points to either the + * final complete character in the string or to the earliest byte + * that might start an incomplete character waiting for more bytes to + * complete. + * + * Because this routine always returns a value < src until the point + * it is forced to return start, it is useful as a backward iterator + * through a string that will always make progress and always be + * prevented from running past the beginning of the string. + * + * In a string where all characters are complete and properly formed, + * and the value of src points to the first byte of a character, + * repeated Tcl_UtfPrev calls will step to the starting bytes of + * characters, one character at a time. Within those limitations, + * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot + * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and + * the caller will have to take greater care. * * Results: - * The return value is a pointer to the previous character in the UTF-8 - * string. If the current location was already at the beginning of the - * string, the return value will also be a pointer to the beginning of - * the string. + * A pointer to the start of a character in the string as described + * above. * * Side effects: * None. @@ -672,9 +700,8 @@ Tcl_UtfNext( CONST char * Tcl_UtfPrev( - CONST char *src, /* The current location in the string. */ - CONST char *start) /* Pointer to the beginning of the string, to - * avoid going backwards too far. */ + CONST char *src, /* A location in a UTF-8 string.
*/ + CONST char *start) /* Pointer to the beginning of the string */ { CONST char *look; int i, byte; -- cgit v0.12 From d7f2c3aaa409a6493a80b5ae7cfdc391babcd6c5 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 13 Apr 2020 19:39:39 +0000 Subject: Improve the precision of the Tcl_UtfPrev documentation. --- doc/Utf.3 | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index 5361f32..87d1318 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -223,13 +223,27 @@ string. The caller must not ask for the next character after the last character in the string if the string is not terminated by a null character. .PP -Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a -null byte immediately following such a string), \fBTcl_UtfPrev\fR -returns a pointer to the closest preceding byte that starts a UTF-8 -character. -This function will not back up to a position before \fIstart\fR, -the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the -return value will be \fIstart\fR. +\fBTcl_UtfPrev\fR is used to step backward through but not beyond the +UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made +up entirely of complete and well-formed characters, and \fIsrc\fR points +to the lead byte of one of those characters (or to the location one byte +past the end of the string), then repeated calls of \fBTcl_UtfPrev\fR will +return pointers to the lead bytes of each character in the string, one +character at a time, terminating when it returns \fIstart\fR. +.PP +When the conditions of completeness and well-formedness may not be satisfied, +a more precise description of the function of \fBTcl_UtfPrev\fR is necessary. +It always returns a pointer greater than or equal to \fIstart\fR; that is, +always a pointer to a location in the string. It always returns a pointer to +a byte that begins a character when scanning for characters beginning +from \fIstart\fR. 
When \fIsrc\fR is greater than \fIstart\fR, it +always returns a pointer less than \fIsrc\fR and greater than or +equal to (\fIsrc\fR - \fBTCL_UTF_MAX\fR). The character that begins +at the returned pointer is the first one that either includes the +byte \fIsrc[-1]\fR, or might include it if the right trail bytes are +present at \fIsrc\fR and greater. \fBTcl_UtfPrev\fR never reads the +byte \fIsrc[0]\fR nor the byte \fIstart[-1]\fR nor the byte +\fIsrc[-\fBTCL_UTF_MAX\fI-1]\fR. .PP \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the Pascal Ord() function. It returns the Tcl_UniChar represented at the -- cgit v0.12