From e9b1c88a4142a059a821fdd2f2b02272a53c9151 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 20:03:33 +0000 Subject: The function of Tcl_UtfNext() is to advance a pointer. There's nothing inherent in that task that requires decoding of the characters, but the implementation does that. Let's try a simpler solution for callers that do not need the content decoded. --- generic/tclUtf.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index fbdba4c..a03fa30 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -644,9 +644,19 @@ CONST char * Tcl_UtfNext( CONST char *src) /* The current location in the string. */ { - Tcl_UniChar ch; - - return src + TclUtfToUniChar(src, &ch); + int byte = *((unsigned char *) src); + int left = totalBytes[byte]; + + src++; + while (--left) { + byte = *((unsigned char *) src); + if ((byte & 0xC0) != 0x80) { + /* src points to non-trail byte; return it */ + return src; + } + src++; + } + return src; } /* -- cgit v0.12 From 279c54dbff724a62d6739a9cc71ba31a83325c98 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 20:18:55 +0000 Subject: Create and use an optimized macro TclUtfNext() for Tcl_UtfNext(). --- generic/tclInt.h | 3 +++ generic/tclUtil.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/generic/tclInt.h b/generic/tclInt.h index 15bc000..e92cd18 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3691,6 +3691,9 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, CONST char *file, ((*(chPtr) = (unsigned char) *(str)), 1) \ : Tcl_UtfToUniChar(str, chPtr)) +#define TclUtfNext(src) \ + ((((unsigned char) *(src)) < 0xC0) ? 
src + 1 : Tcl_UtfNext(src)) + /* *---------------------------------------------------------------- * Macro that encapsulates the logic that determines when it is safe to diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 3dd9a32..e87cf83 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1691,7 +1691,7 @@ TclTrim( * that we will not trim. Skip over it. */ if (numBytes > 0) { const char *first = bytes + trimLeft; - bytes = Tcl_UtfNext(first); + bytes = TclUtfNext(first); numBytes -= (bytes - first); if (numBytes > 0) { -- cgit v0.12 From 6df0ccf3997e397b860c47c770ba0fc31a2a9961 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 21:16:43 +0000 Subject: Replace calls of TclUtfToUniChar() with TclUtfNext() when caller has no decoding need. Failing test string-22.14 indicates something is still not quite right. Now that Tcl_NumUtfChars() is not paying decoding prices, we let it spend to properly protect against overflow. [2738427] --- generic/tclCompExpr.c | 5 ++--- generic/tclUtf.c | 19 ++++++------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 27d7503..42321af 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1801,7 +1801,6 @@ ParseLexeme( { const char *end; int scanned; - Tcl_UniChar ch; Tcl_Obj *literal = NULL; unsigned char byte; @@ -1979,12 +1978,12 @@ ParseLexeme( if (!TclIsBareword(*start) || *start == '_') { if (Tcl_UtfCharComplete(start, numBytes)) { - scanned = Tcl_UtfToUniChar(start, &ch); + scanned = TclUtfNext(start) - start; } else { char utfBytes[TCL_UTF_MAX]; memcpy(utfBytes, start, (size_t) numBytes); utfBytes[numBytes] = '\0'; - scanned = Tcl_UtfToUniChar(utfBytes, &ch); + scanned = TclUtfNext(utfBytes) - utfBytes; } *lexemePtr = INVALID; Tcl_DecrRefCount(literal); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index a03fa30..25d52d0 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -504,7 +504,6 @@ Tcl_NumUtfChars( int length) /* 
The length of the string in bytes, or -1 * for strlen(string). */ { - Tcl_UniChar ch; register int i; /* @@ -516,21 +515,20 @@ Tcl_NumUtfChars( i = 0; if (length < 0) { - while (*src != '\0') { - src += TclUtfToUniChar(src, &ch); + while ((*src != '\0') && (i < INT_MAX)) { + src = TclUtfNext(src); i++; } - if (i < 0) i = INT_MAX; /* Bug [2738427] */ } else { register const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { - src += TclUtfToUniChar(src, &ch); + src = TclUtfNext(src); i++; } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src += TclUtfToUniChar(src, &ch); + src = TclUtfNext(src); i++; } if (src < endPtr) { @@ -764,10 +762,7 @@ Tcl_UniCharAtIndex( { Tcl_UniChar ch; - while (index >= 0) { - index--; - src += TclUtfToUniChar(src, &ch); - } + TclUtfToUniChar(Tcl_UtfAtIndex(src, index), &ch); return ch; } @@ -793,11 +788,9 @@ Tcl_UtfAtIndex( register CONST char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { - Tcl_UniChar ch; - while (index > 0) { index--; - src += TclUtfToUniChar(src, &ch); + src = TclUtfNext(src); } return src; } -- cgit v0.12 From e2b7ad1627665c99b128ff5a023e9f772fe467b4 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 21:25:33 +0000 Subject: Fix the bad logic in Tcl_UtfNext(). 
--- generic/tclUtf.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 25d52d0..7dd8598 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -644,17 +644,21 @@ Tcl_UtfNext( { int byte = *((unsigned char *) src); int left = totalBytes[byte]; + const char *next = src + 1; - src++; while (--left) { - byte = *((unsigned char *) src); + byte = *((unsigned char *) next); if ((byte & 0xC0) != 0x80) { - /* src points to non-trail byte; return it */ - return src; + /* + * next points to a non-trail byte; we ran out of trail bytes + * before the needs of the lead bytes were satisfied. + * Let the (malformed) lead byte alone be a character + */ + return src + 1; } - src++; + next++; } - return src; + return next; } /* -- cgit v0.12 From 7aff227882dd8bfaa8972ecaf1e129bb9ef1e6e3 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 21:32:22 +0000 Subject: typo --- generic/tclUtf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 7dd8598..078ecf4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -651,7 +651,7 @@ Tcl_UtfNext( if ((byte & 0xC0) != 0x80) { /* * next points to a non-trail byte; we ran out of trail bytes - * before the needs of the lead bytes were satisfied. + * before the needs of the lead byte were satisfied. * Let the (malformed) lead byte alone be a character */ return src + 1; -- cgit v0.12 From f0f59ae8a31a818d78cb449dc4532762cfb2bb00 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 21:39:53 +0000 Subject: New testing command [testutfnext]. 
--- generic/tclTest.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index 31d3a7f..782b9a2 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -433,6 +433,7 @@ static int SimpleMatchInDirectory( Tcl_Interp *interp, Tcl_Obj *resultPtr, Tcl_Obj *dirPtr, const char *pattern, Tcl_GlobTypeData *types); +static Tcl_ObjCmdProc TestUtfNextCmd; static Tcl_ObjCmdProc TestUtfPrevCmd; static int TestNumUtfCharsCmd(ClientData clientData, Tcl_Interp *interp, int objc, @@ -697,8 +698,10 @@ Tcltest_Init( (ClientData) 0, NULL); Tcl_CreateObjCommand(interp, "testsetobjerrorcode", TestsetobjerrorcodeCmd, NULL, NULL); + Tcl_CreateObjCommand(interp, "testutfnext", + TestUtfNextCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testutfprev", - TestUtfPrevCmd, (ClientData) 0, NULL); + TestUtfPrevCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testnumutfchars", TestNumUtfCharsCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testfindfirst", @@ -7107,6 +7110,52 @@ SimpleListVolumes(void) } /* + * Used to check operations of Tcl_UtfNext. 
+ * + * Usage: testutfnext $bytes $offset + */ + +static int +TestUtfNextCmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *const objv[]) +{ + int numBytes, offset = 0; + char *bytes; + const char *result; + Tcl_Obj *copy; + + if (objc < 2 || objc > 3) { + Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?"); + return TCL_ERROR; + } + + bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); + + if (objc == 3) { + if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { + return TCL_ERROR; + } + if (offset < 0) { + offset = 0; + } + if (offset > numBytes) { + offset = numBytes; + } + } + copy = Tcl_DuplicateObj(objv[1]); + bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); + bytes[numBytes] = '\0'; + + result = Tcl_UtfNext(bytes + offset); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); + + Tcl_DecrRefCount(copy); + return TCL_OK; +} +/* * Used to check operations of Tcl_UtfPrev. * * Usage: testutfprev $bytes $offset @@ -7149,9 +7198,9 @@ TestUtfPrevCmd( bytes[numBytes] = '\0'; result = Tcl_UtfPrev(bytes + offset, bytes); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); Tcl_DecrRefCount(copy); - Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); return TCL_OK; } -- cgit v0.12 From 532ec4fa923534f592e04cc3c5679ce5771c684c Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 15 Apr 2020 14:42:41 +0000 Subject: Collection of coverage tests for Tcl_UtfNext. 
--- tests/utf.test | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 244 insertions(+), 2 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 56ca1b9..b5358cc 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -111,8 +111,250 @@ test utf-5.2 {Tcl_UtfFindLast} testfindlast { testfindlast [bytestring "abcbc"] 98 } {bc} -test utf-6.1 {Tcl_UtfNext} { -} {} +testConstraint testutfnext [llength [info commands testutfnext]] + +test utf-6.1 {Tcl_UtfNext} testutfnext { + # This takes the pointer one past the terminating NUL. + # This is really an invalid call. + testutfnext {} +} 1 +test utf-6.2 {Tcl_UtfNext} testutfnext { + testutfnext A +} 1 +test utf-6.3 {Tcl_UtfNext} testutfnext { + testutfnext AA +} 1 +test utf-6.4 {Tcl_UtfNext} testutfnext { + testutfnext A\xA0 +} 1 +test utf-6.5 {Tcl_UtfNext} testutfnext { + testutfnext A\xD0 +} 1 +test utf-6.6 {Tcl_UtfNext} testutfnext { + testutfnext A\xE8 +} 1 +test utf-6.7 {Tcl_UtfNext} testutfnext { + testutfnext A\xF4 +} 1 +test utf-6.8 {Tcl_UtfNext} testutfnext { + testutfnext A\xF8 +} 1 +test utf-6.9 {Tcl_UtfNext} testutfnext { + testutfnext \xA0 +} 1 +test utf-6.10 {Tcl_UtfNext} testutfnext { + testutfnext \xA0G +} 1 +test utf-6.11 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xA0 +} 1 +test utf-6.12 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xD0 +} 1 +test utf-6.13 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xE8 +} 1 +test utf-6.14 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xF4 +} 1 +test utf-6.15 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xF8 +} 1 +test utf-6.16 {Tcl_UtfNext} testutfnext { + testutfnext \xD0 +} 1 +test utf-6.17 {Tcl_UtfNext} testutfnext { + testutfnext \xD0A +} 1 +test utf-6.18 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0 +} 2 +test utf-6.19 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xD0 +} 1 +test utf-6.20 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xE8 +} 1 +test utf-6.21 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xF4 +} 1 +test 
utf-6.22 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xF8 +} 1 +test utf-6.23 {Tcl_UtfNext} testutfnext { + testutfnext \xE8 +} 1 +test utf-6.24 {Tcl_UtfNext} testutfnext { + testutfnext \xE8A +} 1 +test utf-6.25 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0 +} 1 +test utf-6.26 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xD0 +} 1 +test utf-6.27 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xE8 +} 1 +test utf-6.28 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xF4 +} 1 +test utf-6.29 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xF8 +} 1 +test utf-6.30 {Tcl_UtfNext} testutfnext { + testutfnext \xF4 +} 1 +test utf-6.31 {Tcl_UtfNext} testutfnext { + testutfnext \xF4A +} 1 +test utf-6.32 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0 +} 1 +test utf-6.33 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xD0 +} 1 +test utf-6.34 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xE8 +} 1 +test utf-6.35 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xF4 +} 1 +test utf-6.36 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xF8 +} 1 +test utf-6.37 {Tcl_UtfNext} testutfnext { + testutfnext \xF8 +} 1 +test utf-6.38 {Tcl_UtfNext} testutfnext { + testutfnext \xF8A +} 1 +test utf-6.39 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xA0 +} 1 +test utf-6.40 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xD0 +} 1 +test utf-6.41 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xE8 +} 1 +test utf-6.42 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xF4 +} 1 +test utf-6.43 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xF8 +} 1 +test utf-6.44 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0G +} 2 +test utf-6.45 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xA0 +} 2 +test utf-6.46 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xD0 +} 2 +test utf-6.47 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xE8 +} 2 +test utf-6.48 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xF4 +} 2 +test utf-6.49 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xF8 +} 2 +test utf-6.50 
{Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0G +} 1 +test utf-6.51 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0 +} 3 +test utf-6.52 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xD0 +} 1 +test utf-6.53 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xE8 +} 1 +test utf-6.54 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xF4 +} 1 +test utf-6.55 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xF8 +} 1 +test utf-6.56 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0G +} 1 +test utf-6.57 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0 +} 1 +test utf-6.58 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xD0 +} 1 +test utf-6.59 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xE8 +} 1 +test utf-6.60 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xF4 +} 1 +test utf-6.61 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xF8 +} 1 +test utf-6.62 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0G +} 3 +test utf-6.63 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xA0 +} 3 +test utf-6.64 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xD0 +} 3 +test utf-6.65 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xE8 +} 3 +test utf-6.66 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xF4 +} 3 +test utf-6.67 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xF8 +} 3 +test utf-6.68 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0G +} 1 +test utf-6.69 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0 +} 1 +test utf-6.70 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xD0 +} 1 +test utf-6.71 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xE8 +} 1 +test utf-6.72 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xF4 +} 1 +test utf-6.73 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xF8 +} 1 +test utf-6.74 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0G +} 1 +test utf-6.75 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0\xA0 +} 1 +test utf-6.76 
{Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0\xD0 +} 1 +test utf-6.77 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0\xE8 +} 1 +test utf-6.78 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0\xF4 +} 1 +test utf-6.79 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0G\xF8 +} 1 + + + testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From e2a3a358c95196a7bf142d591fa5ef729b3b0d69 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 15 Apr 2020 16:42:32 +0000 Subject: Add test demonstrating that Tcl_UtfNext accepts overlong byte sequences, which is in conflict with what Tcl_UtfToUniChar does. --- tests/utf.test | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index b5358cc..a930aae 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -352,9 +352,9 @@ test utf-6.78 {Tcl_UtfNext} testutfnext { test utf-6.79 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0G\xF8 } 1 - - - +test utf-6.80 {Tcl_UtfNext - overlong sequences} { + testutfnext \xC0\x81 +} 1 testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From e6faa58e6df3292b2c0735ba4921af4be0e215fa Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 18:40:40 +0000 Subject: More tests and fix for overlong handling in revised Tcl_UtfNext. 
--- generic/tclUtf.c | 3 +++ tests/utf.test | 23 ++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e41e7a5..00ca94e 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -712,6 +712,9 @@ Tcl_UtfNext( } next++; } + if (Overlong(src)) { + return src + 1; + } return next; } diff --git a/tests/utf.test b/tests/utf.test index 72165f9..02b7002 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -352,9 +352,30 @@ test utf-6.78 {Tcl_UtfNext} testutfnext { test utf-6.79 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0G\xF8 } 1 -test utf-6.80 {Tcl_UtfNext - overlong sequences} { +test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xC0\x80 +} 2 +test utf-6.81 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xC0\x81 } 1 +test utf-6.82 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xC1\x80 +} 1 +test utf-6.83 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xC2\x80 +} 2 +test utf-6.84 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xE0\x80\x80 +} 1 +test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xE0\xA0\x80 +} 3 +test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xF0\x80\x80\x80 +} 1 +test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xF0\x90\x80\x80 +} 1 testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From 443928c10f1ac94e6a6adfafb478eb9fa09ac39a Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 18:42:37 +0000 Subject: compiler warning --- generic/tclUtf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 00ca94e..91e9c73 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -712,7 +712,7 @@ Tcl_UtfNext( } next++; } - if (Overlong(src)) { + if (Overlong((unsigned char *)src)) { return src + 1; } return next; -- cgit v0.12 
From 60d5424069845124e51d4032f295d913a17454a1 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 19:02:40 +0000 Subject: More detailed comments. --- generic/tclUtf.c | 70 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 91e9c73..67603af 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -678,13 +678,35 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * Given a pointer to some current location in a UTF-8 string, move - * forward one character. The caller must ensure that they are not asking - * for the next character after the last character in the string. + * The aim of this routine is to provide a way to iterate forward + * through a UTF-8 string. The caller is expected to pass a non-NULL + * pointer argument /src/ which points to a location within a string. + * (*src) will be read, so /src/ must not point to an unreadable + * location past the end of the string. If /src/ points to the + * beginning of a complete, well-formed and valid UTF-8 byte sequence + * of no more than TCL_UTF_MAX bytes, Tcl_UtfNext returns the pointer + * just past the end of that sequence. In any other circumstance, + * Tcl_UtfNext returns /src/+1. + * + * Because this routine always returns a value > /src/, it is useful + * as a forward iterator that will always make progress. If the string + * is NUL-terminated, Tcl_UtfNext will not read beyond the terminating + * NUL character. If it is not NUL-terminated, the caller must make + * use of the companion routine Tcl_UtfCharComplete to test whether + * there is risk that Tcl_UtfNext will read beyond the end of the string. + * Tcl_UtfNext will never read more than TCL_UTF_MAX bytes. + * + * In a string where all characters are complete and properly formed, + * and /src/ points to the first byte of a character, repeated + * Tcl_UtfNext calls will step to the starting bytes of characters, one + * character at a time. 
Within those limitations, Tcl_UtfPrev and + * Tcl_UtfNext are inverses. If either condition cannot be met, + * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the + * caller will have to take greater care. * * Results: - * The return value is the pointer to the next character in the UTF-8 - * string. + * A pointer to the start of the next character in the string (or to + * the end of the string) as described above. * * Side effects: * None. @@ -725,37 +747,37 @@ Tcl_UtfNext( * * The aim of this routine is to provide a way to move backward * through a UTF-8 string. The caller is expected to pass non-NULL - * pointer arguments start and src. start points to the beginning - * of a string, and src >= start points to a location within (or just - * past the end) of the string. This routine always returns a - * pointer within the string (>= start). When (src == start), it - * returns start. When (src > start), it returns a pointer (< src) - * and (>= src - TCL_UTF_MAX). Subject to these constraints, the - * routine returns a pointer to the earliest byte in the string that - * starts a character when characters are read starting at start and + * pointer arguments /start/ and /src/. /start/ points to the beginning + * of a string, and /src/ (>= /start/) points to a location within (or + * just past the end) of the string. This routine always returns a + * pointer within the string (>= /start/). When (/src/ == /start/), + * it returns /start/. When (/src/ > /start/), it returns a pointer + * (< /src/) and (>= /src/ - TCL_UTF_MAX). Subject to these constraints, + * the routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at /start/ and * that character might include the byte src[-1]. The routine will * examine only those bytes in the range that might be returned. 
- * It will not examine the byte *src, and because of that cannot + * It will not examine the byte (*src), and because of that cannot * determine for certain in all circumstances whether the character * that begins with the returned pointer will or will not include - * the byte src[-1]. In the scenario, where src points to the end of - * a buffer being filled, the returned pointer point to either the + * the byte src[-1]. In the scenario where /src/ points to the end of + * a buffer being filled, the returned pointer points to either the * final complete character in the string or to the earliest byte * that might start an incomplete character waiting for more bytes to * complete. * - * Because this routine always returns a value < src until the point - * it is forced to return start, it is useful as a backward iterator + * Because this routine always returns a value < /src/ until the point + * it is forced to return /start/, it is useful as a backward iterator * through a string that will always make progress and always be * prevented from running past the beginning of the string. * * In a string where all characters are complete and properly formed, - * and the value of src points to the first byte of a character, - * repeated Tcl_UtfPrev calls will step to the starting bytes of - * characters, one character at a time. Within those limitations, - * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot - * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and - * the caller will have to take greater care. + * and /src/ points to the first byte of a character, repeated + * Tcl_UtfPrev calls will step to the starting bytes of characters, one + * character at a time. Within those limitations, Tcl_UtfPrev and + * Tcl_UtfNext are inverses. If either condition cannot be met, + * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the + * caller will have to take greater care. 
* * Results: * A pointer to the start of a character in the string as described -- cgit v0.12 From c2479a465e6bef08275d47d0277deda87e6e014e Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 19:04:47 +0000 Subject: delete merge litter --- generic/tclTest.c | 1 - 1 file changed, 1 deletion(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index b8507bf..6e0fbed 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -325,7 +325,6 @@ static Tcl_FSPathInFilesystemProc SimplePathInFilesystem; static Tcl_Obj * SimpleRedirect(Tcl_Obj *pathPtr); static Tcl_FSMatchInDirectoryProc SimpleMatchInDirectory; static Tcl_ObjCmdProc TestUtfNextCmd; -static Tcl_ObjCmdProc TestUtfNextCmd; static Tcl_ObjCmdProc TestUtfPrevCmd; static Tcl_ObjCmdProc TestNumUtfCharsCmd; static Tcl_ObjCmdProc TestFindFirstCmd; -- cgit v0.12 From 66197fff215f60690a444b3f2af67a0c3c87c8af Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 19:39:36 +0000 Subject: Improve the docs for Tcl_UtfNext. --- doc/Utf.3 | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index 87d1318..cb82699 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -217,11 +217,20 @@ returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is considered part of the UTF-8 string. .PP -Given \fIsrc\fR, a pointer to some location in a UTF-8 string, -\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the -string. The caller must not ask for the next character after the last -character in the string if the string is not terminated by a null -character. +\fBTcl_UtfNext\fR is used to step forward through a UTF-8 string. 
+If the UTF-8 string is made up entirely of complete, well-formed, and +valid character byte sequences, and \fIsrc\fR points to the lead byte +of one of those sequences, then repeated calls of \fBTcl_UtfNext\fR will +return pointers to the lead bytes of each character in the string, one +character at a time. In any other circumstance, \fBTcl_UtfNext\fR +returns \fIsrc\fR+1. \fBTcl_UtfNext\fR will always read \fIsrc[0]\fR +and may read as many following bytes (up to a total of \fBTCL_UTF_MAX\fR) +as needed to find the end of the byte sequence. If the string is +\fBNUL\fR-terminated, \fBTcl_UtfNext\fR will not read beyond the terminating +\fBNUL\fR byte. If not, the caller must use the companion routine +\fBTcl_UtfCharComplete\fR to determine whether there is any risk +\fBTcl_UtfNext\fR might read beyond the readable memory occupied +by the string. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made -- cgit v0.12