From 105caf2a2a57f837098c768474183dae2a8ea8b0 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 20:03:33 +0000 Subject: The function of Tcl_UtfNext() is to advance a pointer. There's nothing inherent in that task that requires decoding of the characters, but the implementation does that. Let's try a simpler solution for callers that do not need the content decoded. --- generic/tclUtf.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index fbdba4c..a03fa30 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -644,9 +644,19 @@ CONST char * Tcl_UtfNext( CONST char *src) /* The current location in the string. */ { - Tcl_UniChar ch; - - return src + TclUtfToUniChar(src, &ch); + int byte = *((unsigned char *) src); + int left = totalBytes[byte]; + + src++; + while (--left) { + byte = *((unsigned char *) src); + if ((byte & 0xC0) != 0x80) { + /* src points to non-trail byte; return it */ + return src; + } + src++; + } + return src; } /* -- cgit v0.12 From 15451ee26da26b99995357d4b84f9617ee4ebe92 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 20:18:55 +0000 Subject: Create and use an optimized macro TclUtfNext() for Tcl_UtfNext(). --- generic/tclInt.h | 3 +++ generic/tclUtil.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/generic/tclInt.h b/generic/tclInt.h index 15bc000..e92cd18 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3691,6 +3691,9 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, CONST char *file, ((*(chPtr) = (unsigned char) *(str)), 1) \ : Tcl_UtfToUniChar(str, chPtr)) +#define TclUtfNext(src) \ + ((((unsigned char) *(src)) < 0xC0) ? src + 1 : Tcl_UtfNext(src)) + /* *---------------------------------------------------------------- * Macro that encapsulates the logic that determines when it is safe to diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 3dd9a32..e87cf83 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1691,7 +1691,7 @@ TclTrim( * that we will not trim. Skip over it. */ if (numBytes > 0) { const char *first = bytes + trimLeft; - bytes = Tcl_UtfNext(first); + bytes = TclUtfNext(first); numBytes -= (bytes - first); if (numBytes > 0) { -- cgit v0.12 From fe2553850d8bf3a1345f21f74bfdd220531d9733 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 21:16:43 +0000 Subject: Replace calls of TclUtfToUniChar() with TclUtfNext() when caller has no decoding need. Failing test string-22.14 indicates something is still not quite right. Now that Tcl_NumUtfChars() is not paying decoding prices, we let it spend to properly protect against overflow. [2738427] --- generic/tclCompExpr.c | 5 ++--- generic/tclUtf.c | 19 ++++++------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 27d7503..42321af 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1801,7 +1801,6 @@ ParseLexeme( { const char *end; int scanned; - Tcl_UniChar ch; Tcl_Obj *literal = NULL; unsigned char byte; @@ -1979,12 +1978,12 @@ ParseLexeme( if (!TclIsBareword(*start) || *start == '_') { if (Tcl_UtfCharComplete(start, numBytes)) { - scanned = Tcl_UtfToUniChar(start, &ch); + scanned = TclUtfNext(start) - start; } else { char utfBytes[TCL_UTF_MAX]; memcpy(utfBytes, start, (size_t) numBytes); utfBytes[numBytes] = '\0'; - scanned = Tcl_UtfToUniChar(utfBytes, &ch); + scanned = TclUtfNext(utfBytes) - utfBytes; } *lexemePtr = INVALID; Tcl_DecrRefCount(literal); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index a03fa30..25d52d0 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -504,7 +504,6 @@ Tcl_NumUtfChars( int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { - Tcl_UniChar ch; register int i; /* @@ -516,21 +515,20 @@ Tcl_NumUtfChars( i = 0; if (length < 0) { - while (*src != '\0') { - src += TclUtfToUniChar(src, &ch); + while ((*src != '\0') && (i < INT_MAX)) { + src = TclUtfNext(src); i++; } - if (i < 0) i = INT_MAX; /* Bug [2738427] */ } else { register const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { - src += TclUtfToUniChar(src, &ch); + src = TclUtfNext(src); i++; } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src += TclUtfToUniChar(src, &ch); + src = TclUtfNext(src); i++; } if (src < endPtr) { @@ -764,10 +762,7 @@ Tcl_UniCharAtIndex( { Tcl_UniChar ch; - while (index >= 0) { - index--; - src += TclUtfToUniChar(src, &ch); - } + TclUtfToUniChar(Tcl_UtfAtIndex(src, index), &ch); return ch; } @@ -793,11 +788,9 @@ Tcl_UtfAtIndex( register CONST char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { - Tcl_UniChar ch; - while (index > 0) { index--; - src += TclUtfToUniChar(src, &ch); + src = TclUtfNext(src); } return src; } -- cgit v0.12 From 3159e52862aa48f9ee5a0cf09ebe55216c68151c Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 21:25:33 +0000 Subject: Fix the bad logic in Tcl_UtfNext(). --- generic/tclUtf.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 25d52d0..7dd8598 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -644,17 +644,21 @@ Tcl_UtfNext( { int byte = *((unsigned char *) src); int left = totalBytes[byte]; + const char *next = src + 1; - src++; while (--left) { - byte = *((unsigned char *) src); + byte = *((unsigned char *) next); if ((byte & 0xC0) != 0x80) { - /* src points to non-trail byte; return it */ - return src; + /* + * src points to non-trail byte; We ran out of trail bytes + * before the needs of the lead bytes were satisfied. + * Let the (malformed) lead byte alone be a character + */ + return src + 1; } - src++; + next++; } - return src; + return next; } /* -- cgit v0.12 From 4b0e038a777ae51e825022ceaf1534c0d9bd0085 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 21:32:22 +0000 Subject: typo --- generic/tclUtf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 7dd8598..078ecf4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -651,7 +651,7 @@ Tcl_UtfNext( if ((byte & 0xC0) != 0x80) { /* * src points to non-trail byte; We ran out of trail bytes - * before the needs of the lead bytes were satisfied. + * before the needs of the lead byte were satisfied. * Let the (malformed) lead byte alone be a character */ return src + 1; -- cgit v0.12 From 8204714c967a484eaadc038cec0276a84f907a53 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 14 Apr 2020 21:39:53 +0000 Subject: New testing command [testutfnext]. --- generic/tclTest.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index 31d3a7f..782b9a2 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -433,6 +433,7 @@ static int SimpleMatchInDirectory( Tcl_Interp *interp, Tcl_Obj *resultPtr, Tcl_Obj *dirPtr, const char *pattern, Tcl_GlobTypeData *types); +static Tcl_ObjCmdProc TestUtfNextCmd; static Tcl_ObjCmdProc TestUtfPrevCmd; static int TestNumUtfCharsCmd(ClientData clientData, Tcl_Interp *interp, int objc, @@ -697,8 +698,10 @@ Tcltest_Init( (ClientData) 0, NULL); Tcl_CreateObjCommand(interp, "testsetobjerrorcode", TestsetobjerrorcodeCmd, NULL, NULL); + Tcl_CreateObjCommand(interp, "testutfnext", + TestUtfNextCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testutfprev", - TestUtfPrevCmd, (ClientData) 0, NULL); + TestUtfPrevCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testnumutfchars", TestNumUtfCharsCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testfindfirst", @@ -7107,6 +7110,52 @@ SimpleListVolumes(void) } /* + * Used to check operations of Tcl_UtfNext. + * + * Usage: testutfnext $bytes $offset + */ + +static int +TestUtfNextCmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *const objv[]) +{ + int numBytes, offset = 0; + char *bytes; + const char *result; + Tcl_Obj *copy; + + if (objc < 2 || objc > 3) { + Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?"); + return TCL_ERROR; + } + + bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); + + if (objc == 3) { + if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { + return TCL_ERROR; + } + if (offset < 0) { + offset = 0; + } + if (offset > numBytes) { + offset = numBytes; + } + } + copy = Tcl_DuplicateObj(objv[1]); + bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); + bytes[numBytes] = '\0'; + + result = Tcl_UtfNext(bytes + offset); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); + + Tcl_DecrRefCount(copy); + return TCL_OK; +} +/* * Used to check operations of Tcl_UtfPrev. * * Usage: testutfprev $bytes $offset @@ -7149,9 +7198,9 @@ TestUtfPrevCmd( bytes[numBytes] = '\0'; result = Tcl_UtfPrev(bytes + offset, bytes); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); Tcl_DecrRefCount(copy); - Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); return TCL_OK; } -- cgit v0.12 From 8990d50cc6324f079a60ef75b0f16bb77cdb2889 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 15 Apr 2020 14:42:41 +0000 Subject: Collection of coverage tests for Tcl_UtfNext. --- tests/utf.test | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 244 insertions(+), 2 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 56ca1b9..b5358cc 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -111,8 +111,250 @@ test utf-5.2 {Tcl_UtfFindLast} testfindlast { testfindlast [bytestring "abcbc"] 98 } {bc} -test utf-6.1 {Tcl_UtfNext} { -} {} +testConstraint testutfnext [llength [info commands testutfnext]] + +test utf-6.1 {Tcl_UtfNext} testutfnext { + # This takes the pointer one past the terminating NUL. + # This is really an invalid call. + testutfnext {} +} 1 +test utf-6.2 {Tcl_UtfNext} testutfnext { + testutfnext A +} 1 +test utf-6.3 {Tcl_UtfNext} testutfnext { + testutfnext AA +} 1 +test utf-6.4 {Tcl_UtfNext} testutfnext { + testutfnext A\xA0 +} 1 +test utf-6.5 {Tcl_UtfNext} testutfnext { + testutfnext A\xD0 +} 1 +test utf-6.6 {Tcl_UtfNext} testutfnext { + testutfnext A\xE8 +} 1 +test utf-6.7 {Tcl_UtfNext} testutfnext { + testutfnext A\xF4 +} 1 +test utf-6.8 {Tcl_UtfNext} testutfnext { + testutfnext A\xF8 +} 1 +test utf-6.9 {Tcl_UtfNext} testutfnext { + testutfnext \xA0 +} 1 +test utf-6.10 {Tcl_UtfNext} testutfnext { + testutfnext \xA0G +} 1 +test utf-6.11 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xA0 +} 1 +test utf-6.12 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xD0 +} 1 +test utf-6.13 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xE8 +} 1 +test utf-6.14 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xF4 +} 1 +test utf-6.15 {Tcl_UtfNext} testutfnext { + testutfnext \xA0\xF8 +} 1 +test utf-6.16 {Tcl_UtfNext} testutfnext { + testutfnext \xD0 +} 1 +test utf-6.17 {Tcl_UtfNext} testutfnext { + testutfnext \xD0A +} 1 +test utf-6.18 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0 +} 2 +test utf-6.19 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xD0 +} 1 +test utf-6.20 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xE8 +} 1 +test utf-6.21 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xF4 +} 1 +test utf-6.22 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xF8 +} 1 +test utf-6.23 {Tcl_UtfNext} testutfnext { + testutfnext \xE8 +} 1 +test utf-6.24 {Tcl_UtfNext} testutfnext { + testutfnext \xE8A +} 1 +test utf-6.25 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0 +} 1 +test utf-6.26 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xD0 +} 1 +test utf-6.27 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xE8 +} 1 +test utf-6.28 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xF4 +} 1 +test utf-6.29 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xF8 +} 1 +test utf-6.30 {Tcl_UtfNext} testutfnext { + testutfnext \xF4 +} 1 +test utf-6.31 {Tcl_UtfNext} testutfnext { + testutfnext \xF4A +} 1 +test utf-6.32 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0 +} 1 +test utf-6.33 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xD0 +} 1 +test utf-6.34 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xE8 +} 1 +test utf-6.35 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xF4 +} 1 +test utf-6.36 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xF8 +} 1 +test utf-6.37 {Tcl_UtfNext} testutfnext { + testutfnext \xF8 +} 1 +test utf-6.38 {Tcl_UtfNext} testutfnext { + testutfnext \xF8A +} 1 +test utf-6.39 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xA0 +} 1 +test utf-6.40 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xD0 +} 1 +test utf-6.41 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xE8 +} 1 +test utf-6.42 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xF4 +} 1 +test utf-6.43 {Tcl_UtfNext} testutfnext { + testutfnext \xF8\xF8 +} 1 +test utf-6.44 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0G +} 2 +test utf-6.45 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xA0 +} 2 +test utf-6.46 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xD0 +} 2 +test utf-6.47 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xE8 +} 2 +test utf-6.48 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xF4 +} 2 +test utf-6.49 {Tcl_UtfNext} testutfnext { + testutfnext \xD0\xA0\xF8 +} 2 +test utf-6.50 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0G +} 1 +test utf-6.51 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0 +} 3 +test utf-6.52 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xD0 +} 1 +test utf-6.53 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xE8 +} 1 +test utf-6.54 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xF4 +} 1 +test utf-6.55 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xF8 +} 1 +test utf-6.56 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0G +} 1 +test utf-6.57 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0 +} 1 +test utf-6.58 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xD0 +} 1 +test utf-6.59 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xE8 +} 1 +test utf-6.60 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xF4 +} 1 +test utf-6.61 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xF8 +} 1 +test utf-6.62 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0G +} 3 +test utf-6.63 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xA0 +} 3 +test utf-6.64 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xD0 +} 3 +test utf-6.65 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xE8 +} 3 +test utf-6.66 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xF4 +} 3 +test utf-6.67 {Tcl_UtfNext} testutfnext { + testutfnext \xE8\xA0\xA0\xF8 +} 3 +test utf-6.68 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0G +} 1 +test utf-6.69 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0 +} 1 +test utf-6.70 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xD0 +} 1 +test utf-6.71 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xE8 +} 1 +test utf-6.71 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xF4 +} 1 +test utf-6.73 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xF8 +} 1 +test utf-6.74 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0G +} 1 +test utf-6.75 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0\xA0 +} 1 +test utf-6.76 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0\xD0 +} 1 +test utf-6.77 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0\xE8 +} 1 +test utf-6.78 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0\xF4 +} 1 +test utf-6.79 {Tcl_UtfNext} testutfnext { + testutfnext \xF4\xA0\xA0\xA0G\xF8 +} 1 + + + testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From b6f85fcdc4a91b0addc291d2bb85fae07b94c24c Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 15 Apr 2020 16:42:32 +0000 Subject: Add test demonstrating that Tcl_UtfNext accepts overlong byte sequences, which is in conflict with what Tcl_UtfToUniChar does. --- tests/utf.test | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index b5358cc..a930aae 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -352,9 +352,9 @@ test utf-6.78 {Tcl_UtfNext} testutfnext { test utf-6.79 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0G\xF8 } 1 - - - +test utf-6.80 {Tcl_UtfNext - overlong sequences} { + testutfnext \xC0\x81 +} 1 testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From 31331dec813313d1fe547ea6d70b7c2639e682a9 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 15 Apr 2020 17:05:52 +0000 Subject: New tests checking Tcl_UtfPrev handling of overlong sequences. Bug demonstrated. --- tests/utf.test | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 56ca1b9..9982f81 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -275,6 +275,33 @@ test utf-7.22 {Tcl_UtfPrev} testutfprev { test utf-7.23 {Tcl_UtfPrev} testutfprev { testutfprev A\xA0\xA0\xA0\xA0 } 4 +test utf-7.24 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xC0\x81 +} 2 +test utf-7.25 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xC0\x81 2 +} 1 +test utf-7.26 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xE0\x80\x80 +} 3 +test utf-7.27 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xE0\x80\x80 3 +} 2 +test utf-7.28 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xE0\x80\x80 2 +} 1 +test utf-7.29 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xF0\x80\x80\x80 +} 4 +test utf-7.30 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xF0\x80\x80\x80 4 +} 3 +test utf-7.31 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xF0\x80\x80\x80 3 +} 2 +test utf-7.32 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xF0\x80\x80\x80 2 +} 1 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From eb7e63e9e7f8fe9b15e48b78127cab9e67e864cc Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 15 Apr 2020 21:48:57 +0000 Subject: Rework Tcl_UtfPrev so it properly handles overlong sequences. --- generic/tclUtf.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 113 insertions(+), 19 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index fbdba4c..a50a7fc 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -703,31 +703,125 @@ Tcl_UtfPrev( CONST char *src, /* A location in a UTF-8 string. */ CONST char *start) /* Pointer to the beginning of the string */ { - CONST char *look; - int i, byte; - - src--; - look = src; - for (i = 0; i < TCL_UTF_MAX; i++) { - if (look < start) { - if (src < start) { - src = start; - } - break; - } - byte = *((unsigned char *) look); + int trailBytesSeen = 0; /* How many trail bytes have been verified? */ + CONST char *fallback = src - 1; + /* If we cannot find a lead byte that might + * start a prefix of a valid UTF byte sequence, + * we will fallback to a one-byte back step */ + unsigned char *look = (unsigned char *)fallback; + /* Start search at the fallback position */ + + /* Quick boundary case exit. */ + if (fallback <= start) { + return start; + } + + do { + unsigned char byte = look[0]; + if (byte < 0x80) { - break; + /* + * Single byte character. Either this is a correct previous + * character, or it is followed by at least one trail byte + * which indicates a malformed sequence. In either case the + * correct result is to return the fallback. + */ + return fallback; } if (byte >= 0xC0) { - if (totalBytes[byte] <= i) { - break; + /* Non-trail byte; May be multibyte lead. */ + + if ((trailBytesSeen == 0) + /* + * We've seen no trailing context to use to check + * anything. From what we know, this non-trail byte + * is a prefix of a previous character, and accepting + * it (the fallback) is correct. + */ + + || (trailBytesSeen >= totalBytes[byte])) { + /* + * That is, (1 + trailBytesSeen > needed). + * We've examined more bytes than needed to complete + * this lead byte. No matter about well-formedness or + * validity, the sequence starting with this lead byte + * will never include the fallback location, so we must + * return the fallback location. + * + * EXAMPLE: bytes = "ab\C0\x80\x81def"; + * Tcl_UtfPrev(bytes+5, bytes); + * + * When we get here, look == bytes+2, trailBytesSeen == 2, + * needed = 2, and we need to return bytes+4 that points to + * the malformed \x81. + */ + return fallback; } - return look; + + /* + * trailBytesSeen > 0, so we can examine look[1] safely. + * Use that capability to screen out overlong sequences. + */ + + switch (byte) { + case 0xC0: + if (look[1] == 0x80) { + /* Valid sequence: \xC0\x80 for \u0000 */ + return (CONST char *)look; + } + /* Reject overlong: \xC0\x81 - \xC0\xBF */ + return fallback; + case 0xC1: + /* Reject overlong: \xC1\x80 - \xC1\xBF */ + return fallback; + case 0xE0: + if (look[1] < 0xA0) { + /* Reject overlong: \xE0\x80\x80 - \xE0\x9F\xBF */ + return fallback; + } + /* Valid sequence: \xE0\xA0\x80 for \u0800 , etc. */ + return (CONST char *)look; +#if TCL_UTF_MAX > 3 + case 0xF0: + if (look[1] < 0x90) { + /* Reject overlong: \xF0\x80\x80\x80 - \xF0\x8F\xBF\xBF */ + return fallback; + } + /* Valid sequence: \xF0\x90\x80\x80 for \U10000 , etc. */ + return (CONST char *)look; +#endif + default: + /* All other lead bytes lead only valid sequences */ + return (CONST char *)look; + } + } + + /* We saw a trail byte. */ + trailBytesSeen++; + + if ((CONST char *)look == start) { + /* + * Do not read before the start of the string + * + * If we get here, we've examined bytes at every location + * >= start and < src and all of them are trail bytes, + * including (*start). We need to return our fallback + * and exit this loop before we run past the start of the string. + */ + return fallback; } + + /* Continue the search backwards... */ look--; - } - return src; + } while (trailBytesSeen < TCL_UTF_MAX); + + /* + * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a + * properly formed byte sequence to find, and we can stop looking, + * accepting the fallback. + */ + + return fallback; } /* -- cgit v0.12 From 9e732c5d031fffda316458c11263a1f77d630eb8 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 15 Apr 2020 22:14:33 +0000 Subject: More test coverage --- tests/utf.test | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 9982f81..62b85a7 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -302,6 +302,48 @@ test utf-7.31 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.32 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xF0\x80\x80\x80 2 } 1 +test utf-7.33 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xC0\x80 +} 1 +test utf-7.34 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xC1\x80 +} 2 +test utf-7.35 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xC2\x80 +} 1 +test utf-7.36 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xE0\xA0\x80 +} 1 +test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xE0\xA0\x80 3 +} 1 +test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xE0\xA0\x80 2 +} 1 +test utf-7.39 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xF0\x90\x80\x80 +} 4 +test utf-7.40 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xF0\x90\x80\x80 4 +} 3 +test utf-7.41 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xF0\x90\x80\x80 3 +} 2 +test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xF0\x90\x80\x80 2 +} 1 +test utf-7.43 {Tcl_UtfPrev -- no lead byte at start} testutfprev { + testutfprev \xA0 +} 0 +test utf-7.44 {Tcl_UtfPrev -- no lead byte at start} testutfprev { + testutfprev \xA0\xA0 +} 1 +test utf-7.45 {Tcl_UtfPrev -- no lead byte at start} testutfprev { + testutfprev \xA0\xA0\xA0 +} 2 +test utf-7.46 {Tcl_UtfPrev -- no lead byte at start} testutfprev { + testutfprev \xA0\xA0\xA0\xA0 +} 3 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From 912da02553120c9c19cad6852d9960a86b3887af Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 15 Apr 2020 22:28:18 +0000 Subject: Use test existence to shorten comment. --- generic/tclUtf.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index a50a7fc..6003b75 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -746,14 +746,7 @@ Tcl_UtfPrev( * this lead byte. No matter about well-formedness or * validity, the sequence starting with this lead byte * will never include the fallback location, so we must - * return the fallback location. - * - * EXAMPLE: bytes = "ab\C0\x80\x81def"; - * Tcl_UtfPrev(bytes+5, bytes); - * - * When we get here, look == bytes+2, trailBytesSeen == 2, - * needed = 2, and we need to return bytes+4 that points to - * the malformed \x81. + * return the fallback location. See test utf-7.17 */ return fallback; } -- cgit v0.12 From c0f6190785d84fd0e859738c2d89b78f16a07f11 Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 15 Apr 2020 22:39:53 +0000 Subject: Refactor the Overlong test into a utility routine. --- generic/tclUtf.c | 85 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 6003b75..f3b2097 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -81,6 +81,7 @@ static CONST unsigned char totalBytes[256] = { */ static int UtfCount(int ch); +static int Overlong(unsigned char *src); /* *--------------------------------------------------------------------------- @@ -115,7 +116,59 @@ UtfCount( #endif return 3; } + +/* + *--------------------------------------------------------------------------- + * + * Overlong -- + * + * Utility routine to report whether /src/ points to the start of an + * overlong byte sequence that should be rejected. + * + * Results: + * A boolean. + *--------------------------------------------------------------------------- + */ +INLINE static int +Overlong( + unsigned char *src) /* Points to lead byte of a UTF-8 byte + * sequence. Caller guarantees it is safe + * to read src[0] and src[1]. */ +{ + switch (*src) { + case 0xC0: + if (src[1] == 0x80) { + /* Valid sequence: \xC0\x80 for \u0000 */ + return 0; + } + /* Reject overlong: \xC0\x81 - \xC0\xBF */ + return 1; + case 0xC1: + /* Reject overlong: \xC1\x80 - \xC1\xBF */ + return 1; + case 0xE0: + if (src[1] < 0xA0) { + /* Reject overlong: \xE0\x80\x80 - \xE0\x9F\xBF */ + return 1; + } + /* Valid sequence: \xE0\xA0\x80 for \u0800 , etc. */ + return 0; +#if TCL_UTF_MAX > 3 + case 0xF0: + if (src[1] < 0x90) { + /* Reject overlong: \xF0\x80\x80\x80 - \xF0\x8F\xBF\xBF */ + return 1; + } + /* Valid sequence: \xF0\x90\x80\x80 for \U10000 , etc. */ + return 0 +#endif + default: + /* All other lead bytes lead only valid sequences */ + return 0; + } +} + /* *--------------------------------------------------------------------------- * @@ -756,37 +809,11 @@ Tcl_UtfPrev( * Use that capability to screen out overlong sequences. */ - switch (byte) { - case 0xC0: - if (look[1] == 0x80) { - /* Valid sequence: \xC0\x80 for \u0000 */ - return (CONST char *)look; - } - /* Reject overlong: \xC0\x81 - \xC0\xBF */ + if (Overlong(look)) { + /* Reject */ return fallback; - case 0xC1: - /* Reject overlong: \xC1\x80 - \xC1\xBF */ - return fallback; - case 0xE0: - if (look[1] < 0xA0) { - /* Reject overlong: \xE0\x80\x80 - \xE0\x9F\xBF */ - return fallback; - } - /* Valid sequence: \xE0\xA0\x80 for \u0800 , etc. */ - return (CONST char *)look; -#if TCL_UTF_MAX > 3 - case 0xF0: - if (look[1] < 0x90) { - /* Reject overlong: \xF0\x80\x80\x80 - \xF0\x8F\xBF\xBF */ - return fallback; - } - /* Valid sequence: \xF0\x90\x80\x80 for \U10000 , etc. */ - return (CONST char *)look; -#endif - default: - /* All other lead bytes lead only valid sequences */ - return (CONST char *)look; } + return (CONST char *)look; } /* We saw a trail byte. */ -- cgit v0.12 From 2ce8119bcc38c78455f02cf8f267e337c7e5a28f Mon Sep 17 00:00:00 2001 From: sebres Date: Thu, 16 Apr 2020 12:55:47 +0000 Subject: zlib, *BO*: fixes possible segfault (or buffer overrun), for instance if limit (max 65K) is set larger as allocated buffer for inflate-input (default 4K) --- generic/tclZlib.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/generic/tclZlib.c b/generic/tclZlib.c index 002c6ae..ed29ff9 100644 --- a/generic/tclZlib.c +++ b/generic/tclZlib.c @@ -3027,7 +3027,9 @@ ZlibTransformInput( * reading over the border. */ - readBytes = Tcl_ReadRaw(cd->parent, cd->inBuffer, cd->readAheadLimit); + readBytes = Tcl_ReadRaw(cd->parent, cd->inBuffer, + cd->readAheadLimit <= cd->inAllocated ? + cd->readAheadLimit : cd->inAllocated); /* * Three cases here: -- cgit v0.12 From 7aae7e9b7ac102276776013979bd4fad9393efa5 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 16:22:34 +0000 Subject: When we reject overlong sequences, \xC1 is as invalid a lead byte as \xFF. --- generic/tclUtf.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 5ba8ff5..556a5a2 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -66,7 +66,7 @@ static CONST unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, #if TCL_UTF_MAX > 3 4,4,4,4,4, @@ -123,7 +123,9 @@ UtfCount( * Overlong -- * * Utility routine to report whether /src/ points to the start of an - * overlong byte sequence that should be rejected. + * overlong byte sequence that should be rejected. Caller guarantees + * /src/ points to a byte that can lead a multi-byte sequence, and + * that src[0] and src[1] are readable. * * Results: * A boolean. @@ -132,9 +134,7 @@ UtfCount( INLINE static int Overlong( - unsigned char *src) /* Points to lead byte of a UTF-8 byte - * sequence. Caller guarantees it is safe - * to read src[0] and src[1]. */ + unsigned char *src) /* Points to lead byte of a UTF-8 byte sequence */ { switch (*src) { case 0xC0: @@ -144,9 +144,6 @@ Overlong( } /* Reject overlong: \xC0\x81 - \xC0\xBF */ return 1; - case 0xC1: - /* Reject overlong: \xC1\x80 - \xC1\xBF */ - return 1; case 0xE0: if (src[1] < 0xA0) { /* Reject overlong: \xE0\x80\x80 - \xE0\x9F\xBF */ -- cgit v0.12 From e0169e273905ed02706471f65ae31f2950e4c3aa Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 17:30:29 +0000 Subject: Convert Overlong() to use a lookup table. --- generic/tclUtf.c | 50 +++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 556a5a2..f8d89f8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -124,46 +124,50 @@ UtfCount( * * Utility routine to report whether /src/ points to the start of an * overlong byte sequence that should be rejected. Caller guarantees - * /src/ points to a byte that can lead a multi-byte sequence, and - * that src[0] and src[1] are readable. + * that src[0] and src[1] are readable, and + * + * (src[0] >= 0xC0) && (src[0] != 0xC1) + * (src[1] >= 0x80) && (src[1] < 0xC0) + * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF8 : 0xF0)) * * Results: * A boolean. *--------------------------------------------------------------------------- */ +static CONST unsigned char overlong[3] = { + 0x80, /* \xD0 -- all sequences valid */ + 0xA0, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ +#if TCL_UTF_MAX > 3 + 0x90 /* \xF0\x80 through \xF0\x8F are invalid prefixes */ +#else + 0xC0 /* Not used, but reject all again for safety. */ +#endif +}; + INLINE static int Overlong( unsigned char *src) /* Points to lead byte of a UTF-8 byte sequence */ { - switch (*src) { - case 0xC0: + unsigned char byte = *src; + + if (byte % 0x10) { + /* Only lead bytes 0xC0, 0xE0, 0xF0 need examination */ + return 0; + } + if (byte == 0xC0) { if (src[1] == 0x80) { /* Valid sequence: \xC0\x80 for \u0000 */ return 0; } /* Reject overlong: \xC0\x81 - \xC0\xBF */ return 1; - case 0xE0: - if (src[1] < 0xA0) { - /* Reject overlong: \xE0\x80\x80 - \xE0\x9F\xBF */ - return 1; - } - /* Valid sequence: \xE0\xA0\x80 for \u0800 , etc. */ - return 0; -#if TCL_UTF_MAX > 3 - case 0xF0: - if (src[1] < 0x90) { - /* Reject overlong: \xF0\x80\x80\x80 - \xF0\x8F\xBF\xBF */ - return 1; - } - /* Valid sequence: \xF0\x90\x80\x80 for \U10000 , etc. */ - return 0 -#endif - default: - /* All other lead bytes lead only valid sequences */ - return 0; } + if (src[1] < overlong[(byte >> 4) - 0x0D]) { + /* Reject overlong */ + return 1; + } + return 0; } /* -- cgit v0.12 From 624b4722c445f0891fad9350fa4b412e547485ab Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 18:14:55 +0000 Subject: Create and use macro TclUtfPrev for Tcl_UtfPrev. --- generic/tclCmdMZ.c | 2 +- generic/tclInt.h | 5 +++++ generic/tclStringObj.c | 6 +++--- generic/tclUtil.c | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 6515d98..f6bdd3e 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2406,7 +2406,7 @@ StringStartCmd( break; } - next = Tcl_UtfPrev(p, string); + next = TclUtfPrev(p, string); do { next += delta; delta = TclUtfToUniChar(next, &ch); diff --git a/generic/tclInt.h b/generic/tclInt.h index 15bc000..3dc3d1d 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3691,6 +3691,11 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, CONST char *file, ((*(chPtr) = (unsigned char) *(str)), 1) \ : Tcl_UtfToUniChar(str, chPtr)) +#define TclUtfPrev(src, start) \ + (((src) < (start)+2) ? (start) : \ + ((unsigned char) *(src - 1)) < 0x80 ? (src)-1 : \ + Tcl_UtfPrev(src, start)) + /* *---------------------------------------------------------------- * Macro that encapsulates the logic that determines when it is safe to diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index c3c85dc..d62fc72 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1139,10 +1139,10 @@ Tcl_AppendLimitedToObj( } eLen = strlen(ellipsis); while (eLen > limit) { - eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis; + eLen = TclUtfPrev(ellipsis+eLen, ellipsis) - ellipsis; } - toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes; + toCopy = TclUtfPrev(bytes+limit+1-eLen, bytes) - bytes; } /* @@ -2585,7 +2585,7 @@ AppendPrintfToObjVA( * multi-byte characters. */ - q = Tcl_UtfPrev(end, bytes); + q = TclUtfPrev(end, bytes); if (!Tcl_UtfCharComplete(q, (int)(end - q))) { end = q; } diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 3dd9a32..50922bf 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1554,7 +1554,7 @@ TclTrimRight( const char *q = trim; int pInc = 0, bytesLeft = numTrim; - pp = Tcl_UtfPrev(p, bytes); + pp = TclUtfPrev(p, bytes); do { pp += pInc; pInc = TclUtfToUniChar(pp, &ch1); -- cgit v0.12 From 85cfd71803a9da1d4ce180b8b0b6446a4c22cd43 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 18:40:40 +0000 Subject: More tests and fix for overlong handling in revised Tcl_UtfNext. --- generic/tclUtf.c | 3 +++ tests/utf.test | 23 ++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e41e7a5..00ca94e 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -712,6 +712,9 @@ Tcl_UtfNext( } next++; } + if (Overlong(src)) { + return src + 1; + } return next; } diff --git a/tests/utf.test b/tests/utf.test index 72165f9..02b7002 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -352,9 +352,30 @@ test utf-6.78 {Tcl_UtfNext} testutfnext { test utf-6.79 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0G\xF8 } 1 -test utf-6.80 {Tcl_UtfNext - overlong sequences} { +test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xC0\x80 +} 2 +test utf-6.81 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xC0\x81 } 1 +test utf-6.82 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xC1\x80 +} 1 +test utf-6.83 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xC2\x80 +} 2 +test utf-6.84 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xE0\x80\x80 +} 1 +test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xE0\xA0\x80 +} 3 +test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xF0\x80\x80\x80 +} 1 +test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { + testutfnext \xF0\x90\x80\x80 +} 1 testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From 356efe2f403bf94f25e9399847b21ca07ad129f9 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 18:42:37 +0000 Subject: compiler warning --- generic/tclUtf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 00ca94e..91e9c73 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -712,7 +712,7 @@ Tcl_UtfNext( } next++; } - if (Overlong(src)) { + if (Overlong((unsigned char *)src)) { return src + 1; } return next; -- cgit v0.12 From c7b97db095f7b803062b07caddb7922c2815dee9 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 19:02:40 +0000 Subject: More detailed comments. --- generic/tclUtf.c | 70 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 91e9c73..67603af 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -678,13 +678,35 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * Given a pointer to some current location in a UTF-8 string, move - * forward one character. The caller must ensure that they are not asking - * for the next character after the last character in the string. + * The aim of this routine is to provide a way to iterate forward + * through a UTF-8 string. The caller is expected to pass a non-NULL + * pointer argument /src/ which points to a location within a string. + * (*src) will be read, so /src/ must not point to an unreadable + * location past the end of the string. If /src/ points to the + * beginning of a complete, well-formed and valid UTF_8 byte sequence + * of no more than TCL_UTF_MAX bytes, Tcl_UtfNext returns the pointer + * just past the end of that sequence. In any other circumstance, + * Tcl_UtfNext returns /src/+1. + * + * Because this routine always returns a value > /src/, it is useful + * as a forward iterator that will always make progress. If the string + * is NUL-terminated, Tcl_UtfNext will not read beyond the terminating + * NUL character. If it is not NUL-terminated, the caller must make + * use of the companion routine Tcl_UtfCharComplete to test whether + * there is risk that Tcl_UtfNext will read beyond the end of the string. + * Tcl_UtfNext will never read more than TCL_UTF_MAX bytes. + * + * In a string where all characters are complete and properly formed, + * and /src/ points to the first byte of a character, repeated + * Tcl_UtfNext calls will step to the starting bytes of characters, one + * character at a time. Within those limitations, Tcl_UtfPrev and + * Tcl_UtfNext are inverses. If either condition cannot be met, + * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the + * caller will have to take greater care. * * Results: - * The return value is the pointer to the next character in the UTF-8 - * string. + * A pointer to the start of the next character in the string (or to + * the end of the string) as described above. * * Side effects: * None. @@ -725,37 +747,37 @@ Tcl_UtfNext( * * The aim of this routine is to provide a way to move backward * through a UTF-8 string. The caller is expected to pass non-NULL - * pointer arguments start and src. start points to the beginning - * of a string, and src >= start points to a location within (or just - * past the end) of the string. This routine always returns a - * pointer within the string (>= start). When (src == start), it - * returns start. When (src > start), it returns a pointer (< src) - * and (>= src - TCL_UTF_MAX). Subject to these constraints, the - * routine returns a pointer to the earliest byte in the string that - * starts a character when characters are read starting at start and + * pointer arguments /start/ and /src/. /start/ points to the beginning + * of a string, and /src/ (>= /start/) points to a location within (or + * just past the end) of the string. This routine always returns a + * pointer within the string (>= /start/). When (/src/ == /start/), + * it returns /start/. When (/src/ > /start/), it returns a pointer + * (< /src/) and (>= /src/ - TCL_UTF_MAX). Subject to these constraints, + * the routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at /start/ and * that character might include the byte src[-1]. The routine will * examine only those bytes in the range that might be returned. - * It will not examine the byte *src, and because of that cannot + * It will not examine the byte (*src), and because of that cannot * determine for certain in all circumstances whether the character * that begins with the returned pointer will or will not include - * the byte src[-1]. In the scenario, where src points to the end of - * a buffer being filled, the returned pointer point to either the + * the byte src[-1]. In the scenario where /src/ points to the end of + * a buffer being filled, the returned pointer points to either the * final complete character in the string or to the earliest byte * that might start an incomplete character waiting for more bytes to * complete. * - * Because this routine always returns a value < src until the point - * it is forced to return start, it is useful as a backward iterator + * Because this routine always returns a value < /src/ until the point + * it is forced to return /start/, it is useful as a backward iterator * through a string that will always make progress and always be * prevented from running past the beginning of the string. * * In a string where all characters are complete and properly formed, - * and the value of src points to the first byte of a character, - * repeated Tcl_UtfPrev calls will step to the starting bytes of - * characters, one character at a time. Within those limitations, - * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot - * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and - * the caller will have to take greater care. + * and /src/ points to the first byte of a character, repeated + * Tcl_UtfPrev calls will step to the starting bytes of characters, one + * character at a time. Within those limitations, Tcl_UtfPrev and + * Tcl_UtfNext are inverses. If either condition cannot be met, + * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the + * caller will have to take greater care. * * Results: * A pointer to the start of a character in the string as described -- cgit v0.12 From b6144c5420305309bed79834b42c482e089afbea Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 19:04:47 +0000 Subject: delete merge litter --- generic/tclTest.c | 1 - 1 file changed, 1 deletion(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index b8507bf..6e0fbed 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -325,7 +325,6 @@ static Tcl_FSPathInFilesystemProc SimplePathInFilesystem; static Tcl_Obj * SimpleRedirect(Tcl_Obj *pathPtr); static Tcl_FSMatchInDirectoryProc SimpleMatchInDirectory; static Tcl_ObjCmdProc TestUtfNextCmd; -static Tcl_ObjCmdProc TestUtfNextCmd; static Tcl_ObjCmdProc TestUtfPrevCmd; static Tcl_ObjCmdProc TestNumUtfCharsCmd; static Tcl_ObjCmdProc TestFindFirstCmd; -- cgit v0.12 From 44740227313fb5b5da5fcd077619c478e0869adc Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 19:39:36 +0000 Subject: Improve the docs for Tcl_UtfNext. --- doc/Utf.3 | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index 87d1318..cb82699 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -217,11 +217,20 @@ returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is considered part of the UTF-8 string. .PP -Given \fIsrc\fR, a pointer to some location in a UTF-8 string, -\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the -string. The caller must not ask for the next character after the last -character in the string if the string is not terminated by a null -character. +\fBTcl_UtfNext\fR is used to step forward through a UTF-8 string. +If the UTF-8 string is made up entirely of complete, well-formed, and +valid character byte sequences, and \fIsrc\fR points to the lead byte +of one of those sequences, then repeated calls of \fBTcl_UtfNext\fR will +return pointers to the lead bytes of each character in the string, one +character at a time. In any other circumstance, \fBTcl_UtfNext\fR +returns \fIsrc\fR+1. \fBTcl_UtfNext\fR will always read \fIsrc[0]\fR +and may read as many following bytes (up to a total of \fBTCL_UTF_MAX\fR) +as needed to find the end of the byte sequence. If the string is +\fBNUL\fR-terminated, \fBTcl_UtfNext\fR will not read beyond the terminating +\fBNUL\fR byte. If not, the caller must use the companion routine +\fBTcl_UtfCharComplete\fR to determine whether there is any risk +\fBTcl_UtfNext\fR might read beyond the readable memory occupied +by the string. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made -- cgit v0.12 From daa258510cdb9253b1861915a86346d2dd093c9a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 16 Apr 2020 20:24:49 +0000 Subject: Add "knownBug" testcase, showing a situation in which Tcl_UtfNext doesn't behave as described in the documentation --- tests/utf.test | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 1b7b409..8742a5e 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -153,8 +153,13 @@ test utf-5.2 {Tcl_UtfFindLast} {testfindlast testbytestring} { testfindlast [testbytestring "abcbc"] 98 } {bc} +testConstraint testutfnext [llength [info commands testutfnext]] + test utf-6.1 {Tcl_UtfNext} { } {} +test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext knownBug} { + testutfnext \xE8\xA0\xA0 1 +} 3 testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From 4e88da99a66457b31572580b7bc6fedcfa3db2d6 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 20:28:15 +0000 Subject: merge litter --- generic/tclUtf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 67603af..b5c430b 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -558,7 +558,7 @@ Tcl_NumUtfChars( int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { - register int i; + register int i = 0; /* * The separate implementations are faster. @@ -567,7 +567,6 @@ Tcl_NumUtfChars( * single-byte char case specially. */ - i = 0; if (length < 0) { while ((*src != '\0') && (i < INT_MAX)) { src = TclUtfNext(src); -- cgit v0.12 From bcd1fa949014e08bee6b67360b8bea459e30cdc1 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 16 Apr 2020 20:59:11 +0000 Subject: Adjust test results and implementation for Tcl 8.6 current support of 4-byte sequences in a TCL_UTF_MAX=3 build. --- generic/tclUtf.c | 4 ++-- tests/utf.test | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index cd57d12..d6ba15c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -897,10 +897,10 @@ Tcl_UtfPrev( /* Continue the search backwards... */ look--; - } while (trailBytesSeen < TCL_UTF_MAX); + } while (trailBytesSeen < /* was TCL_UTF_MAX */ 4); /* - * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a + * We've seen 4 (was TCL_UTF_MAX) trail bytes, so we know there will not be a * properly formed byte sequence to find, and we can stop looking, * accepting the fallback. */ diff --git a/tests/utf.test b/tests/utf.test index 3a2911f..76cf3fe 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -363,7 +363,7 @@ test utf-6.68 {Tcl_UtfNext} testutfnext { } 1 test utf-6.69 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0 -} 1 +} 4 test utf-6.70 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xD0 } 1 @@ -378,22 +378,22 @@ test utf-6.73 {Tcl_UtfNext} testutfnext { } 1 test utf-6.74 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0G -} 1 +} 4 test utf-6.75 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0\xA0 -} 1 +} 4 test utf-6.76 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0\xD0 -} 1 +} 4 test utf-6.77 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0\xE8 -} 1 +} 4 test utf-6.78 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0\xF4 -} 1 +} 4 test utf-6.79 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xA0G\xF8 -} 1 +} 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xC0\x80 } 2 -- cgit v0.12 From 4ebbe478b1112910187685cb9d79496b95a2ce1a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 16 Apr 2020 22:06:54 +0000 Subject: Fix more test-cases for TCL_UTF_MAX=3 --- generic/tclUtf.c | 18 ++++++++++-------- tests/utf.test | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index d6ba15c..b37d55a 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -82,7 +82,7 @@ static const unsigned char complete[256] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, #endif - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, #if TCL_UTF_MAX > 4 4,4,4,4,4, @@ -589,8 +589,9 @@ Tcl_NumUtfChars( int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { - register int i = 0; - + Tcl_UniChar ch = 0; + int i = 0; + /* * The separate implementations are faster. * @@ -600,19 +601,20 @@ Tcl_NumUtfChars( if (length < 0) { while ((*src != '\0') && (i < INT_MAX)) { - src = TclUtfNext(src); + src += Tcl_UtfToUniChar(src, &ch); i++; } + if (i < 0) i = INT_MAX; /* Bug [2738427] */ } else { - register const char *endPtr = src + length - TCL_UTF_MAX; + const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { - src = TclUtfNext(src); + src += Tcl_UtfToUniChar(src, &ch); i++; } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src = TclUtfNext(src); + src += Tcl_UtfToUniChar(src, &ch); i++; } if (src < endPtr) { @@ -958,7 +960,7 @@ Tcl_UtfAtIndex( register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { -#if 0 +#if 1 /* The Tcl 8.6 implementation */ Tcl_UniChar ch = 0; int len = 0; diff --git a/tests/utf.test b/tests/utf.test index 76cf3fe..b5d1f5c 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -418,7 +418,7 @@ test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x90\x80\x80 } 1 -test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { +test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext knownBug} { testutfnext \xE8\xA0\xA0 1 } 3 -- cgit v0.12 From de19f40116629de7d96fff190c9faf41db135d5a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 16 Apr 2020 22:19:15 +0000 Subject: Fix build for TCL_UTF_MAX=4. Mark some failing tests with "knownBug". Those still need to be fixed! --- generic/tclUtf.c | 2 +- tests/utf.test | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b37d55a..0768992 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -972,7 +972,7 @@ Tcl_UtfAtIndex( #if TCL_UTF_MAX == 4 if ((ch >= 0xD800) && (len < 3)) { /* Index points at character following high Surrogate */ - src = TclUtfToUniChar(src, &ch); + src += TclUtfToUniChar(src, &ch); } #endif return src; diff --git a/tests/utf.test b/tests/utf.test index b5d1f5c..a730feb 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -415,7 +415,7 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { +test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext knownBug} { # Doesn't work with TCL_UTF_MAX>3 testutfnext \xF0\x90\x80\x80 } 1 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext knownBug} { @@ -628,13 +628,13 @@ test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev knownBug} { # Doesn't work with TCL_UTF_MAX>3 testutfprev A\xF0\x90\x80\x80 } 4 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev knownBug} { # Doesn't work with TCL_UTF_MAX>3 testutfprev A\xF0\x90\x80\x80 4 } 3 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev knownBug} { # Doesn't work with TCL_UTF_MAX>3 testutfprev A\xF0\x90\x80\x80 3 } 2 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { -- cgit v0.12 From 10f11062433181a7f8e687e3d4b4d13f350bece1 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 03:54:50 +0000 Subject: Fix the bad tests utf-2.11 and utf-6.88 that expected the wrong results. Also reconcile the merge from 8.5 to the new decoupling of bytesequence counts from indexed code unit couints. Docs still need an update. --- generic/tclUtf.c | 50 ++++++++++++++++++++------------------------------ tests/utf.test | 4 ++-- 2 files changed, 22 insertions(+), 32 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index d6ba15c..24fd418 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -589,6 +589,7 @@ Tcl_NumUtfChars( int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { + const char *next; register int i = 0; /* @@ -600,20 +601,23 @@ Tcl_NumUtfChars( if (length < 0) { while ((*src != '\0') && (i < INT_MAX)) { - src = TclUtfNext(src); - i++; + next = TclUtfNext(src); + i += 1 + ((next - src) > 3); + src = next; } } else { register const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { - src = TclUtfNext(src); - i++; + next = TclUtfNext(src); + i += 1 + ((next - src) > 3); + src = next; } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src = TclUtfNext(src); - i++; + next = TclUtfNext(src); + i += 1 + ((next - src) > 3); + src = next; } if (src < endPtr) { i += endPtr - src; @@ -958,33 +962,19 @@ Tcl_UtfAtIndex( register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { -#if 0 -/* The Tcl 8.6 implementation */ - Tcl_UniChar ch = 0; - int len = 0; - while (index-- > 0) { - len = TclUtfToUniChar(src, &ch); - src += len; - } -#if TCL_UTF_MAX == 4 - if ((ch >= 0xD800) && (len < 3)) { - /* Index points at character following high Surrogate */ - src = TclUtfToUniChar(src, &ch); - } -#endif - return src; -#else -/* The Tcl 8.5 implementation */ - while (index > 0) { - index--; - src = TclUtfNext(src); /* NOTE: counts each valid byte sequence - * as one character, maybe including those - * that will get stored as two UCS-2 units - * in the UTF-16 encoding. */ + const char *next = TclUtfNext(src); + + /* + * 4-byte sequences generate two UCS-2 code units in the + * UTF-16 representation, so in the current indexing scheme + * we need to account for an extra index (total of two). + */ + index -= ((next - src) > 3); + + src = next; } return src; -#endif } /* diff --git a/tests/utf.test b/tests/utf.test index 76cf3fe..dd94c54 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -96,7 +96,7 @@ test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} t } {4} test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, overflow} testbytestring { string length [testbytestring "\xF4\x90\x80\x80"] -} {4} +} {2} test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestring { string length [testbytestring "\xF8\xA2\xA2\xA2\xA2"] } {5} @@ -420,7 +420,7 @@ test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { } 1 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { testutfnext \xE8\xA0\xA0 1 -} 3 +} 2 testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From 2223b02d79a0887ce4942842deffea9a4c0c0051 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 04:08:46 +0000 Subject: Bring the single-byte marker for invalid lead byte \xC1 into the complete table --- generic/tclUtf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 24fd418..fdf2e32 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -82,7 +82,7 @@ static const unsigned char complete[256] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, #endif - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, #if TCL_UTF_MAX > 4 4,4,4,4,4, -- cgit v0.12 From 9a9a63968465d4126caa12d09b1d8a795a3a1df3 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 04:17:51 +0000 Subject: enable the tests on a bug fix branch --- tests/utf.test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index b89be25..2004625 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -415,7 +415,7 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext knownBug} { # Doesn't work with TCL_UTF_MAX>3 +test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext} { # Doesn't work with TCL_UTF_MAX>3 testutfnext \xF0\x90\x80\x80 } 1 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { @@ -628,13 +628,13 @@ test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev knownBug} { # Doesn't work with TCL_UTF_MAX>3 +test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { # Doesn't work with TCL_UTF_MAX>3 testutfprev A\xF0\x90\x80\x80 } 4 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev knownBug} { # Doesn't work with TCL_UTF_MAX>3 +test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { # Doesn't work with TCL_UTF_MAX>3 testutfprev A\xF0\x90\x80\x80 4 } 3 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev knownBug} { # Doesn't work with TCL_UTF_MAX>3 +test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { # Doesn't work with TCL_UTF_MAX>3 testutfprev A\xF0\x90\x80\x80 3 } 2 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { -- cgit v0.12 From a44378bbfa6e0aaa20eb2ecd506c6be8c0646c16 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 04:45:26 +0000 Subject: When supporting 4-byte sequences, make sure the Overlong test does too, and make sure the test results reflect it. --- generic/tclUtf.c | 2 +- tests/utf.test | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index fdf2e32..e637263 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -154,7 +154,7 @@ UtfCount( static CONST unsigned char overlong[3] = { 0x80, /* \xD0 -- all sequences valid */ 0xA0, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX >= 3 0x90 /* \xF0\x80 through \xF0\x8F are invalid prefixes */ #else 0xC0 /* Not used, but reject all again for safety. */ diff --git a/tests/utf.test b/tests/utf.test index 2004625..e8b1d51 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -630,13 +630,13 @@ test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { } 1 test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { # Doesn't work with TCL_UTF_MAX>3 testutfprev A\xF0\x90\x80\x80 -} 4 +} 1 test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { # Doesn't work with TCL_UTF_MAX>3 testutfprev A\xF0\x90\x80\x80 4 -} 3 +} 1 test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { # Doesn't work with TCL_UTF_MAX>3 testutfprev A\xF0\x90\x80\x80 3 -} 2 +} 1 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xF0\x90\x80\x80 2 } 1 -- cgit v0.12 From 1554b799cc9c928154d94bb10b010d8c137fcde7 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 04:51:48 +0000 Subject: more test fixes --- tests/utf.test | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index e8b1d51..c584ea1 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -415,9 +415,9 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext} { # Doesn't work with TCL_UTF_MAX>3 +test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext} { testutfnext \xF0\x90\x80\x80 -} 1 +} 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { testutfnext \xE8\xA0\xA0 1 } 2 @@ -628,13 +628,13 @@ test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { # Doesn't work with TCL_UTF_MAX>3 +test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { testutfprev A\xF0\x90\x80\x80 } 1 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { # Doesn't work with TCL_UTF_MAX>3 +test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { testutfprev A\xF0\x90\x80\x80 4 } 1 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { # Doesn't work with TCL_UTF_MAX>3 +test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { testutfprev A\xF0\x90\x80\x80 3 } 1 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { -- cgit v0.12 From b82a4c9f55e3fb83c99ed13d99f75148738780c4 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 05:14:43 +0000 Subject: When supporting 4-byte sequences even with TCL_UTF_MAX = 3, need to paramterize a few things differently. (utf-4.11 failures). --- generic/tclUtf.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e637263..4b5d500 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -84,11 +84,7 @@ static const unsigned char complete[256] = { #endif 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 4 4,4,4,4,4, -#else - 3,3,3,3,3, /* Tcl_UtfCharComplete() only checks TCL_UTF_MAX bytes */ -#endif 1,1,1,1,1,1,1,1,1,1,1 }; @@ -606,14 +602,14 @@ Tcl_NumUtfChars( src = next; } } else { - register const char *endPtr = src + length - TCL_UTF_MAX; + register const char *endPtr = src + length - /*TCL_UTF_MAX*/ 4; while (src < endPtr) { next = TclUtfNext(src); i += 1 + ((next - src) > 3); src = next; } - endPtr += TCL_UTF_MAX; + endPtr += /*TCL_UTF_MAX*/ 4; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { next = TclUtfNext(src); i += 1 + ((next - src) > 3); -- cgit v0.12 From 7bc9c5e06b82717dbf526519586d68ea79a0513f Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 17 Apr 2020 09:47:01 +0000 Subject: More test-cases --- tests/utf.test | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 02b7002..7b7b5c2 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -376,6 +376,18 @@ test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x90\x80\x80 } 1 +test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { + testutfnext \xA0\xA0 +} 1 +test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { + testutfnext \xE8\xA0\xA0 1 +} 2 +test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { + testutfnext \x80\x80 +} 1 +test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { + testutfnext \xF0\x80\x80 1 +} 2 testConstraint testutfprev [llength [info commands testutfprev]] @@ -460,6 +472,9 @@ test utf-7.11.1 {Tcl_UtfPrev} testutfprev { test utf-7.11.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xF8\xA0 3 } 1 +test utf-7.11.3 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xF8 3 +} 1 test utf-7.12 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0 } 1 @@ -548,9 +563,15 @@ test utf-7.26 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\x80\x80 } 3 test utf-7.27 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xE0\x80 +} 2 +test utf-7.27.1 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\x80\x80 3 } 2 test utf-7.28 {Tcl_UtfPrev -- overlong sequence} testutfprev { + testutfprev A\xE0 +} 1 +test utf-7.28.1 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\x80\x80 2 } 1 test utf-7.29 {Tcl_UtfPrev -- overlong sequence} testutfprev { @@ -607,6 +628,15 @@ test utf-7.45 {Tcl_UtfPrev -- no lead byte at start} testutfprev { test utf-7.46 {Tcl_UtfPrev -- no lead byte at start} testutfprev { testutfprev \xA0\xA0\xA0\xA0 } 3 +test utf-7.47 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {testutfprev} { + testutfprev \xE8\xA0 +} 0 +test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {testutfprev} { + testutfprev \xE8\xA0\xA0 2 +} 0 +test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { + testutfprev \xE8\xA0\x00 2 +} 0 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From c972afae3a5b4ec270e5c53bb4885ec820b09ad3 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 17 Apr 2020 11:02:19 +0000 Subject: Fix implementation of Tcl_UtfAtIndex() for TCL_UTF_MAX=6 (There's no test-case for this in the core-8-6-branch, but there is in core-8-branch). --- generic/tclUtf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 94fa5f6..1ba474e 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -971,14 +971,16 @@ Tcl_UtfAtIndex( register int index) /* The position of the desired character. */ { while (index-- > 0) { - const char *next = TclUtfNext(src); + const char *next = TclUtfNext(src); +#if TCL_UTF_MAX <= 4 /* * 4-byte sequences generate two UCS-2 code units in the * UTF-16 representation, so in the current indexing scheme * we need to account for an extra index (total of two). */ index -= ((next - src) > 3); +#endif src = next; } -- cgit v0.12 From 5a4198d655da6c9bd1daa51f9daf6b6e2182aee0 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 17 Apr 2020 12:49:32 +0000 Subject: Clean-up some unnecessary spacing. --- compat/unistd.h | 2 +- generic/tclCompile.h | 2 +- generic/tclStubInit.c | 2 +- generic/tclStubLib.c | 2 +- generic/tclTomMath.h | 10 +++++----- libtommath/tommath.h | 12 ++++++------ libtommath/tommath_superclass.h | 4 ++-- unix/tclUnixFCmd.c | 2 +- unix/tclUnixFile.c | 2 +- unix/tclUnixPort.h | 6 +++--- unix/tclUnixThrd.h | 2 +- unix/tclUnixTime.c | 4 ++-- unix/tclXtTest.c | 2 +- win/tclWinInt.h | 34 +++++++++++++++++----------------- 14 files changed, 43 insertions(+), 43 deletions(-) diff --git a/compat/unistd.h b/compat/unistd.h index 2de5bd0..a8f14f2 100644 --- a/compat/unistd.h +++ b/compat/unistd.h @@ -20,7 +20,7 @@ #define NULL 0 #endif -/* +/* * Strict POSIX stuff goes here. Extensions go down below, in the ifndef * _POSIX_SOURCE section. */ diff --git a/generic/tclCompile.h b/generic/tclCompile.h index 9ee60c3..ccfa4be 100644 --- a/generic/tclCompile.h +++ b/generic/tclCompile.h @@ -1257,7 +1257,7 @@ MODULE_SCOPE int TclWordKnownAtCompileTime(Tcl_Token *tokenPtr, * If the second macro is defined, logging to file starts immediately, * otherwise only after the first call to [tcl::dtrace]. Note that the debug * probe data is always computed, even when it is not logged to file. - * + * * Defining the third macro enables debug logging of inst probes (disabled * by default due to the significant performance impact). */ diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index d6f1da9..1a83752 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -1,4 +1,4 @@ -/* +/* * tclStubInit.c -- * * This file contains the initializers for the Tcl stub vectors. diff --git a/generic/tclStubLib.c b/generic/tclStubLib.c index 31fc865..c895ffe 100644 --- a/generic/tclStubLib.c +++ b/generic/tclStubLib.c @@ -142,7 +142,7 @@ TclTomMathInitializeStubs( const char* packageName = "tcl::tommath"; const char* errMsg = NULL; ClientData pkgClientData = NULL; - const char* actualVersion = + const char* actualVersion = tclStubsPtr->tcl_PkgRequireEx(interp, packageName, version, exact, &pkgClientData); TclTomMathStubs* stubsPtr = (TclTomMathStubs*) pkgClientData; if (actualVersion == NULL) { diff --git a/generic/tclTomMath.h b/generic/tclTomMath.h index b219405..1e455fe 100644 --- a/generic/tclTomMath.h +++ b/generic/tclTomMath.h @@ -80,10 +80,10 @@ extern "C" { #define DIGIT_BIT 60 #else /* this is the default case, 28-bit digits */ - + /* this is to make porting into LibTomCrypt easier :-) */ #ifndef CRYPT - #if defined(_MSC_VER) || defined(__BORLANDC__) + #if defined(_MSC_VER) || defined(__BORLANDC__) typedef unsigned __int64 ulong64; typedef signed __int64 long64; #else @@ -98,21 +98,21 @@ extern "C" { #endif typedef ulong64 mp_word; -#ifdef MP_31BIT +#ifdef MP_31BIT /* this is an extension that uses 31-bit digits */ #define DIGIT_BIT 31 #else /* default case is 28-bit digits, defines MP_28BIT as a handy macro to test */ #define DIGIT_BIT 28 #define MP_28BIT -#endif +#endif #endif /* define heap macros */ #if 0 /* these are macros in tclTomMathDecls.h */ #ifndef CRYPT /* default to libc stuff */ - #ifndef XMALLOC + #ifndef XMALLOC #define XMALLOC malloc #define XFREE free #define XREALLOC realloc diff --git a/libtommath/tommath.h b/libtommath/tommath.h index df460f6..babb478 100644 --- a/libtommath/tommath.h +++ b/libtommath/tommath.h @@ -36,7 +36,7 @@ extern "C" { /* detect 64-bit mode if possible */ -#if defined(__x86_64__) +#if defined(__x86_64__) #if !(defined(MP_64BIT) && defined(MP_16BIT) && defined(MP_8BIT)) #define MP_64BIT #endif @@ -69,10 +69,10 @@ extern "C" { #define DIGIT_BIT 60 #else /* this is the default case, 28-bit digits */ - + /* this is to make porting into LibTomCrypt easier :-) */ #ifndef CRYPT - #if defined(_MSC_VER) || defined(__BORLANDC__) + #if defined(_MSC_VER) || defined(__BORLANDC__) typedef unsigned __int64 ulong64; typedef signed __int64 long64; #else @@ -84,20 +84,20 @@ extern "C" { typedef unsigned long mp_digit; typedef ulong64 mp_word; -#ifdef MP_31BIT +#ifdef MP_31BIT /* this is an extension that uses 31-bit digits */ #define DIGIT_BIT 31 #else /* default case is 28-bit digits, defines MP_28BIT as a handy macro to test */ #define DIGIT_BIT 28 #define MP_28BIT -#endif +#endif #endif /* define heap macros */ #ifndef CRYPT /* default to libc stuff */ - #ifndef XMALLOC + #ifndef XMALLOC #define XMALLOC malloc #define XFREE free #define XREALLOC realloc diff --git a/libtommath/tommath_superclass.h b/libtommath/tommath_superclass.h index e3926df..c4ec19f 100644 --- a/libtommath/tommath_superclass.h +++ b/libtommath/tommath_superclass.h @@ -60,9 +60,9 @@ #undef BN_FAST_MP_INVMOD_C /* To safely undefine these you have to make sure your RSA key won't exceed the Comba threshold - * which is roughly 255 digits [7140 bits for 32-bit machines, 15300 bits for 64-bit machines] + * which is roughly 255 digits [7140 bits for 32-bit machines, 15300 bits for 64-bit machines] * which means roughly speaking you can handle upto 2536-bit RSA keys with these defined without - * trouble. + * trouble. */ #undef BN_S_MP_MUL_DIGS_C #undef BN_S_MP_SQR_C diff --git a/unix/tclUnixFCmd.c b/unix/tclUnixFCmd.c index 7360c32..a582223 100644 --- a/unix/tclUnixFCmd.c +++ b/unix/tclUnixFCmd.c @@ -1051,7 +1051,7 @@ TraverseUnixTree( unsigned short pathlen = ent->fts_pathlen - sourceLen; int type; Tcl_StatBuf *statBufPtr = NULL; - + if (info == FTS_DNR || info == FTS_ERR || info == FTS_NS) { errfile = ent->fts_path; break; diff --git a/unix/tclUnixFile.c b/unix/tclUnixFile.c index 038cbf8..3b09799 100644 --- a/unix/tclUnixFile.c +++ b/unix/tclUnixFile.c @@ -332,7 +332,7 @@ TclpMatchInDirectory( matchHiddenPat = (pattern[0] == '.') || ((pattern[0] == '\\') && (pattern[1] == '.')); - matchHidden = matchHiddenPat + matchHidden = matchHiddenPat || (types && (types->perm & TCL_GLOB_PERM_HIDDEN)); while ((entryPtr = TclOSreaddir(d)) != NULL) { /* INTL: Native. */ Tcl_DString utfDs; diff --git a/unix/tclUnixPort.h b/unix/tclUnixPort.h index 086dd91..0e1bce8 100644 --- a/unix/tclUnixPort.h +++ b/unix/tclUnixPort.h @@ -590,8 +590,8 @@ extern char **environ; /* *--------------------------------------------------------------------------- - * The following macros and declarations represent the interface between - * generic and unix-specific parts of Tcl. Some of the macros may override + * The following macros and declarations represent the interface between + * generic and unix-specific parts of Tcl. Some of the macros may override * functions declared in tclInt.h. *--------------------------------------------------------------------------- */ @@ -608,7 +608,7 @@ typedef int socklen_t; #endif /* - * The following macros have trivial definitions, allowing generic code to + * The following macros have trivial definitions, allowing generic code to * address platform-specific issues. */ diff --git a/unix/tclUnixThrd.h b/unix/tclUnixThrd.h index 6a73132..f03b530 100644 --- a/unix/tclUnixThrd.h +++ b/unix/tclUnixThrd.h @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. */ - + #ifndef _TCLUNIXTHRD #define _TCLUNIXTHRD diff --git a/unix/tclUnixTime.c b/unix/tclUnixTime.c index 4860876..a6572f9 100644 --- a/unix/tclUnixTime.c +++ b/unix/tclUnixTime.c @@ -228,7 +228,7 @@ TclpWideClicksToNanoseconds( #ifdef MAC_OSX_TCL static mach_timebase_info_data_t tb; static uint64_t maxClicksForUInt64; - + if (!tb.denom) { mach_timebase_info(&tb); maxClicksForUInt64 = UINT64_MAX / tb.numer; @@ -251,7 +251,7 @@ TclpWideClicksToNanoseconds( * * TclpWideClickInMicrosec -- * - * This procedure return scale to convert click values from the + * This procedure return scale to convert click values from the * TclpGetWideClicks native resolution to microsecond resolution * and back. * diff --git a/unix/tclXtTest.c b/unix/tclXtTest.c index 8437f2a..1393dfe 100644 --- a/unix/tclXtTest.c +++ b/unix/tclXtTest.c @@ -1,4 +1,4 @@ -/* +/* * tclXtTest.c -- * * Contains commands for Xt notifier specific tests on Unix. diff --git a/win/tclWinInt.h b/win/tclWinInt.h index af6619f..39790a0 100644 --- a/win/tclWinInt.h +++ b/win/tclWinInt.h @@ -59,7 +59,7 @@ typedef struct TCLEXCEPTION_REGISTRATION { #endif /* - * The following structure keeps track of whether we are using the + * The following structure keeps track of whether we are using the * multi-byte or the wide-character interfaces to the operating system. * System calls should be made through the following function table. */ @@ -76,10 +76,10 @@ typedef struct TclWinProcs { TCHAR *(WINAPI *charLowerProc)(TCHAR *); BOOL (WINAPI *copyFileProc)(CONST TCHAR *, CONST TCHAR *, BOOL); BOOL (WINAPI *createDirectoryProc)(CONST TCHAR *, LPSECURITY_ATTRIBUTES); - HANDLE (WINAPI *createFileProc)(CONST TCHAR *, DWORD, DWORD, + HANDLE (WINAPI *createFileProc)(CONST TCHAR *, DWORD, DWORD, LPSECURITY_ATTRIBUTES, DWORD, DWORD, HANDLE); - BOOL (WINAPI *createProcessProc)(CONST TCHAR *, TCHAR *, - LPSECURITY_ATTRIBUTES, LPSECURITY_ATTRIBUTES, BOOL, DWORD, + BOOL (WINAPI *createProcessProc)(CONST TCHAR *, TCHAR *, + LPSECURITY_ATTRIBUTES, LPSECURITY_ATTRIBUTES, BOOL, DWORD, LPVOID, CONST TCHAR *, LPSTARTUPINFOA, LPPROCESS_INFORMATION); BOOL (WINAPI *deleteFileProc)(CONST TCHAR *); HANDLE (WINAPI *findFirstFileProc)(CONST TCHAR *, WIN32_FIND_DATAT *); @@ -87,35 +87,35 @@ typedef struct TclWinProcs { BOOL (WINAPI *getComputerNameProc)(WCHAR *, LPDWORD); DWORD (WINAPI *getCurrentDirectoryProc)(DWORD, WCHAR *); DWORD (WINAPI *getFileAttributesProc)(CONST TCHAR *); - DWORD (WINAPI *getFullPathNameProc)(CONST TCHAR *, DWORD nBufferLength, + DWORD (WINAPI *getFullPathNameProc)(CONST TCHAR *, DWORD nBufferLength, WCHAR *, TCHAR **); DWORD (WINAPI *getModuleFileNameProc)(HMODULE, WCHAR *, int); - DWORD (WINAPI *getShortPathNameProc)(CONST TCHAR *, WCHAR *, DWORD); - UINT (WINAPI *getTempFileNameProc)(CONST TCHAR *, CONST TCHAR *, UINT, + DWORD (WINAPI *getShortPathNameProc)(CONST TCHAR *, WCHAR *, DWORD); + UINT (WINAPI *getTempFileNameProc)(CONST TCHAR *, CONST TCHAR *, UINT, WCHAR *); DWORD (WINAPI *getTempPathProc)(DWORD, WCHAR *); - BOOL (WINAPI *getVolumeInformationProc)(CONST TCHAR *, WCHAR *, DWORD, + BOOL (WINAPI *getVolumeInformationProc)(CONST TCHAR *, WCHAR *, DWORD, LPDWORD, LPDWORD, LPDWORD, WCHAR *, DWORD); HINSTANCE (WINAPI *loadLibraryExProc)(CONST TCHAR *, HANDLE, DWORD); TCHAR (WINAPI *lstrcpyProc)(WCHAR *, CONST TCHAR *); BOOL (WINAPI *moveFileProc)(CONST TCHAR *, CONST TCHAR *); BOOL (WINAPI *removeDirectoryProc)(CONST TCHAR *); - DWORD (WINAPI *searchPathProc)(CONST TCHAR *, CONST TCHAR *, + DWORD (WINAPI *searchPathProc)(CONST TCHAR *, CONST TCHAR *, CONST TCHAR *, DWORD, WCHAR *, TCHAR **); BOOL (WINAPI *setCurrentDirectoryProc)(CONST TCHAR *); BOOL (WINAPI *setFileAttributesProc)(CONST TCHAR *, DWORD); - /* + /* * These two function pointers will only be set when * Tcl_FindExecutable is called. If you don't ever call that * function, the application will crash whenever WinTcl tries to call * functions through these null pointers. That is not a bug in Tcl * -- Tcl_FindExecutable is obligatory in recent Tcl releases. */ - BOOL (WINAPI *getFileAttributesExProc)(CONST TCHAR *, + BOOL (WINAPI *getFileAttributesExProc)(CONST TCHAR *, GET_FILEEX_INFO_LEVELS, LPVOID); - BOOL (WINAPI *createHardLinkProc)(CONST TCHAR*, CONST TCHAR*, + BOOL (WINAPI *createHardLinkProc)(CONST TCHAR*, CONST TCHAR*, LPSECURITY_ATTRIBUTES); - + /* deleted INT (__cdecl *utimeProc)(CONST TCHAR*, struct _utimbuf *); */ /* These two are also NULL at start; see comment above */ HANDLE (WINAPI *findFirstFileExProc)(CONST TCHAR*, UINT, @@ -123,7 +123,7 @@ typedef struct TclWinProcs { LPVOID, DWORD); BOOL (WINAPI *getVolumeNameForVMPProc)(CONST TCHAR*, TCHAR*, DWORD); DWORD (WINAPI *getLongPathNameProc)(CONST TCHAR*, TCHAR*, DWORD); - /* + /* * These six are for the security sdk to get correct file * permissions on NT, 2000, XP, etc. On 95,98,ME they are * always null. @@ -131,9 +131,9 @@ typedef struct TclWinProcs { BOOL (WINAPI *getFileSecurityProc)(LPCTSTR lpFileName, SECURITY_INFORMATION RequestedInformation, PSECURITY_DESCRIPTOR pSecurityDescriptor, - DWORD nLength, + DWORD nLength, LPDWORD lpnLengthNeeded); - BOOL (WINAPI *impersonateSelfProc) (SECURITY_IMPERSONATION_LEVEL + BOOL (WINAPI *impersonateSelfProc) (SECURITY_IMPERSONATION_LEVEL ImpersonationLevel); BOOL (WINAPI *openThreadTokenProc) (HANDLE ThreadHandle, DWORD DesiredAccess, BOOL OpenAsSelf, @@ -191,7 +191,7 @@ MODULE_SCOPE HANDLE TclWinSerialOpen(HANDLE handle, CONST TCHAR *name, DWORD access); MODULE_SCOPE int TclWinSymLinkCopyDirectory(CONST TCHAR* LinkOriginal, CONST TCHAR* LinkCopy); -MODULE_SCOPE int TclWinSymLinkDelete(CONST TCHAR* LinkOriginal, +MODULE_SCOPE int TclWinSymLinkDelete(CONST TCHAR* LinkOriginal, int linkOnly); #if defined(TCL_THREADS) && defined(USE_THREAD_ALLOC) MODULE_SCOPE void TclWinFreeAllocCache(void); -- cgit v0.12 From 11e6ae48b4e7c038fc3d1a5bb47978e70b53249a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 17 Apr 2020 13:01:35 +0000 Subject: Unbreak shared windows build. Remove some ARGSUSED usage. --- generic/tclTest.c | 49 ++++++++++++++++++++----------------------------- unix/tclUnixChan.c | 6 ------ unix/tclUnixPipe.c | 4 ---- win/tclWinPipe.c | 1 - win/tclWinSock.c | 3 --- 5 files changed, 20 insertions(+), 43 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index 6e0fbed..8c29aa7 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -457,12 +457,6 @@ Tcltest_Init( "-appinitprocclosestderr", "-appinitprocsetrcfile", NULL }; - if (Tcl_InitStubs(interp, "8.5", 0) == NULL) { - return TCL_ERROR; - } - if (Tcl_TomMath_InitStubs(interp, "8.5") == NULL) { - return TCL_ERROR; - } /* TIP #268: Full patchlevel instead of just major.minor */ if (Tcl_PkgProvide(interp, "Tcltest", TCL_PATCH_LEVEL) == TCL_ERROR) { @@ -683,7 +677,6 @@ Tcltest_Init( *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int TestasyncCmd( ClientData dummy, /* Not used. */ @@ -913,7 +906,7 @@ AsyncThreadProc( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestcmdinfoCmd( ClientData dummy, /* Not used. */ @@ -985,7 +978,6 @@ TestcmdinfoCmd( return TCL_OK; } - /*ARGSUSED*/ static int CmdProc1( ClientData clientData, /* String to return. */ @@ -997,7 +989,6 @@ CmdProc1( return TCL_OK; } - /*ARGSUSED*/ static int CmdProc2( ClientData clientData, /* String to return. */ @@ -1044,7 +1035,7 @@ CmdDelProc2( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestcmdtokenCmd( ClientData dummy, /* Not used. */ @@ -1108,7 +1099,7 @@ TestcmdtokenCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestcmdtraceCmd( ClientData dummy, /* Not used. */ @@ -1398,7 +1389,7 @@ CreatedCommandProc2( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestdcallCmd( ClientData dummy, /* Not used. */ @@ -1463,7 +1454,7 @@ DelCallbackProc( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestdelCmd( ClientData dummy, /* Not used. */ @@ -1668,7 +1659,7 @@ TestdoubledigitsObjCmd(ClientData unused, *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestdstringCmd( ClientData dummy, /* Not used. */ @@ -1795,7 +1786,7 @@ static void SpecialFree(blockPtr) *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestencodingObjCmd( ClientData dummy, /* Not used. */ @@ -2650,7 +2641,7 @@ TestgetplatformCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestinterpdeleteCmd( ClientData dummy, /* Not used. */ @@ -2691,7 +2682,7 @@ TestinterpdeleteCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestlinkCmd( ClientData dummy, /* Not used. */ @@ -3222,7 +3213,7 @@ TestlocaleCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestMathFunc( ClientData clientData, /* Integer value to return. */ @@ -3252,7 +3243,7 @@ TestMathFunc( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestMathFunc2( ClientData clientData, /* Integer value to return. */ @@ -3359,7 +3350,7 @@ TestMathFunc2( * *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static void CleanupTestSetassocdataTests( ClientData clientData, /* Data to be released. */ @@ -3693,7 +3684,7 @@ TestparsevarnameObjCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestregexpObjCmd( ClientData dummy, /* Not used. */ @@ -4019,7 +4010,7 @@ TestregexpXflags( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestreturnObjCmd( ClientData dummy, /* Not used. */ @@ -4241,7 +4232,7 @@ TesttranslatefilenameCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestupvarCmd( ClientData dummy, /* Not used. */ @@ -4294,7 +4285,7 @@ TestupvarCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestseterrorcodeCmd( ClientData dummy, /* Not used. */ @@ -4347,7 +4338,7 @@ TestseterrorcodeCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestsetobjerrorcodeCmd( ClientData dummy, /* Not used. */ @@ -4376,7 +4367,7 @@ TestsetobjerrorcodeCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestfeventCmd( ClientData clientData, /* Not used. */ @@ -5768,7 +5759,7 @@ TestOpenFileChannelProc3( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestChannelCmd( ClientData clientData, /* Not used. */ @@ -6237,7 +6228,7 @@ TestChannelCmd( *---------------------------------------------------------------------- */ - /* ARGSUSED */ + static int TestChannelEventCmd( ClientData dummy, /* Not used. */ diff --git a/unix/tclUnixChan.c b/unix/tclUnixChan.c index 9cac4ae..d3207e2 100644 --- a/unix/tclUnixChan.c +++ b/unix/tclUnixChan.c @@ -344,7 +344,6 @@ static Tcl_ChannelType tcpChannelType = { *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int FileBlockModeProc( ClientData instanceData, /* File state. */ @@ -1840,7 +1839,6 @@ Tcl_MakeFileChannel( *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int TcpBlockModeProc( ClientData instanceData, /* Socket state. */ @@ -1938,7 +1936,6 @@ WaitForConnect( *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int TcpInputProc( ClientData instanceData, /* Socket state. */ @@ -2033,7 +2030,6 @@ TcpOutputProc( *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int TcpCloseProc( ClientData instanceData, /* The socket to close. */ @@ -2331,7 +2327,6 @@ TcpWatchProc( *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int TcpGetHandleProc( ClientData instanceData, /* The socket state. */ @@ -2853,7 +2848,6 @@ Tcl_OpenTcpServer( *---------------------------------------------------------------------- */ - /* ARGSUSED */ static void TcpAccept( ClientData data, /* Callback token. */ diff --git a/unix/tclUnixPipe.c b/unix/tclUnixPipe.c index d0a5e53..f2a2daa 100644 --- a/unix/tclUnixPipe.c +++ b/unix/tclUnixPipe.c @@ -410,7 +410,6 @@ TclpCloseFile( *--------------------------------------------------------------------------- */ - /* ARGSUSED */ int TclpCreateProcess( Tcl_Interp *interp, /* Interpreter in which to leave errors that @@ -889,7 +888,6 @@ TclGetAndDetachPids( *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int PipeBlockModeProc( ClientData instanceData, /* Pipe state. */ @@ -933,7 +931,6 @@ PipeBlockModeProc( *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int PipeCloseProc( ClientData instanceData, /* The pipe to close. */ @@ -1231,7 +1228,6 @@ Tcl_WaitPid( *---------------------------------------------------------------------- */ - /* ARGSUSED */ int Tcl_PidObjCmd( ClientData dummy, /* Not used. */ diff --git a/win/tclWinPipe.c b/win/tclWinPipe.c index e33273b..cdb955f 100644 --- a/win/tclWinPipe.c +++ b/win/tclWinPipe.c @@ -2782,7 +2782,6 @@ TclWinAddProcess( *---------------------------------------------------------------------- */ - /* ARGSUSED */ int Tcl_PidObjCmd( ClientData dummy, /* Not used. */ diff --git a/win/tclWinSock.c b/win/tclWinSock.c index 8f565b9..882aa4a 100644 --- a/win/tclWinSock.c +++ b/win/tclWinSock.c @@ -358,7 +358,6 @@ InitSockets(void) *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int SocketsEnabled(void) { @@ -387,7 +386,6 @@ SocketsEnabled(void) *---------------------------------------------------------------------- */ - /* ARGSUSED */ static void SocketExitHandler( ClientData clientData) /* Not used. */ @@ -786,7 +784,6 @@ TcpBlockProc( *---------------------------------------------------------------------- */ - /* ARGSUSED */ static int TcpCloseProc( ClientData instanceData, /* The socket to close. */ -- cgit v0.12 From a06573ba8beca69a80a887e0e3f9e33e37216fdc Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 17 Apr 2020 13:31:06 +0000 Subject: Add mem-debug build to Travis. Update Xcode from 8 to 8.3, 9 to 9.2 and 11.3 to 11.4 --- .travis.yml | 59 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 02e3102..e10ca7c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ sudo: false language: c - matrix: include: # Testing on Linux with various compilers @@ -24,6 +23,13 @@ matrix: env: - BUILD_DIR=unix - CFGOPT="--enable-symbols" + - name: "Linux/GCC/Mem-Debug" + os: linux + dist: bionic + compiler: gcc + env: + - BUILD_DIR=unix + - CFGOPT="--enable-symbols=mem" # Older versions of GCC... - name: "Linux/GCC 7/Shared" os: linux @@ -82,15 +88,17 @@ matrix: env: - BUILD_DIR=unix - CFGOPT="--enable-symbols" -# Testing on Mac, various styles - - name: "macOS/Xcode 11.3/Shared/Unix-like" - os: osx - osx_image: xcode11.3 + - name: "Linux/Clang/Mem-Debug" + os: linux + dist: bionic + compiler: clang env: - BUILD_DIR=unix - - name: "macOS/Xcode 11.3/Shared" + - CFGOPT="--enable-symbols=mem" +# Testing on Mac, various styles + - name: "macOS/Xcode 11.4/Shared" os: osx - osx_image: xcode11.3 + osx_image: xcode11.4 env: - BUILD_DIR=macosx install: [] @@ -98,6 +106,12 @@ matrix: - make all # The styles=develop avoids some weird problems on OSX - make test styles=develop + - name: "macOS/Xcode 11.4/Shared/Unix-like" + os: osx + osx_image: xcode11.4 + env: + - BUILD_DIR=unix +# Older MacOS versions - name: "macOS/Xcode 11/Shared" os: osx osx_image: xcode11 @@ -114,14 +128,14 @@ matrix: script: *mactest - name: "macOS/Xcode 9/Shared" os: osx - osx_image: xcode9 + osx_image: xcode9.2 env: - BUILD_DIR=macosx install: [] script: *mactest - name: "macOS/Xcode 8/Shared" os: osx - osx_image: xcode8 + osx_image: xcode8.3 env: - BUILD_DIR=macosx install: [] @@ -197,6 +211,15 @@ matrix: script: - cmd.exe //C vcvarsall.bat x64 '&&' nmake 'OPTS=symbols,threads' '-f' makefile.vc all tcltest - cmd.exe //C vcvarsall.bat x64 '&&' nmake 'OPTS=symbols,threads' '-f' makefile.vc test + - name: "Windows/MSVC/Mem-Debug" + os: windows + compiler: cl + env: *vcenv + before_install: *vcpreinst + install: [] + script: + - cmd.exe //C vcvarsall.bat x64 '&&' nmake 'STATS=memdbg OPTS=threads' '-f' makefile.vc all tcltest + - cmd.exe //C vcvarsall.bat x64 '&&' nmake 'STATS=memdbg OPTS=threads' '-f' makefile.vc test # Test on Windows with MSVC native (32-bit) - name: "Windows/MSVC-x86/Shared" os: windows @@ -225,6 +248,15 @@ matrix: script: - cmd.exe //C vcvarsall.bat x86 '&&' nmake 'OPTS=symbols,threads' '-f' makefile.vc all tcltest - cmd.exe //C vcvarsall.bat x86 '&&' nmake 'OPTS=symbols,threads' '-f' makefile.vc test + - name: "Windows/MSVC-x86/Mem-Debug" + os: windows + compiler: cl + env: *vcenv + before_install: *vcpreinst + install: [] + script: + - cmd.exe //C vcvarsall.bat x86 '&&' nmake 'STATS=memdbg OPTS=threads' '-f' makefile.vc all tcltest + - cmd.exe //C vcvarsall.bat x86 '&&' nmake 'STATS=memdbg OPTS=threads' '-f' makefile.vc test # Test on Windows with GCC native - name: "Windows/GCC/Shared" os: windows @@ -233,7 +265,7 @@ matrix: - BUILD_DIR=win - CFGOPT="--enable-64bit --enable-threads" before_install: &makepreinst - - choco install make + - choco install -y make - cd ${BUILD_DIR} - name: "Windows/GCC/Static" os: windows @@ -271,6 +303,13 @@ matrix: - BUILD_DIR=win - CFGOPT="--enable-threads --enable-symbols" before_install: *makepreinst + - name: "Windows/GCC-x86/Mem-Debug" + os: windows + compiler: gcc + env: + - BUILD_DIR=win + - CFGOPT="--enable-threads --enable-symbols=mem" + before_install: *makepreinst before_install: - cd ${BUILD_DIR} install: -- cgit v0.12 From 3e4b30ced377fbb060251b1fd2b93af3e0a174d2 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 16:56:51 +0000 Subject: Bring back the test utf-2.11; it fails in a TCL_UTF_MAX=4 build. --- tests/utf.test | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 7b7b5c2..a22dafe 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -13,6 +13,8 @@ if {[lsearch [namespace children] ::tcltest] == -1} { namespace import -force ::tcltest::* } +testConstraint testbytestring [llength [info commands testbytestring]] + catch {unset x} test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} { @@ -59,6 +61,12 @@ test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} { string length [bytestring "\xF4\xA2\xA2\xA2"] } {4} +test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, invalid} testbytestring { + # Would decode to U+110000 but that is outside the Unicode range. + string length [testbytestring "\xF4\x90\x80\x80"] +} {4} + + test utf-3.1 {Tcl_UtfCharComplete} { } {} -- cgit v0.12 From 56fd392b06220e2e62bbae6c85ca703b9524a0c0 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 20:15:07 +0000 Subject: Backport a collection of tests for consistency between branches. --- tests/utf.test | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index a22dafe..ff4f4a9 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -57,15 +57,22 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} { test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} { string length [bytestring "\xE4\xb9\x8e"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} { - string length [bytestring "\xF4\xA2\xA2\xA2"] +test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { + string length [testbytestring "\xF0\x90\x80\x80"] +} -result {4} +test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { + string length [testbytestring "\xF4\x8F\xBF\xBF"] +} -result {4} +test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { + string length [testbytestring "\xF0\x8F\xBF\xBF"] } {4} - -test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, invalid} testbytestring { +test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, overflow} testbytestring { # Would decode to U+110000 but that is outside the Unicode range. string length [testbytestring "\xF4\x90\x80\x80"] } {4} - +test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestring { + string length [testbytestring "\xF8\xA2\xA2\xA2\xA2"] +} {5} test utf-3.1 {Tcl_UtfCharComplete} { } {} -- cgit v0.12 From 71dae9330e54a368426513c8fa5ad8ac04bdf110 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 20:23:49 +0000 Subject: Corrections for many tests, changing lead byte \xF4 to \xF2. The tested sequences were always intended to be valid 4-byte sequences. Also a few errors with greedy \xHHHH . --- tests/utf.test | 92 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index ff4f4a9..7953a68 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -149,7 +149,7 @@ test utf-6.6 {Tcl_UtfNext} testutfnext { testutfnext A\xE8 } 1 test utf-6.7 {Tcl_UtfNext} testutfnext { - testutfnext A\xF4 + testutfnext A\xF2 } 1 test utf-6.8 {Tcl_UtfNext} testutfnext { testutfnext A\xF8 @@ -170,7 +170,7 @@ test utf-6.13 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xE8 } 1 test utf-6.14 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xF4 + testutfnext \xA0\xF2 } 1 test utf-6.15 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xF8 @@ -179,7 +179,7 @@ test utf-6.16 {Tcl_UtfNext} testutfnext { testutfnext \xD0 } 1 test utf-6.17 {Tcl_UtfNext} testutfnext { - testutfnext \xD0A + testutfnext \xD0G } 1 test utf-6.18 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xA0 @@ -191,7 +191,7 @@ test utf-6.20 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xE8 } 1 test utf-6.21 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xF4 + testutfnext \xD0\xF2 } 1 test utf-6.22 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xF8 @@ -200,7 +200,7 @@ test utf-6.23 {Tcl_UtfNext} testutfnext { testutfnext \xE8 } 1 test utf-6.24 {Tcl_UtfNext} testutfnext { - testutfnext \xE8A + testutfnext \xE8G } 1 test utf-6.25 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0 @@ -212,37 +212,37 @@ test utf-6.27 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xE8 } 1 test utf-6.28 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xF4 + testutfnext \xE8\xF2 } 1 test utf-6.29 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xF8 } 1 test utf-6.30 {Tcl_UtfNext} testutfnext { - testutfnext \xF4 + testutfnext \xF2 } 1 test utf-6.31 {Tcl_UtfNext} testutfnext { - testutfnext \xF4A + testutfnext \xF2G } 1 test utf-6.32 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0 + testutfnext \xF2\xA0 } 1 test utf-6.33 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xD0 + testutfnext \xF2\xD0 } 1 test utf-6.34 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xE8 + testutfnext \xF2\xE8 } 1 test utf-6.35 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xF4 + testutfnext \xF2\xF2 } 1 test utf-6.36 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xF8 + testutfnext \xF2\xF8 } 1 test utf-6.37 {Tcl_UtfNext} testutfnext { testutfnext \xF8 } 1 test utf-6.38 {Tcl_UtfNext} testutfnext { - testutfnext \xF8A + testutfnext \xF8G } 1 test utf-6.39 {Tcl_UtfNext} testutfnext { testutfnext \xF8\xA0 @@ -254,7 +254,7 @@ test utf-6.41 {Tcl_UtfNext} testutfnext { testutfnext \xF8\xE8 } 1 test utf-6.42 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xF4 + testutfnext \xF8\xF2 } 1 test utf-6.43 {Tcl_UtfNext} testutfnext { testutfnext \xF8\xF8 @@ -272,7 +272,7 @@ test utf-6.47 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xA0\xE8 } 2 test utf-6.48 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xF4 + testutfnext \xD0\xA0\xF2 } 2 test utf-6.49 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xA0\xF8 @@ -290,28 +290,28 @@ test utf-6.53 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xE8 } 1 test utf-6.54 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xF4 + testutfnext \xE8\xA0\xF2 } 1 test utf-6.55 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xF8 } 1 test utf-6.56 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0G + testutfnext \xF2\xA0G } 1 test utf-6.57 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0 + testutfnext \xF2\xA0\xA0 } 1 test utf-6.58 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xD0 + testutfnext \xF2\xA0\xD0 } 1 test utf-6.59 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xE8 + testutfnext \xF2\xA0\xE8 } 1 test utf-6.60 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xF4 + testutfnext \xF2\xA0\xF2 } 1 test utf-6.61 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xF8 + testutfnext \xF2\xA0\xF8 } 1 test utf-6.62 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xA0G @@ -326,46 +326,46 @@ test utf-6.65 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xA0\xE8 } 3 test utf-6.66 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xF4 + testutfnext \xE8\xA0\xA0\xF2 } 3 test utf-6.67 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xA0\xF8 } 3 test utf-6.68 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0G + testutfnext \xF2\xA0\xA0G } 1 test utf-6.69 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0 + testutfnext \xF2\xA0\xA0\xA0 } 1 test utf-6.70 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xD0 + testutfnext \xF2\xA0\xA0\xD0 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xE8 + testutfnext \xF2\xA0\xA0\xE8 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xF4 + testutfnext \xF2\xA0\xA0\xF2 } 1 test utf-6.73 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xF8 + testutfnext \xF2\xA0\xA0\xF8 } 1 test utf-6.74 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0G + testutfnext \xF2\xA0\xA0\xA0G } 1 test utf-6.75 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0\xA0 + testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 test utf-6.76 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0\xD0 + testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 test utf-6.77 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0\xE8 + testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 test utf-6.78 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0\xF4 + testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 test utf-6.79 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0G\xF8 + testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xC0\x80 @@ -425,13 +425,13 @@ test utf-7.4.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xF8\xA0\xA0 2 } 1 test utf-7.5 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4 + testutfprev A\xF2 } 1 test utf-7.5.1 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xA0 2 + testutfprev A\xF2\xA0\xA0\xA0 2 } 1 test utf-7.5.2 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xF8\xA0\xA0 2 + testutfprev A\xF2\xF8\xA0\xA0 2 } 1 test utf-7.6 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8 @@ -470,13 +470,13 @@ test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 test utf-7.10 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0 + testutfprev A\xF2\xA0 } 2 test utf-7.10.1 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xA0 3 + testutfprev A\xF2\xA0\xA0\xA0 3 } 2 test utf-7.10.2 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xF8\xA0 3 + testutfprev A\xF2\xA0\xF8\xA0 3 } 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0 @@ -518,13 +518,13 @@ test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 test utf-7.15 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0 + testutfprev A\xF2\xA0\xA0 } 3 test utf-7.15.1 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xA0 4 + testutfprev A\xF2\xA0\xA0\xA0 4 } 3 test utf-7.15.2 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xF8 4 + testutfprev A\xF2\xA0\xA0\xF8 4 } 3 test utf-7.16 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0 @@ -557,7 +557,7 @@ test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 test utf-7.20 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xA0 + testutfprev A\xF2\xA0\xA0\xA0 } 4 test utf-7.21 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 -- cgit v0.12 From a7275840313f4b3497a699c689df6c9a8e653474 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 20:38:38 +0000 Subject: [493dccc2de] Coverage that Tcl_UtfPrev also checks the upper range validity. --- tests/utf.test | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 7953a68..6d87928 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -652,6 +652,30 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { testutfprev \xE8\xA0\x00 2 } 0 +test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x8F\xBF\xBF +} 4 +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x8F\xBF\xBF 4 +} 3 +test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x8F\xBF\xBF 3 +} 2 +test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x8F\xBF\xBF 2 +} 1 +test utf-7.49 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x90\x80\x80 +} 4 +test utf-7.49.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x90\x80\x80 4 +} 3 +test utf-7.49.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x90\x80\x80 3 +} 2 +test utf-7.49.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x90\x80\x80 2 +} 1 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From 1f2c2c12ced95a827ea4bf2cbc9eae00f544f47b Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 21:03:30 +0000 Subject: More tests explicitly for Tcl_UtfNext near validity boundary U+110000 --- tests/utf.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 6d87928..01e0bb2 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -403,6 +403,12 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {te test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 } 2 +test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { + testutfnext \xF4\x8F\xBF\xBF +} 1 +test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { + testutfnext \xF4\x90\x80\x80 +} 1 testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From ea66e551f9104248156101365933628df52f2698 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 21:07:09 +0000 Subject: [493dccc2de] Revise sequence validity check to reject out of range decodes too. --- generic/tclUtf.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b5c430b..1883804 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -81,7 +81,7 @@ static CONST unsigned char totalBytes[256] = { */ static int UtfCount(int ch); -static int Overlong(unsigned char *src); +static int Invalid(unsigned char *src); /* *--------------------------------------------------------------------------- @@ -120,51 +120,52 @@ UtfCount( /* *--------------------------------------------------------------------------- * - * Overlong -- + * Invalid -- * * Utility routine to report whether /src/ points to the start of an - * overlong byte sequence that should be rejected. Caller guarantees - * that src[0] and src[1] are readable, and + * invald byte sequence that should be rejected. This might be because + * it is an overlong encoding, or because it encodes something out of + * the proper range. Caller guarantees that src[0] and src[1] are + * readable, and * * (src[0] >= 0xC0) && (src[0] != 0xC1) * (src[1] >= 0x80) && (src[1] < 0xC0) - * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF8 : 0xF0)) + * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF5 : 0xF0)) * * Results: * A boolean. *--------------------------------------------------------------------------- */ -static CONST unsigned char overlong[3] = { - 0x80, /* \xD0 -- all sequences valid */ - 0xA0, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ +static CONST unsigned char bounds[28] = { + 0x80, 0x80, /* \xC0 accepts \x80 only */ + 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, + 0x80, 0xBF, /* (\xC4 - \xDC) -- all sequences valid */ + 0xA0, 0xBF, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ + 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */ #if TCL_UTF_MAX > 3 - 0x90 /* \xF0\x80 through \xF0\x8F are invalid prefixes */ + 0x90, 0xBF, /* \xF0\x80 through \xF0\x8F are invalid prefixes */ + 0x80, 0x8F /* \xF4\x90 and higher are invalid prefixes */ #else - 0xC0 /* Not used, but reject all again for safety. */ + 0xC0, 0xBF, /* Not used, but reject all again for safety. */ + 0xC0, 0xBF /* Not used, but reject all again for safety. */ #endif }; INLINE static int -Overlong( +Invalid( unsigned char *src) /* Points to lead byte of a UTF-8 byte sequence */ { unsigned char byte = *src; + int index; - if (byte % 0x10) { - /* Only lead bytes 0xC0, 0xE0, 0xF0 need examination */ + if (byte % 0x04) { + /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */ return 0; } - if (byte == 0xC0) { - if (src[1] == 0x80) { - /* Valid sequence: \xC0\x80 for \u0000 */ - return 0; - } - /* Reject overlong: \xC0\x81 - \xC0\xBF */ - return 1; - } - if (src[1] < overlong[(byte >> 4) - 0x0D]) { - /* Reject overlong */ + index = (byte - 0xC0) >> 1; + if (src[1] < bounds[index] || src[1] > bounds[index+1]) { + /* Out of bounds - report invalid. */ return 1; } return 0; @@ -733,7 +734,7 @@ Tcl_UtfNext( } next++; } - if (Overlong((unsigned char *)src)) { + if (Invalid((unsigned char *)src)) { return src + 1; } return next; @@ -843,10 +844,10 @@ Tcl_UtfPrev( /* * trailBytesSeen > 0, so we can examine look[1] safely. - * Use that capability to screen out overlong sequences. + * Use that capability to screen out invalid sequences. */ - if (Overlong(look)) { + if (Invalid(look)) { /* Reject */ return fallback; } -- cgit v0.12 From 075da14ffdcf09068e360b5b85468e3748c5d2d8 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 18 Apr 2020 12:46:54 +0000 Subject: Update documentation of Tcl_UtfPrev/Tcl_UtfNext back to how it was. Will be updated later, when implementation is ready and agreed upon. --- doc/Utf.3 | 37 +++++++++++------------------ generic/tclUtf.c | 72 ++++++++++++++++++++------------------------------------ 2 files changed, 39 insertions(+), 70 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index cb82699..334fa6f 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -3,7 +3,7 @@ '\" '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. -'\" +'\" .TH Utf 3 "8.1" Tcl "Tcl Library Procedures" .so man.macros .BS @@ -13,7 +13,7 @@ Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_Ut .nf \fB#include \fR .sp -typedef ... Tcl_UniChar; +typedef ... \fBTcl_UniChar\fR; .sp int \fBTcl_UniCharToUtf\fR(\fIch, buf\fR) @@ -48,7 +48,7 @@ int int \fBTcl_UtfCharComplete\fR(\fIsrc, length\fR) .sp -int +int \fBTcl_NumUtfChars\fR(\fIsrc, length\fR) .sp const char * @@ -109,7 +109,7 @@ Pointer to the beginning of a UTF-8 string. .AP int index in The index of a character (not byte) in the UTF-8 string. .AP int *readPtr out -If non-NULL, filled with the number of bytes in the backslash sequence, +If non-NULL, filled with the number of bytes in the backslash sequence, including the backslash character. .AP char *dst out Buffer in which the bytes represented by the backslash sequence are stored. @@ -141,8 +141,8 @@ source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first -byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and -0x00FF and return 1. +byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0080 and +0x00FF and return 1. .PP \fBTcl_UniCharToUtfDString\fR converts the given Unicode string to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR. @@ -210,27 +210,18 @@ length is negative, all bytes up to the first null byte are used. \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is -considered part of the UTF-8 string. +considered part of the UTF-8 string. .PP \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is -considered part of the UTF-8 string. +considered part of the UTF-8 string. .PP -\fBTcl_UtfNext\fR is used to step forward through a UTF-8 string. -If the UTF-8 string is made up entirely of complete, well-formed, and -valid character byte sequences, and \fIsrc\fR points to the lead byte -of one of those sequences, then repeated calls of \fBTcl_UtfNext\fR will -return pointers to the lead bytes of each character in the string, one -character at a time. In any other circumstance, \fBTcl_UtfNext\fR -returns \fIsrc\fR+1. \fBTcl_UtfNext\fR will always read \fIsrc[0]\fR -and may read as many following bytes (up to a total of \fBTCL_UTF_MAX\fR) -as needed to find the end of the byte sequence. If the string is -\fBNUL\fR-terminated, \fBTcl_UtfNext\fR will not read beyond the terminating -\fBNUL\fR byte. If not, the caller must use the companion routine -\fBTcl_UtfCharComplete\fR to determine whether there is any risk -\fBTcl_UtfNext\fR might read beyond the readable memory occupied -by the string. +Given \fIsrc\fR, a pointer to some location in a UTF-8 string, +\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the +string. The caller must not ask for the next character after the last +character in the string if the string is not terminated by a null +character. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made @@ -262,7 +253,7 @@ characters. Behavior is undefined if a negative \fIindex\fR is given. .PP \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must -contain at least \fIindex\fR characters. This is equivalent to calling +contain at least \fIindex\fR characters. This is equivalent to calling \fBTcl_UtfNext\fR \fIindex\fR times. If a negative \fIindex\fR is given, the return pointer points to the first character in the source string. .PP diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 1883804..64ee0a8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -678,35 +678,13 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * The aim of this routine is to provide a way to iterate forward - * through a UTF-8 string. The caller is expected to pass a non-NULL - * pointer argument /src/ which points to a location within a string. - * (*src) will be read, so /src/ must not point to an unreadable - * location past the end of the string. If /src/ points to the - * beginning of a complete, well-formed and valid UTF_8 byte sequence - * of no more than TCL_UTF_MAX bytes, Tcl_UtfNext returns the pointer - * just past the end of that sequence. In any other circumstance, - * Tcl_UtfNext returns /src/+1. - * - * Because this routine always returns a value > /src/, it is useful - * as a forward iterator that will always make progress. If the string - * is NUL-terminated, Tcl_UtfNext will not read beyond the terminating - * NUL character. If it is not NUL-terminated, the caller must make - * use of the companion routine Tcl_UtfCharComplete to test whether - * there is risk that Tcl_UtfNext will read beyond the end of the string. - * Tcl_UtfNext will never read more than TCL_UTF_MAX bytes. - * - * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfNext calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * Given a pointer to some current location in a UTF-8 string, move + * forward one character. The caller must ensure that they are not asking + * for the next character after the last character in the string. * * Results: - * A pointer to the start of the next character in the string (or to - * the end of the string) as described above. + * The return value is the pointer to the next character in the UTF-8 + * string. * * Side effects: * None. @@ -747,37 +725,37 @@ Tcl_UtfNext( * * The aim of this routine is to provide a way to move backward * through a UTF-8 string. The caller is expected to pass non-NULL - * pointer arguments /start/ and /src/. /start/ points to the beginning - * of a string, and /src/ (>= /start/) points to a location within (or - * just past the end) of the string. This routine always returns a - * pointer within the string (>= /start/). When (/src/ == /start/), - * it returns /start/. When (/src/ > /start/), it returns a pointer - * (< /src/) and (>= /src/ - TCL_UTF_MAX). Subject to these constraints, - * the routine returns a pointer to the earliest byte in the string that - * starts a character when characters are read starting at /start/ and + * pointer arguments start and src. start points to the beginning + * of a string, and src >= start points to a location within (or just + * past the end) of the string. This routine always returns a + * pointer within the string (>= start). When (src == start), it + * returns start. When (src > start), it returns a pointer (< src) + * and (>= src - TCL_UTF_MAX). Subject to these constraints, the + * routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at start and * that character might include the byte src[-1]. The routine will * examine only those bytes in the range that might be returned. - * It will not examine the byte (*src), and because of that cannot + * It will not examine the byte *src, and because of that cannot * determine for certain in all circumstances whether the character * that begins with the returned pointer will or will not include - * the byte src[-1]. In the scenario where /src/ points to the end of - * a buffer being filled, the returned pointer points to either the + * the byte src[-1]. In the scenario, where src points to the end of + * a buffer being filled, the returned pointer point to either the * final complete character in the string or to the earliest byte * that might start an incomplete character waiting for more bytes to * complete. * - * Because this routine always returns a value < /src/ until the point - * it is forced to return /start/, it is useful as a backward iterator + * Because this routine always returns a value < src until the point + * it is forced to return start, it is useful as a backward iterator * through a string that will always make progress and always be * prevented from running past the beginning of the string. * * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfPrev calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * and the value of src points to the first byte of a character, + * repeated Tcl_UtfPrev calls will step to the starting bytes of + * characters, one character at a time. Within those limitations, + * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot + * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and + * the caller will have to take greater care. * * Results: * A pointer to the start of a character in the string as described @@ -887,7 +865,7 @@ Tcl_UtfPrev( * * Tcl_UniCharAtIndex -- * - * Returns the Unicode character represented at the specified character + * Returns the Tcl_UniChar represented at the specified character * (not byte) position in the UTF-8 string. * * Results: -- cgit v0.12 From 4622dd341acfb34627c5366ff8a04aa6dcbf4fc0 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 18 Apr 2020 13:47:06 +0000 Subject: Fix [c574e50a3b30e76f]: CRASH: utf-2.[89] in 8.5 built with TCL_UTF_MAX=4 --- generic/regcustom.h | 2 +- generic/tcl.h | 2 +- generic/tclUtf.c | 84 +---------------- tests/utf.test | 259 ++++++++++++++++++++++++++-------------------------- tests/util.test | 1 + 5 files changed, 132 insertions(+), 216 deletions(-) diff --git a/generic/regcustom.h b/generic/regcustom.h index 57a2d47..ac33087 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ #define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ diff --git a/generic/tcl.h b/generic/tcl.h index 7378a8f..d7d064c 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2148,7 +2148,7 @@ typedef struct Tcl_Parse { * reflected in regcustom.h. */ -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 /* * unsigned int isn't 100% accurate as it should be a strict 4-byte value * (perhaps wchar_t). 64-bit systems may have troubles. The size of this diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 64ee0a8..3741d70 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -209,30 +209,6 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { -#if TCL_UTF_MAX == 4 - if ((ch & 0xF800) == 0xD800) { - if (ch & 0x0400) { - /* Low surrogate */ - if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) - && ((buf[2] & 0xCF) == 0)) { - /* Previous Tcl_UniChar was a High surrogate, so combine */ - buf[3] = (char) ((ch & 0x3F) | 0x80); - buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); - return 4; - } - /* Previous Tcl_UniChar was not a High surrogate, so just output */ - } else { - /* High surrogate */ - ch += 0x40; - /* Fill buffer with specific 3-byte (invalid) byte combination, - so following Low surrogate can recognize it and combine */ - buf[2] = (char) ((ch << 4) & 0x30); - buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); - buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); - return 0; - } - } -#endif goto three; } @@ -321,15 +297,6 @@ Tcl_UniCharToUtfDString( * Tcl_UtfCharComplete() before calling this routine to ensure that * enough bytes remain in the string. * - * If TCL_UTF_MAX == 4, special handling of Surrogate pairs is done: - * For any UTF-8 string containing a character outside of the BMP, the - * first call to this function will fill *chPtr with the high surrogate - * and generate a return value of 0. Calling Tcl_UtfToUniChar again - * will produce the low surrogate and a return value of 4. Because *chPtr - * is used to remember whether the high surrogate is already produced, it - * is recommended to initialize the variable it points to as 0 before - * the first call to Tcl_UtfToUniChar is done. - * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the * number of bytes from the UTF-8 string that were consumed. @@ -402,34 +369,15 @@ Tcl_UtfToUniChar( /* * Four-byte-character lead byte followed by three trail bytes. */ -#if TCL_UTF_MAX == 4 - Tcl_UniChar surrogate; - - byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; - surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10)); - if (byte & 0x100000) { - /* out of range, < 0x10000 or > 0x10ffff */ - } else if (*chPtr != surrogate) { - /* produce high surrogate, but don't advance source pointer */ - *chPtr = surrogate; - return 0; - } else { - /* produce low surrogate, and advance source pointer */ - *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF)); - return 4; - } -#else *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { return 4; } -#endif } /* - * A four-byte-character lead-byte not followed by two trail-bytes + * A four-byte-character lead-byte not followed by three trail-bytes * represents itself. */ } @@ -1230,16 +1178,6 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif return (ch1 - ch2); } } @@ -1280,16 +1218,6 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1329,16 +1257,6 @@ TclUtfCasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { diff --git a/tests/utf.test b/tests/utf.test index 01e0bb2..9a55729 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -13,54 +13,61 @@ if {[lsearch [namespace children] ::tcltest] == -1} { namespace import -force ::tcltest::* } +testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint testfindfirst [llength [info commands testfindfirst]] +testConstraint testfindlast [llength [info commands testfindlast]] +testConstraint testnumutfchars [llength [info commands testnumutfchars]] +testConstraint teststringobj [llength [info commands teststringobj]] +testConstraint testutfnext [llength [info commands testutfnext]] +testConstraint testutfprev [llength [info commands testutfprev]] catch {unset x} -test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} { +test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} testbytestring { set x \x01 -} [bytestring "\x01"] -test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} { +} [testbytestring "\x01"] +test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { set x "\x00" -} [bytestring "\xc0\x80"] -test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} { +} [testbytestring "\xC0\x80"] +test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { set x "\xe0" -} [bytestring "\xc3\xa0"] -test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} { - set x "\u4e4e" -} [bytestring "\xe4\xb9\x8e"] -test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} { +} [testbytestring "\xC3\xA0"] +test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} testbytestring { + set x "\u4E4E" +} [testbytestring "\xE4\xB9\x8E"] +test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring { format %c 0x110000 -} [bytestring "\xef\xbf\xbd"] -test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} { +} [testbytestring "\xEF\xBF\xBD"] +test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring { format %c -1 -} [bytestring "\xef\xbf\xbd"] +} [testbytestring "\xEF\xBF\xBD"] test utf-2.1 {Tcl_UtfToUniChar: low ascii} { string length "abc" } {3} -test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} { - string length [bytestring "\x82\x83\x84"] +test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring { + string length [testbytestring "\x82\x83\x84"] } {3} -test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} { - string length [bytestring "\xC2"] +test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} testbytestring { + string length [testbytestring "\xC2"] } {1} -test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} { - string length [bytestring "\xC2\xa2"] +test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} testbytestring { + string length [testbytestring "\xC2\xA2"] } {1} -test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} { - string length [bytestring "\xE2"] +test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} testbytestring { + string length [testbytestring "\xE2"] } {1} -test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} { - string length [bytestring "\xE2\xA2"] +test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestring { + string length [testbytestring "\xE2\xA2"] } {2} -test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} { - string length [bytestring "\xE4\xb9\x8e"] +test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring { + string length [testbytestring "\xE4\xb9\x8E"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { +test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF0\x90\x80\x80"] } -result {4} -test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { +test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] } -result {4} test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { @@ -77,57 +84,51 @@ test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestr test utf-3.1 {Tcl_UtfCharComplete} { } {} -testConstraint testnumutfchars [llength [info commands testnumutfchars]] -testConstraint testfindfirst [llength [info commands testfindfirst]] -testConstraint testfindlast [llength [info commands testfindlast]] - test utf-4.1 {Tcl_NumUtfChars: zero length} testnumutfchars { testnumutfchars "" } {0} -test utf-4.2 {Tcl_NumUtfChars: length 1} testnumutfchars { - testnumutfchars [bytestring "\xC2\xA2"] +test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC2\xA2"] } {1} -test utf-4.3 {Tcl_NumUtfChars: long string} testnumutfchars { - testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] +test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8e\uA2\u4E4E"] } {7} -test utf-4.4 {Tcl_NumUtfChars: #u0000} testnumutfchars { - testnumutfchars [bytestring "\xC0\x80"] +test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC0\x80"] } {1} test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars { testnumutfchars "" 0 } {0} -test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} testnumutfchars { - testnumutfchars [bytestring "\xC2\xA2"] 1 +test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC2\xA2"] 1 } {1} -test utf-4.7 {Tcl_NumUtfChars: long string, calc len} testnumutfchars { - testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 10 +test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\u4E4E"] 10 } {7} -test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} testnumutfchars { - testnumutfchars [bytestring "\xC0\x80"] 1 +test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC0\x80"] 1 } {1} # Bug [2738427]: Tcl_NumUtfChars(...) no overflow check -test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} testnumutfchars { - testnumutfchars [bytestring "\xE2\x82\xAC"] 2 +test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xE2\x82\xAC"] 2 } {2} -test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} testnumutfchars { - testnumutfchars [bytestring "\x00"] 2 +test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\x00"] 2 } {2} -test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} testnumutfchars { - testnumutfchars [bytestring \xf0\x9f\x92\xa9] 3 +test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring \xf0\x9f\x92\xA9] 3 } {3} -test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} testnumutfchars { - testnumutfchars [bytestring \xf0\x9f\x92\xa9] 4 +test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring compat85} { + testnumutfchars [testbytestring \xf0\x9f\x92\xA9] 4 } {4} -test utf-5.1 {Tcl_UtfFindFirst} testfindfirst { - testfindfirst [bytestring "abcbc"] 98 +test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { + testfindfirst [testbytestring "abcbc"] 98 } {bcbc} -test utf-5.2 {Tcl_UtfFindLast} testfindlast { - testfindlast [bytestring "abcbc"] 98 +test utf-5.2 {Tcl_UtfFindLast} {testfindlast testbytestring} { + testfindlast [testbytestring "abcbc"] 98 } {bc} -testConstraint testutfnext [llength [info commands testutfnext]] - test utf-6.1 {Tcl_UtfNext} testutfnext { # This takes the pointer one past the terminating NUL. # This is really an invalid call. @@ -334,7 +335,7 @@ test utf-6.67 {Tcl_UtfNext} testutfnext { test utf-6.68 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0G } 1 -test utf-6.69 {Tcl_UtfNext} testutfnext { +test utf-6.69 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0 } 1 test utf-6.70 {Tcl_UtfNext} testutfnext { @@ -349,22 +350,22 @@ test utf-6.71 {Tcl_UtfNext} testutfnext { test utf-6.73 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xF8 } 1 -test utf-6.74 {Tcl_UtfNext} testutfnext { +test utf-6.74 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G } 1 -test utf-6.75 {Tcl_UtfNext} testutfnext { +test utf-6.75 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 -test utf-6.76 {Tcl_UtfNext} testutfnext { +test utf-6.76 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 -test utf-6.77 {Tcl_UtfNext} testutfnext { +test utf-6.77 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 -test utf-6.78 {Tcl_UtfNext} testutfnext { +test utf-6.78 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 -test utf-6.79 {Tcl_UtfNext} testutfnext { +test utf-6.79 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { @@ -388,7 +389,7 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { +test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { testutfnext \xF0\x90\x80\x80 } 1 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { @@ -403,15 +404,13 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {te test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 } 2 -test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { +test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { testutfnext \xF4\x8F\xBF\xBF } 1 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { testutfnext \xF4\x90\x80\x80 } 1 -testConstraint testutfprev [llength [info commands testutfprev]] - test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} } 0 @@ -475,13 +474,13 @@ test utf-7.9.1 {Tcl_UtfPrev} testutfprev { test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 -test utf-7.10 {Tcl_UtfPrev} testutfprev { +test utf-7.10 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0 } 2 -test utf-7.10.1 {Tcl_UtfPrev} testutfprev { +test utf-7.10.1 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 3 } 2 -test utf-7.10.2 {Tcl_UtfPrev} testutfprev { +test utf-7.10.2 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xF8\xA0 3 } 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { @@ -523,13 +522,13 @@ test utf-7.14.1 {Tcl_UtfPrev} testutfprev { test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 -test utf-7.15 {Tcl_UtfPrev} testutfprev { +test utf-7.15 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0 } 3 -test utf-7.15.1 {Tcl_UtfPrev} testutfprev { +test utf-7.15.1 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 4 } 3 -test utf-7.15.2 {Tcl_UtfPrev} testutfprev { +test utf-7.15.2 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xF8 4 } 3 test utf-7.16 {Tcl_UtfPrev} testutfprev { @@ -562,7 +561,7 @@ test utf-7.18.2 {Tcl_UtfPrev} testutfprev { test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 -test utf-7.20 {Tcl_UtfPrev} testutfprev { +test utf-7.20 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 } 4 test utf-7.21 {Tcl_UtfPrev} testutfprev { @@ -622,16 +621,16 @@ test utf-7.36 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 3 } 1 -test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.38 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 } 4 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 4 } 3 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 3 } 2 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { @@ -658,13 +657,13 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { testutfprev \xE8\xA0\x00 2 } 0 -test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF } 4 -test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 4 } 3 -test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 3 } 2 test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { @@ -708,18 +707,18 @@ test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { set x \n } { } -test utf-10.2 {Tcl_UtfBackslash: \u subst} { - set x \ua2 -} [bytestring "\xc2\xa2"] -test utf-10.3 {Tcl_UtfBackslash: longer \u subst} { - set x \u4e21 -} [bytestring "\xe4\xb8\xa1"] -test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} { - set x \u4e2k -} "[bytestring \xd3\xa2]k" -test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} { - set x \u4e216 -} "[bytestring \xe4\xb8\xa1]6" +test utf-10.2 {Tcl_UtfBackslash: \u subst} testbytestring { + set x \uA2 +} [testbytestring "\xC2\xA2"] +test utf-10.3 {Tcl_UtfBackslash: longer \u subst} testbytestring { + set x \u4E21 +} [testbytestring "\xE4\xB8\xA1"] +test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} testbytestring { + set x \u4E2k +} "[testbytestring \xD3\xA2]k" +test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} testbytestring { + set x \u4E216 +} "[testbytestring \xE4\xB8\xA1]6" proc bsCheck {char num} { global errNum test utf-10.$errNum {backslash substitution} { @@ -774,11 +773,11 @@ test utf-11.2 {Tcl_UtfToUpper} { string toupper abc } ABC test utf-11.3 {Tcl_UtfToUpper} { - string toupper \u00e3ab -} \u00c3AB + string toupper \u00E3ab +} \u00C3AB test utf-11.4 {Tcl_UtfToUpper} { - string toupper \u01e3ab -} \u01e2AB + string toupper \u01E3ab +} \u01E2AB test utf-12.1 {Tcl_UtfToLower} { string tolower {} @@ -787,11 +786,11 @@ test utf-12.2 {Tcl_UtfToLower} { string tolower ABC } abc test utf-12.3 {Tcl_UtfToLower} { - string tolower \u00c3AB -} \u00e3ab + string tolower \u00C3AB +} \u00E3ab test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01e2AB -} \u01e3ab + string tolower \u01E2AB +} \u01E3ab test utf-13.1 {Tcl_UtfToTitle} { string totitle {} @@ -800,11 +799,11 @@ test utf-13.2 {Tcl_UtfToTitle} { string totitle abc } Abc test utf-13.3 {Tcl_UtfToTitle} { - string totitle \u00e3ab -} \u00c3ab + string totitle \u00E3ab +} \u00C3ab test utf-13.4 {Tcl_UtfToTitle} { - string totitle \u01f3ab -} \u01f2ab + string totitle \u01F3ab +} \u01F2ab test utf-14.1 {Tcl_UtfNcasecmp} { string compare -nocase a b @@ -823,7 +822,7 @@ test utf-15.1 {Tcl_UniCharToUpper, negative delta} { string toupper aA } AA test utf-15.2 {Tcl_UniCharToUpper, positive delta} { - string toupper \u0178\u00ff + string toupper \u0178\xFF } \u0178\u0178 test utf-15.3 {Tcl_UniCharToUpper, no delta} { string toupper ! @@ -833,24 +832,24 @@ test utf-16.1 {Tcl_UniCharToLower, negative delta} { string tolower aA } aa test utf-16.2 {Tcl_UniCharToLower, positive delta} { - string tolower \u0178\u00ff\uA78D\u01c5 -} \u00ff\u00ff\u0265\u01c6 + string tolower \u0178\xFF\uA78D\u01C5 +} \xFF\xFF\u0265\u01C6 test utf-17.1 {Tcl_UniCharToLower, no delta} { string tolower ! } ! test utf-18.1 {Tcl_UniCharToTitle, add one for title} { - string totitle \u01c4 -} \u01c5 + string totitle \u01C4 +} \u01C5 test utf-18.2 {Tcl_UniCharToTitle, subtract one for title} { - string totitle \u01c6 -} \u01c5 + string totitle \u01C6 +} \u01C5 test utf-18.3 {Tcl_UniCharToTitle, subtract delta for title (positive)} { - string totitle \u017f + string totitle \u017F } \u0053 test utf-18.4 {Tcl_UniCharToTitle, subtract delta for title (negative)} { - string totitle \u00ff + string totitle \xFF } \u0178 test utf-18.5 {Tcl_UniCharToTitle, no delta} { string totitle ! @@ -865,15 +864,15 @@ test utf-20.1 {TclUniCharNcmp} { test utf-21.1 {TclUniCharIsAlnum} { # this returns 1 with Unicode 7 compliance - string is alnum \u1040\u021f\u0220 + string is alnum \u1040\u021F\u0220 } {1} test utf-21.2 {unicode alnum char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f] + list [regexp {^[[:alnum:]]+$} \u1040\u021F\u0220] [regexp {^\w+$} \u1040\u021F\u0220_\u203F\u2040\u2054\uFE33\uFE34\uFE4D\uFE4E\uFE4F\uFF3F] } {1 1} test utf-21.3 {unicode print char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - regexp {^[[:print:]]+$} \ufbc1 + regexp {^[[:print:]]+$} \uFBC1 } 1 test utf-21.4 {TclUniCharIsGraph} { # [Bug 3464428] @@ -885,11 +884,11 @@ test utf-21.5 {unicode graph char in regc_locale.c} { } {1} test utf-21.6 {TclUniCharIsGraph} { # [Bug 3464428] - string is graph \u00a0 + string is graph \xA0 } {0} test utf-21.7 {unicode graph char in regc_locale.c} { # [Bug 3464428] - regexp {[[:graph:]]} \u0020\u00a0\u2028\u2029 + regexp {[[:graph:]]} \x20\xA0\u2028\u2029 } {0} test utf-21.8 {TclUniCharIsPrint} { # [Bug 3464428] @@ -905,49 +904,47 @@ test utf-21.10 {unicode print char in regc_locale.c} { } {0} test utf-21.11 {TclUniCharIsControl} { # [Bug 3464428] - string is control \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff + string is control \x00\x1F\xad\u0605\u061C\u180E\u2066\uFEFF } {1} test utf-21.12 {unicode control char in regc_locale.c} { # [Bug 3464428], [Bug a876646efe] - regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff + regexp {^[[:cntrl:]]*$} \x00\x1F\xad\u0605\u061C\u180E\u2066\uFEFF } {1} test utf-22.1 {TclUniCharIsWordChar} { string wordend "xyz123_bar fg" 0 } 10 test utf-22.2 {TclUniCharIsWordChar} { - string wordend "x\u5080z123_bar\u203c fg" 0 + string wordend "x\u5080z123_bar\u203C fg" 0 } 10 test utf-23.1 {TclUniCharIsAlpha} { # this returns 1 with Unicode 7 compliance - string is alpha \u021f\u0220\u037f\u052f + string is alpha \u021F\u0220\u037F\u052F } {1} test utf-23.2 {unicode alpha char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - regexp {^[[:alpha:]]+$} \u021f\u0220\u037f\u052f + regexp {^[[:alpha:]]+$} \u021F\u0220\u037F\u052F } {1} test utf-24.1 {TclUniCharIsDigit} { # this returns 1 with Unicode 7 compliance - string is digit \u1040\uabf0 + string is digit \u1040\uABF0 } {1} test utf-24.2 {unicode digit char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0] + list [regexp {^[[:digit:]]+$} \u1040\uABF0] [regexp {^\d+$} \u1040\uABF0] } {1 1} test utf-24.3 {TclUniCharIsSpace} { # this returns 1 with Unicode 7 compliance - string is space \u1680\u180e\u202f + string is space \u1680\u180E\u202F } {1} test utf-24.4 {unicode space char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:space:]]+$} \u1680\u180e\u202f] [regexp {^\s+$} \u1680\u180e\u202f] + list [regexp {^[[:space:]]+$} \u1680\u180E\u202F] [regexp {^\s+$} \u1680\u180E\u202F] } {1 1} -testConstraint teststringobj [llength [info commands teststringobj]] - test utf-25.1 {Tcl_UniCharNcasecmp} -constraints teststringobj \ -setup { testobj freeallvars diff --git a/tests/util.test b/tests/util.test index 85c06dd..a483de1 100644 --- a/tests/util.test +++ b/tests/util.test @@ -15,6 +15,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { testConstraint testdstring [llength [info commands testdstring]] testConstraint testconcatobj [llength [info commands testconcatobj]] testConstraint testdoubledigits [llength [info commands testdoubledigits]] +testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] # Big test for correct ordering of data in [expr] -- cgit v0.12 From 93dcb677fb5f574221d801236bf355ed1abb5461 Mon Sep 17 00:00:00 2001 From: dgp Date: Sat, 18 Apr 2020 15:02:51 +0000 Subject: Make TCL_UTF_MAX=4 build test clean again. --- generic/tcl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tcl.h b/generic/tcl.h index d7d064c..7378a8f 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2148,7 +2148,7 @@ typedef struct Tcl_Parse { * reflected in regcustom.h. */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 /* * unsigned int isn't 100% accurate as it should be a strict 4-byte value * (perhaps wchar_t). 64-bit systems may have troubles. The size of this -- cgit v0.12 From 7b261d640f9d390b7a237201d9095f8788c97832 Mon Sep 17 00:00:00 2001 From: dgp Date: Sat, 18 Apr 2020 15:11:22 +0000 Subject: regexp engine has to agree about the sizeof(Tcl_UniChar). --- generic/regcustom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/regcustom.h b/generic/regcustom.h index ac33087..57a2d47 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ #define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ -- cgit v0.12 From 966ae57b960c5ab2f69298103c43f92c8e5fc95f Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 18 Apr 2020 19:54:08 +0000 Subject: Clean-up testcases: Constant use of uppercase in hex-values. Use "testbytestring" in stead of "bytestring". Mark tests not working with TCL_UTF_MAX>3 with "compat85" --- tests/utf.test | 278 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 137 insertions(+), 141 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 01e0bb2..189b85d 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -13,54 +13,60 @@ if {[lsearch [namespace children] ::tcltest] == -1} { namespace import -force ::tcltest::* } +testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint testfindfirst [llength [info commands testfindfirst]] +testConstraint testfindlast [llength [info commands testfindlast]] +testConstraint testnumutfchars [llength [info commands testnumutfchars]] +testConstraint teststringobj [llength [info commands teststringobj]] +testConstraint testutfnext [llength [info commands testutfnext]] +testConstraint testutfprev [llength [info commands testutfprev]] catch {unset x} -test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} { - set x \x01 -} [bytestring "\x01"] -test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} { - set x "\x00" -} [bytestring "\xc0\x80"] -test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} { - set x "\xe0" -} [bytestring "\xc3\xa0"] -test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} { - set x "\u4e4e" -} [bytestring "\xe4\xb9\x8e"] -test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} { - format %c 0x110000 -} [bytestring "\xef\xbf\xbd"] -test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} { - format %c -1 -} [bytestring "\xef\xbf\xbd"] - +test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} testbytestring { + expr {"\x01" eq [testbytestring "\x01"]} +} 1 +test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { + expr {"\x00" eq [testbytestring "\xC0\x80"]} +} 1 +test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { + expr {"\xE0" eq [testbytestring "\xC3\xA0"]} +} 1 +test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} testbytestring { + expr {"\u4E4E" eq [testbytestring "\xE4\xB9\x8E"]} +} 1 +test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring { + expr {[format %c 0x110000] eq [testbytestring "\xEF\xBF\xBD"]} +} 1 +test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring { + expr {[format %c -1] eq [testbytestring "\xEF\xBF\xBD"]} +} 1 test utf-2.1 {Tcl_UtfToUniChar: low ascii} { string length "abc" } {3} -test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} { - string length [bytestring "\x82\x83\x84"] +test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring { + string length [testbytestring "\x82\x83\x84"] } {3} -test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} { - string length [bytestring "\xC2"] +test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} testbytestring { + string length [testbytestring "\xC2"] } {1} -test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} { - string length [bytestring "\xC2\xa2"] +test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} testbytestring { + string length [testbytestring "\xC2\xA2"] } {1} -test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} { - string length [bytestring "\xE2"] +test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} testbytestring { + string length [testbytestring "\xE2"] } {1} -test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} { - string length [bytestring "\xE2\xA2"] +test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestring { + string length [testbytestring "\xE2\xA2"] } {2} -test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} { - string length [bytestring "\xE4\xb9\x8e"] +test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring { + string length [testbytestring "\xE4\xB9\x8E"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { +test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF0\x90\x80\x80"] } -result {4} -test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { +test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] } -result {4} test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { @@ -77,57 +83,51 @@ test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestr test utf-3.1 {Tcl_UtfCharComplete} { } {} -testConstraint testnumutfchars [llength [info commands testnumutfchars]] -testConstraint testfindfirst [llength [info commands testfindfirst]] -testConstraint testfindlast [llength [info commands testfindlast]] - test utf-4.1 {Tcl_NumUtfChars: zero length} testnumutfchars { testnumutfchars "" } {0} -test utf-4.2 {Tcl_NumUtfChars: length 1} testnumutfchars { - testnumutfchars [bytestring "\xC2\xA2"] +test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC2\xA2"] } {1} -test utf-4.3 {Tcl_NumUtfChars: long string} testnumutfchars { - testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] +test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] } {7} -test utf-4.4 {Tcl_NumUtfChars: #u0000} testnumutfchars { - testnumutfchars [bytestring "\xC0\x80"] +test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC0\x80"] } {1} test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars { testnumutfchars "" 0 } {0} -test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} testnumutfchars { - testnumutfchars [bytestring "\xC2\xA2"] 1 +test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC2\xA2"] 1 } {1} -test utf-4.7 {Tcl_NumUtfChars: long string, calc len} testnumutfchars { - testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 10 +test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] 10 } {7} -test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} testnumutfchars { - testnumutfchars [bytestring "\xC0\x80"] 1 +test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC0\x80"] 1 } {1} # Bug [2738427]: Tcl_NumUtfChars(...) no overflow check -test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} testnumutfchars { - testnumutfchars [bytestring "\xE2\x82\xAC"] 2 +test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xE2\x82\xAC"] 2 } {2} -test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} testnumutfchars { - testnumutfchars [bytestring "\x00"] 2 +test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\x00"] 2 } {2} -test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} testnumutfchars { - testnumutfchars [bytestring \xf0\x9f\x92\xa9] 3 +test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 3 } {3} -test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} testnumutfchars { - testnumutfchars [bytestring \xf0\x9f\x92\xa9] 4 +test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring compat85} { + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4 } {4} -test utf-5.1 {Tcl_UtfFindFirst} testfindfirst { - testfindfirst [bytestring "abcbc"] 98 +test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { + testfindfirst [testbytestring "abcbc"] 98 } {bcbc} -test utf-5.2 {Tcl_UtfFindLast} testfindlast { - testfindlast [bytestring "abcbc"] 98 +test utf-5.2 {Tcl_UtfFindLast} {testfindlast testbytestring} { + testfindlast [testbytestring "abcbc"] 98 } {bc} -testConstraint testutfnext [llength [info commands testutfnext]] - test utf-6.1 {Tcl_UtfNext} testutfnext { # This takes the pointer one past the terminating NUL. # This is really an invalid call. @@ -334,7 +334,7 @@ test utf-6.67 {Tcl_UtfNext} testutfnext { test utf-6.68 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0G } 1 -test utf-6.69 {Tcl_UtfNext} testutfnext { +test utf-6.69 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0 } 1 test utf-6.70 {Tcl_UtfNext} testutfnext { @@ -349,22 +349,22 @@ test utf-6.71 {Tcl_UtfNext} testutfnext { test utf-6.73 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xF8 } 1 -test utf-6.74 {Tcl_UtfNext} testutfnext { +test utf-6.74 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G } 1 -test utf-6.75 {Tcl_UtfNext} testutfnext { +test utf-6.75 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 -test utf-6.76 {Tcl_UtfNext} testutfnext { +test utf-6.76 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 -test utf-6.77 {Tcl_UtfNext} testutfnext { +test utf-6.77 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 -test utf-6.78 {Tcl_UtfNext} testutfnext { +test utf-6.78 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 -test utf-6.79 {Tcl_UtfNext} testutfnext { +test utf-6.79 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { @@ -388,7 +388,7 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { +test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { testutfnext \xF0\x90\x80\x80 } 1 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { @@ -403,15 +403,13 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {te test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 } 2 -test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { +test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { testutfnext \xF4\x8F\xBF\xBF } 1 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { testutfnext \xF4\x90\x80\x80 } 1 -testConstraint testutfprev [llength [info commands testutfprev]] - test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} } 0 @@ -475,13 +473,13 @@ test utf-7.9.1 {Tcl_UtfPrev} testutfprev { test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 -test utf-7.10 {Tcl_UtfPrev} testutfprev { +test utf-7.10 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0 } 2 -test utf-7.10.1 {Tcl_UtfPrev} testutfprev { +test utf-7.10.1 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 3 } 2 -test utf-7.10.2 {Tcl_UtfPrev} testutfprev { +test utf-7.10.2 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xF8\xA0 3 } 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { @@ -523,13 +521,13 @@ test utf-7.14.1 {Tcl_UtfPrev} testutfprev { test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 -test utf-7.15 {Tcl_UtfPrev} testutfprev { +test utf-7.15 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0 } 3 -test utf-7.15.1 {Tcl_UtfPrev} testutfprev { +test utf-7.15.1 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 4 } 3 -test utf-7.15.2 {Tcl_UtfPrev} testutfprev { +test utf-7.15.2 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xF8 4 } 3 test utf-7.16 {Tcl_UtfPrev} testutfprev { @@ -562,7 +560,7 @@ test utf-7.18.2 {Tcl_UtfPrev} testutfprev { test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 -test utf-7.20 {Tcl_UtfPrev} testutfprev { +test utf-7.20 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 } 4 test utf-7.21 {Tcl_UtfPrev} testutfprev { @@ -622,16 +620,16 @@ test utf-7.36 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 3 } 1 -test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.38 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 } 4 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 4 } 3 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 3 } 2 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { @@ -658,13 +656,13 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { testutfprev \xE8\xA0\x00 2 } 0 -test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF } 4 -test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 4 } 3 -test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 3 } 2 test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { @@ -708,18 +706,18 @@ test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { set x \n } { } -test utf-10.2 {Tcl_UtfBackslash: \u subst} { - set x \ua2 -} [bytestring "\xc2\xa2"] -test utf-10.3 {Tcl_UtfBackslash: longer \u subst} { - set x \u4e21 -} [bytestring "\xe4\xb8\xa1"] -test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} { - set x \u4e2k -} "[bytestring \xd3\xa2]k" -test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} { - set x \u4e216 -} "[bytestring \xe4\xb8\xa1]6" +test utf-10.2 {Tcl_UtfBackslash: \u subst} testbytestring { + expr {"\uA2" eq [testbytestring "\xC2\xA2"]} +} 1 +test utf-10.3 {Tcl_UtfBackslash: longer \u subst} testbytestring { + expr {"\u4E21" eq [testbytestring "\xE4\xB8\xA1"]} +} 1 +test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} testbytestring { + expr {"\u4E2k" eq "[testbytestring \xD3\xA2]k"} +} 1 +test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} testbytestring { + expr {"\u4E216" eq [testbytestring "\xE4\xB8\xA1"]6} +} 1 proc bsCheck {char num} { global errNum test utf-10.$errNum {backslash substitution} { @@ -774,11 +772,11 @@ test utf-11.2 {Tcl_UtfToUpper} { string toupper abc } ABC test utf-11.3 {Tcl_UtfToUpper} { - string toupper \u00e3ab -} \u00c3AB + string toupper \xE3gh +} \xC3GH test utf-11.4 {Tcl_UtfToUpper} { - string toupper \u01e3ab -} \u01e2AB + string toupper \u01E3ab +} \u01E2AB test utf-12.1 {Tcl_UtfToLower} { string tolower {} @@ -787,11 +785,11 @@ test utf-12.2 {Tcl_UtfToLower} { string tolower ABC } abc test utf-12.3 {Tcl_UtfToLower} { - string tolower \u00c3AB -} \u00e3ab + string tolower \xC3GH +} \xE3gh test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01e2AB -} \u01e3ab + string tolower \u01E2AB +} \u01E3ab test utf-13.1 {Tcl_UtfToTitle} { string totitle {} @@ -800,11 +798,11 @@ test utf-13.2 {Tcl_UtfToTitle} { string totitle abc } Abc test utf-13.3 {Tcl_UtfToTitle} { - string totitle \u00e3ab -} \u00c3ab + string totitle \xE3GH +} \xC3gh test utf-13.4 {Tcl_UtfToTitle} { - string totitle \u01f3ab -} \u01f2ab + string totitle \u01F3AB +} \u01F2ab test utf-14.1 {Tcl_UtfNcasecmp} { string compare -nocase a b @@ -823,7 +821,7 @@ test utf-15.1 {Tcl_UniCharToUpper, negative delta} { string toupper aA } AA test utf-15.2 {Tcl_UniCharToUpper, positive delta} { - string toupper \u0178\u00ff + string toupper \u0178\xFF } \u0178\u0178 test utf-15.3 {Tcl_UniCharToUpper, no delta} { string toupper ! @@ -833,24 +831,24 @@ test utf-16.1 {Tcl_UniCharToLower, negative delta} { string tolower aA } aa test utf-16.2 {Tcl_UniCharToLower, positive delta} { - string tolower \u0178\u00ff\uA78D\u01c5 -} \u00ff\u00ff\u0265\u01c6 + string tolower \u0178\xFF\uA78D\u01C5 +} \xFF\xFF\u0265\u01C6 test utf-17.1 {Tcl_UniCharToLower, no delta} { string tolower ! } ! test utf-18.1 {Tcl_UniCharToTitle, add one for title} { - string totitle \u01c4 -} \u01c5 + string totitle \u01C4 +} \u01C5 test utf-18.2 {Tcl_UniCharToTitle, subtract one for title} { - string totitle \u01c6 -} \u01c5 + string totitle \u01C6 +} \u01C5 test utf-18.3 {Tcl_UniCharToTitle, subtract delta for title (positive)} { - string totitle \u017f -} \u0053 + string totitle \u017F +} \x53 test utf-18.4 {Tcl_UniCharToTitle, subtract delta for title (negative)} { - string totitle \u00ff + string totitle \xFF } \u0178 test utf-18.5 {Tcl_UniCharToTitle, no delta} { string totitle ! @@ -865,15 +863,15 @@ test utf-20.1 {TclUniCharNcmp} { test utf-21.1 {TclUniCharIsAlnum} { # this returns 1 with Unicode 7 compliance - string is alnum \u1040\u021f\u0220 + string is alnum \u1040\u021F\u0220 } {1} test utf-21.2 {unicode alnum char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f] + list [regexp {^[[:alnum:]]+$} \u1040\u021F\u0220] [regexp {^\w+$} \u1040\u021F\u0220_\u203F\u2040\u2054\uFE33\uFE34\uFE4D\uFE4E\uFE4F\uFF3F] } {1 1} test utf-21.3 {unicode print char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - regexp {^[[:print:]]+$} \ufbc1 + regexp {^[[:print:]]+$} \uFBC1 } 1 test utf-21.4 {TclUniCharIsGraph} { # [Bug 3464428] @@ -885,69 +883,67 @@ test utf-21.5 {unicode graph char in regc_locale.c} { } {1} test utf-21.6 {TclUniCharIsGraph} { # [Bug 3464428] - string is graph \u00a0 + string is graph \xA0 } {0} test utf-21.7 {unicode graph char in regc_locale.c} { # [Bug 3464428] - regexp {[[:graph:]]} \u0020\u00a0\u2028\u2029 + regexp {[[:graph:]]} \x20\xA0\u2028\u2029 } {0} test utf-21.8 {TclUniCharIsPrint} { # [Bug 3464428] - string is print \u0009 + string is print \x09 } {0} test utf-21.9 {unicode print char in regc_locale.c} { # [Bug 3464428] - regexp {[[:print:]]} \u0009 + regexp {[[:print:]]} \x09 } {0} test utf-21.10 {unicode print char in regc_locale.c} { # [Bug 3464428] - regexp {[[:print:]]} \u0009 + regexp {[[:print:]]} \x09 } {0} test utf-21.11 {TclUniCharIsControl} { # [Bug 3464428] - string is control \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff + string is control \x00\x1F\xAD\u0605\u061C\u180E\u2066\uFEFF } {1} test utf-21.12 {unicode control char in regc_locale.c} { # [Bug 3464428], [Bug a876646efe] - regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff + regexp {^[[:cntrl:]]*$} \x00\x1F\xAD\u0605\u061C\u180E\u2066\uFEFF } {1} test utf-22.1 {TclUniCharIsWordChar} { string wordend "xyz123_bar fg" 0 } 10 test utf-22.2 {TclUniCharIsWordChar} { - string wordend "x\u5080z123_bar\u203c fg" 0 + string wordend "x\u5080z123_bar\u203C fg" 0 } 10 test utf-23.1 {TclUniCharIsAlpha} { # this returns 1 with Unicode 7 compliance - string is alpha \u021f\u0220\u037f\u052f + string is alpha \u021F\u0220\u037F\u052F } {1} test utf-23.2 {unicode alpha char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - regexp {^[[:alpha:]]+$} \u021f\u0220\u037f\u052f + regexp {^[[:alpha:]]+$} \u021F\u0220\u037F\u052F } {1} test utf-24.1 {TclUniCharIsDigit} { # this returns 1 with Unicode 7 compliance - string is digit \u1040\uabf0 + string is digit \u1040\uABF0 } {1} test utf-24.2 {unicode digit char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0] + list [regexp {^[[:digit:]]+$} \u1040\uABF0] [regexp {^\d+$} \u1040\uABF0] } {1 1} test utf-24.3 {TclUniCharIsSpace} { # this returns 1 with Unicode 7 compliance - string is space \u1680\u180e\u202f + string is space \u1680\u180E\u202F } {1} test utf-24.4 {unicode space char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:space:]]+$} \u1680\u180e\u202f] [regexp {^\s+$} \u1680\u180e\u202f] + list [regexp {^[[:space:]]+$} \u1680\u180E\u202F] [regexp {^\s+$} \u1680\u180E\u202F] } {1 1} -testConstraint teststringobj [llength [info commands teststringobj]] - test utf-25.1 {Tcl_UniCharNcasecmp} -constraints teststringobj \ -setup { testobj freeallvars -- cgit v0.12 From 09fc6afbce6c3eab900ae96c6808ef44821cdaf3 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 19 Apr 2020 19:37:05 +0000 Subject: More test-cases. Fix wrong quoting in testcase utf-10.5 --- tests/utf.test | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 189b85d..946aa83 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -42,6 +42,7 @@ test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring { test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring { expr {[format %c -1] eq [testbytestring "\xEF\xBF\xBD"]} } 1 + test utf-2.1 {Tcl_UtfToUniChar: low ascii} { string length "abc" } {3} @@ -90,7 +91,7 @@ test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} { testnumutfchars [testbytestring "\xC2\xA2"] } {1} test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E"] } {7} test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { testnumutfchars [testbytestring "\xC0\x80"] @@ -693,6 +694,18 @@ test utf-8.3 {Tcl_UniCharAtIndex: index > 0} { test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { string index \u4E4E\u25A\xFF\u543 2 } "\uFF" +test utf-8.5 {Tcl_UniCharAtIndex: high surrogate} { + string index \uD842 0 +} "\uD842" +test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { + string index \uDC42 0 +} "\uDC42" +test utf-8.7 {Tcl_UniCharAtIndex: Emoji} compat85 { + string index \uD83D\uDE00 0 +} "\uD83D" +test utf-8.8 {Tcl_UniCharAtIndex: Emoji} compat85 { + string index \uD83D\uDE00 1 +} "\uDE00" test utf-9.1 {Tcl_UtfAtIndex: index = 0} { string range abcd 0 2 @@ -700,6 +713,12 @@ test utf-9.1 {Tcl_UtfAtIndex: index = 0} { test utf-9.2 {Tcl_UtfAtIndex: index > 0} { string range \u4E4E\u25A\xFF\u543klmnop 1 5 } "\u25A\xFF\u543kl" +test utf-9.3 {Tcl_UtfAtIndex: index = 0, Emoji} compat85 { + string range \uD83D\uDE00G 0 0 +} "\uD83D" +test utf-9.4 {Tcl_UtfAtIndex: index > 0, Emoji} compat85 { + string range \uD83D\uDE00G 1 1 +} "\uDE00" test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { @@ -716,7 +735,7 @@ test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} testbytestring { expr {"\u4E2k" eq "[testbytestring \xD3\xA2]k"} } 1 test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} testbytestring { - expr {"\u4E216" eq [testbytestring "\xE4\xB8\xA1"]6} + expr {"\u4E216" eq "[testbytestring \xE4\xB8\xA1]6"} } 1 proc bsCheck {char num} { global errNum @@ -775,8 +794,8 @@ test utf-11.3 {Tcl_UtfToUpper} { string toupper \xE3gh } \xC3GH test utf-11.4 {Tcl_UtfToUpper} { - string toupper \u01E3ab -} \u01E2AB + string toupper \u01E3gh +} \u01E2GH test utf-12.1 {Tcl_UtfToLower} { string tolower {} @@ -788,8 +807,14 @@ test utf-12.3 {Tcl_UtfToLower} { string tolower \xC3GH } \xE3gh test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01E2AB -} \u01E3ab + string tolower \u01E2GH +} \u01E3gh +test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} { + string tolower \u10D0\u1C90 +} \u10D0\u10D0 +test utf-12.6 {Tcl_UtfToUpper low/high surrogate)} { + string tolower \uDC24\uD824 +} \uDC24\uD824 test utf-13.1 {Tcl_UtfToTitle} { string totitle {} @@ -803,6 +828,15 @@ test utf-13.3 {Tcl_UtfToTitle} { test utf-13.4 {Tcl_UtfToTitle} { string totitle \u01F3AB } \u01F2ab +test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { + string totitle \u10D0\u1C90 +} \u10D0\u1C90 +test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { + string totitle \u1C90\u10D0 +} \u1C90\u10D0 +test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} { + string totitle \uDC24\uD824 +} \uDC24\uD824 test utf-14.1 {Tcl_UtfNcasecmp} { string compare -nocase a b @@ -854,9 +888,11 @@ test utf-18.5 {Tcl_UniCharToTitle, no delta} { string totitle ! } ! -test utf-19.1 {TclUniCharLen} { +test utf-19.1 {TclUniCharLen} -body { list [regexp \\d abc456def foo] $foo -} {1 4} +} -cleanup { + unset -nocomplain foo +} -result {1 4} test utf-20.1 {TclUniCharNcmp} { } {} -- cgit v0.12 From 75ae3c494cd6e59303963964efedc91f76675357 Mon Sep 17 00:00:00 2001 From: dgp Date: Sun, 19 Apr 2020 22:02:10 +0000 Subject: typo --- tests/utf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utf.test b/tests/utf.test index 946aa83..07863b9 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -103,7 +103,7 @@ test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestri testnumutfchars [testbytestring "\xC2\xA2"] 1 } {1} test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] 10 + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E"] 10 } {7} test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { testnumutfchars [testbytestring "\xC0\x80"] 1 -- cgit v0.12 From 82d0518f792a138480179ef9b0343a05384f1666 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 05:35:54 +0000 Subject: Reconcile tests to the 8.5 branch history. --- tests/utf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utf.test b/tests/utf.test index 07863b9..1ca3647 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -621,7 +621,7 @@ test utf-7.36 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 3 } 1 -test utf-7.38 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { -- cgit v0.12 From 3f575ea747ca4171b2d20bed73bf7d4857ba75e6 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 06:45:47 +0000 Subject: Backport the encoding fix for source-7.2 in TCL_UTF_MAX=6 build. --- generic/tclEncoding.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 6c16827..5a9d2d5 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2470,20 +2470,33 @@ UtfToUnicodeProc( if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; - } + } src += TclUtfToUniChar(src, &ch); /* * Need to handle this in a way that won't cause misalignment * by casting dst to a Tcl_UniChar. [Bug 1122671] - * XXX: This hard-codes the assumed size of Tcl_UniChar as 2. */ #ifdef WORDS_BIGENDIAN +#if TCL_UTF_MAX > 4 + *dst++ = (ch >> 24); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = (ch & 0xFF); +#else *dst++ = (ch >> 8); *dst++ = (ch & 0xFF); +#endif +#else +#if TCL_UTF_MAX > 4 + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = (ch >> 24); #else *dst++ = (ch & 0xFF); *dst++ = (ch >> 8); #endif +#endif } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; -- cgit v0.12 From b71f5615e78f19b07c19be2d362c7d13d45c47f6 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 07:34:28 +0000 Subject: Backport the fix for encoding-16.1 in a TCL_UTF_MAX=6 build. --- generic/tclEncoding.c | 240 +++++++++++++++++++++++++++----------------------- 1 file changed, 128 insertions(+), 112 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 5a9d2d5..da03055 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -83,7 +83,7 @@ typedef struct TableEncodingData { } TableEncodingData; /* - * The following structures is the clientData for a dynamically-loaded, + * Each of the following structures is the clientData for a dynamically-loaded * escape-driven encoding that is itself comprised of other simpler encodings. * An example is "iso-2022-jp", which uses escape sequences to switch between * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven" @@ -117,8 +117,8 @@ typedef struct EscapeEncodingData { * 0. */ int numSubTables; /* Length of following array. */ EscapeSubTable subTables[1];/* Information about each EscapeSubTable used - * by this encoding type. The actual size will - * be as large as necessary to hold all + * by this encoding type. The actual size is + * as large as necessary to hold all * EscapeSubTables. */ } EscapeEncodingData; @@ -156,7 +156,7 @@ static ProcessGlobalValue encodingFileMap = { * A list of directories making up the "library path". Historically this * search path has served many uses, but the only one remaining is a base for * the encodingSearchPath above. If the application does not explicitly set - * the encodingSearchPath, then it will be initialized by appending /encoding + * the encodingSearchPath, then it is initialized by appending /encoding * to each directory in this "libraryPath". */ @@ -177,7 +177,7 @@ TCL_DECLARE_MUTEX(encodingMutex) /* * The following are used to hold the default and current system encodings. * If NULL is passed to one of the conversion routines, the current setting of - * the system encoding will be used to perform the conversion. + * the system encoding is used to perform the conversion. */ static Tcl_Encoding defaultEncoding; @@ -429,9 +429,8 @@ TclGetLibraryPath(void) * Keeps the per-thread copy of the library path current with changes to * the global copy. * - * NOTE: this routine returns void, so there's no way to report the error - * that searchPath is not a valid list. In that case, this routine will - * silently do nothing. + * Since the result of this routine is void, if searchPath is not a valid + * list this routine silently does nothing. * *---------------------------------------------------------------------- */ @@ -453,17 +452,16 @@ TclSetLibraryPath( * * FillEncodingFileMap -- * - * Called to bring the encoding file map in sync with the current value - * of the encoding search path. + * Called to update the encoding file map with the current value + * of the encoding search path. * - * Scan the directories on the encoding search path, find the *.enc - * files, and store the found pathnames in a map associated with the - * encoding name. + * Finds *.end files in the directories on the encoding search path and + * stores the found pathnames in a map associated with the encoding name. * - * In particular, if $dir is on the encoding search path, and the file - * $dir/foo.enc is found, then store a "foo" -> $dir entry in the map. - * Later, any need for the "foo" encoding will quickly * be able to - * construct the $dir/foo.enc pathname for reading the encoding data. + * If $dir is on the encoding search path and the file $dir/foo.enc is + * found, stores a "foo" -> $dir entry in the map. if the "foo" encoding + * is needed later, the $dir/foo.enc name can be quickly constructed in + * order to read the encoding data. * * Results: * None. @@ -544,19 +542,24 @@ void TclInitEncodingSubsystem(void) { Tcl_EncodingType type; + union { + char c; + short s; + } isLe; if (encodingsInitialized) { return; } + isLe.s = 1; Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); /* - * Create a few initial encodings. Note that the UTF-8 to UTF-8 - * translation is not a no-op, because it will turn a stream of improperly - * formed UTF-8 into a properly formed stream. + * Create a few initial encodings. UTF-8 to UTF-8 translation is not a + * no-op because it turns a stream of improperly formed UTF-8 into a + * properly formed stream. */ type.encodingName = "identity"; @@ -583,7 +586,7 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUnicodeProc; type.freeProc = NULL; type.nullSize = 2; - type.clientData = NULL; + type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); /* @@ -755,11 +758,7 @@ Tcl_SetDefaultEncodingDir( * interp was NULL. * * Side effects: - * The new encoding type is entered into a table visible to all - * interpreters, keyed off the encoding's name. For each call to this - * function, there should eventually be a call to Tcl_FreeEncoding, so - * that the database can be cleaned up when encodings aren't needed - * anymore. + * LoadEncodingFile is called if necessary. * *------------------------------------------------------------------------- */ @@ -797,15 +796,15 @@ Tcl_GetEncoding( * * Tcl_FreeEncoding -- * - * This function is called to release an encoding allocated by - * Tcl_CreateEncoding() or Tcl_GetEncoding(). + * Releases an encoding allocated by Tcl_CreateEncoding() or + * Tcl_GetEncoding(). * * Results: * None. * * Side effects: * The reference count associated with the encoding is decremented and - * the encoding may be deleted if nothing is using it anymore. + * the encoding is deleted if nothing is using it anymore. * *--------------------------------------------------------------------------- */ @@ -824,13 +823,14 @@ Tcl_FreeEncoding( * * FreeEncoding -- * - * This function is called to release an encoding by functions that - * already have the encodingMutex. + * Decrements the reference count of an encoding. The caller must hold + * encodingMutes. * * Results: * None. * * Side effects: + * Releases the resource for an encoding if it is now unused. * The reference count associated with the encoding is decremented and * the encoding may be deleted if nothing is using it anymore. * @@ -850,16 +850,17 @@ FreeEncoding( if (encodingPtr->refCount<=0) { Tcl_Panic("FreeEncoding: refcount problem !!!"); } - encodingPtr->refCount--; - if (encodingPtr->refCount == 0) { + if (encodingPtr->refCount-- <= 1) { if (encodingPtr->freeProc != NULL) { (*encodingPtr->freeProc)(encodingPtr->clientData); } if (encodingPtr->hPtr != NULL) { Tcl_DeleteHashEntry(encodingPtr->hPtr); } - ckfree((char *) encodingPtr->name); - ckfree((char *) encodingPtr); + if (encodingPtr->name) { + ckfree((char *)encodingPtr->name); + } + ckfree((char *)encodingPtr); } } @@ -1020,23 +1021,22 @@ Tcl_SetSystemEncoding( * * Tcl_CreateEncoding -- * - * This function is called to define a new encoding and the functions - * that are used to convert between the specified encoding and Unicode. + * Defines a new encoding, along with the functions that are used to + * convert to and from Unicode. * * Results: * Returns a token that represents the encoding. If an encoding with the * same name already existed, the old encoding token remains valid and - * continues to behave as it used to, and will eventually be garbage - * collected when the last reference to it goes away. Any subsequent - * calls to Tcl_GetEncoding with the specified name will retrieve the - * most recent encoding token. + * continues to behave as it used to, and is eventually garbage collected + * when the last reference to it goes away. Any subsequent calls to + * Tcl_GetEncoding with the specified name retrieve the most recent + * encoding token. * * Side effects: - * The new encoding type is entered into a table visible to all - * interpreters, keyed off the encoding's name. For each call to this - * function, there should eventually be a call to Tcl_FreeEncoding, so - * that the database can be cleaned up when encodings aren't needed - * anymore. + * A new record having the name of the encoding is entered into a table of + * encodings visible to all interpreters. For each call to this function, + * there should eventually be a call to Tcl_FreeEncoding, which cleans + * deletes the record in the table when an encoding is no longer needed. * *--------------------------------------------------------------------------- */ @@ -1258,10 +1258,9 @@ Tcl_ExternalToUtf( * * Tcl_UtfToExternalDString -- * - * Convert a source buffer from UTF-8 into the specified encoding. If any + * Convert a source buffer from UTF-8 to the specified encoding. If any * of the bytes in the source buffer are invalid or cannot be represented - * in the target encoding, a default fallback character will be - * substituted. + * in the target encoding, a default fallback character is substituted. * * Results: * The converted bytes are stored in the DString, which is then NULL @@ -1570,13 +1569,13 @@ OpenEncodingFileChannel( * the data. * * Results: - * The return value is the newly loaded Encoding, or NULL if the file - * didn't exist of was in the incorrect format. If NULL was returned, an - * error message is left in interp's result object, unless interp was - * NULL. + * The return value is the newly loaded Tcl_Encoding or NULL if the file + * didn't exist or could not be processed. If NULL is returned and interp + * is not NULL, an error message is left in interp's result object. * * Side effects: - * File read from disk. + * A corresponding encoding file might be read from persistent storage, in + * which case LoadTableEncoding is called. * *--------------------------------------------------------------------------- */ @@ -1584,8 +1583,8 @@ OpenEncodingFileChannel( static Tcl_Encoding LoadEncodingFile( Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ - const char *name) /* The name of the encoding file on disk and - * also the name for new encoding. */ + const char *name) /* The name of both the encoding file + * and the new encoding. */ { Tcl_Channel chan = NULL; Tcl_Encoding encoding = NULL; @@ -1637,27 +1636,27 @@ LoadEncodingFile( * * LoadTableEncoding -- * - * Helper function for LoadEncodingTable(). Loads a table to that - * converts between Unicode and some other encoding and creates an - * encoding (using a TableEncoding structure) from that information. + * Helper function for LoadEncodingFile(). Creates a Tcl_EncodingType + * structure along with its corresponding TableEncodingData structure, and + * passes it to Tcl_Createncoding. * - * File contains binary data, but begins with a marker to indicate - * byte-ordering, so that same binary file can be read on either endian - * platforms. + * The file contains binary data but begins with a marker to indicate + * byte-ordering so a single binary file can be read on big or + * little-endian systems. * * Results: - * The return value is the new encoding, or NULL if the encoding could - * not be created (because the file contained invalid data). + * Returns the new Tcl_Encoding, or NULL if it could could + * not be created because the file contained invalid data. * * Side effects: - * None. + * See Tcl_CreateEncoding(). * *------------------------------------------------------------------------- */ static Tcl_Encoding LoadTableEncoding( - const char *name, /* Name for new encoding. */ + const char *name, /* Name of the new encoding. */ int type, /* Type of encoding (ENCODING_?????). */ Tcl_Channel chan) /* File containing new encoding. */ { @@ -1769,10 +1768,10 @@ LoadTableEncoding( } /* - * Invert toUnicode array to produce the fromUnicode array. Performs a + * Invert the toUnicode array to produce the fromUnicode array. Performs a * single malloc to get the memory for the array and all the pages needed - * by the array. While reading in the toUnicode array, we remembered what - * pages that would be needed for the fromUnicode array. + * by the array. While reading in the toUnicode array remember what + * pages are needed for the fromUnicode array. */ if (symbol) { @@ -1814,8 +1813,8 @@ LoadTableEncoding( if (type == ENCODING_MULTIBYTE) { /* * If multibyte encodings don't have a backslash character, define - * one. Otherwise, on Windows, native file names won't work because - * the backslash in the file name will map to the unknown character + * one. Otherwise, on Windows, native file names don't work because + * the backslash in the file name maps to the unknown character * (question mark) when converting from UTF-8 to external encoding. */ @@ -1829,13 +1828,13 @@ LoadTableEncoding( unsigned short *page; /* - * Make a special symbol encoding that not only maps the symbol - * characters from their Unicode code points down into page 0, but - * also ensure that the characters on page 0 map to themselves. This - * is so that a symbol font can be used to display a simple string - * like "abcd" and have alpha, beta, chi, delta show up, rather than - * have "unknown" chars show up because strictly speaking the symbol - * font doesn't have glyphs for those low ascii chars. + * Make a special symbol encoding that maps each symbol character from + * its Unicode code point down into page 0, and also ensure that each + * characters on page 0 maps to itself so that a symbol font can be + * used to display a simple string like "abcd" and have alpha, beta, + * chi, delta show up, rather than have "unknown" chars show up because + * strictly speaking the symbol font doesn't have glyphs for those low + * ASCII chars. */ page = dataPtr->fromUnicode[0]; @@ -1939,7 +1938,7 @@ LoadTableEncoding( static Tcl_Encoding LoadEscapeEncoding( - const char *name, /* Name for new encoding. */ + const char *name, /* Name of the new encoding. */ Tcl_Channel chan) /* File containing new encoding. */ { int i; @@ -2318,7 +2317,7 @@ UtfToUtfProc( * * UnicodeToUtfProc -- * - * Convert from Unicode to UTF-8. + * Convert from UTF-16 to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2331,7 +2330,7 @@ UtfToUtfProc( static int UnicodeToUtfProc( - ClientData clientData, /* Not used. */ + ClientData clientData, /* != NULL means LE, == NUL means BE */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2359,13 +2358,19 @@ UnicodeToUtfProc( const char *srcStart, *srcEnd; char *dstEnd, *dstStart; int result, numChars; - Tcl_UniChar ch; + unsigned short ch; result = TCL_OK; - if ((srcLen % sizeof(Tcl_UniChar)) != 0) { + + /* check alignment with utf-16 (2 == sizeof(UTF-16)) */ + if ((srcLen % 2) != 0) { + result = TCL_CONVERT_MULTIBYTE; + srcLen--; + } + /* If last code point is a high surrogate, we cannot handle that yet */ + if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; - srcLen /= sizeof(Tcl_UniChar); - srcLen *= sizeof(Tcl_UniChar); + srcLen-= 2; } srcStart = src; @@ -2379,17 +2384,21 @@ UnicodeToUtfProc( result = TCL_CONVERT_NOSPACE; break; } + if (clientData) { + ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); + } else { + ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); + } /* - * Special case for 1-byte utf chars for speed. Make sure we - * work with Tcl_UniChar-size data. + * Special case for 1-byte utf chars for speed. Make sure we work with + * unsigned short-size data. */ - ch = *(Tcl_UniChar *)src; if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(Tcl_UniChar); + src += sizeof(unsigned short); } *srcReadPtr = src - srcStart; @@ -2403,7 +2412,7 @@ UnicodeToUtfProc( * * UtfToUnicodeProc -- * - * Convert from UTF-8 to Unicode. + * Convert from UTF-8 to UTF-16. * * Results: * Returns TCL_OK if conversion was successful. @@ -2416,8 +2425,7 @@ UnicodeToUtfProc( static int UtfToUnicodeProc( - ClientData clientData, /* TableEncodingData that specifies - * encoding. */ + ClientData clientData, /* != NULL means LE, == NUL means BE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2444,7 +2452,7 @@ UtfToUnicodeProc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar ch; + Tcl_UniChar ch = 0; srcStart = src; srcEnd = src + srcLen; @@ -2476,27 +2484,37 @@ UtfToUnicodeProc( * Need to handle this in a way that won't cause misalignment * by casting dst to a Tcl_UniChar. [Bug 1122671] */ -#ifdef WORDS_BIGENDIAN + if (clientData) { #if TCL_UTF_MAX > 4 - *dst++ = (ch >> 24); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = (ch & 0xFF); + if (ch <= 0xFFFF) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); + } else { + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (ch & 0xFF); + *dst++ = ((ch & 0x3) >> 8) | 0xDC; + } #else - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); #endif -#else + } else { #if TCL_UTF_MAX > 4 - *dst++ = (ch & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = (ch >> 24); + if (ch <= 0xFFFF) { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); + } else { + *dst++ = ((ch & 0x3) >> 8) | 0xDC; + *dst++ = (ch & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + } #else - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); -#endif + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); #endif + } } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; @@ -2899,7 +2917,6 @@ Iso88591FromUtfProc( result = TCL_CONVERT_UNKNOWN; break; } - /* * Plunge on, using '?' as a fallback character. */ @@ -3387,14 +3404,13 @@ EscapeFromUtfProc( * * EscapeFreeProc -- * - * This function is invoked when an EscapeEncodingData encoding is - * deleted. It deletes the memory used by the encoding. + * Frees resources used by the encoding. * * Results: * None. * * Side effects: - * Memory freed. + * Memory is freed. * *--------------------------------------------------------------------------- */ -- cgit v0.12 From 286def7af7659d72d4a752b056948c41e88121b8 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 20 Apr 2020 07:50:22 +0000 Subject: Move the needed apt package in .travis.yml to the top, so they can be shared between the images. --- .travis.yml | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/.travis.yml b/.travis.yml index e10ca7c..5672c0b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,15 @@ sudo: false language: c +addons: + apt: + packages: + - binutils-mingw-w64-i686 + - binutils-mingw-w64-x86-64 + - gcc-mingw-w64 + - gcc-mingw-w64-base + - gcc-mingw-w64-i686 + - gcc-mingw-w64-x86-64 + - gcc-multilib matrix: include: # Testing on Linux with various compilers @@ -146,13 +156,6 @@ matrix: os: linux dist: bionic compiler: x86_64-w64-mingw32-gcc - addons: - apt: - packages: - - gcc-mingw-w64-base - - binutils-mingw-w64-x86-64 - - gcc-mingw-w64-x86-64 - - gcc-mingw-w64 env: - BUILD_DIR=win - CFGOPT="--host=x86_64-w64-mingw32 --enable-64bit --enable-threads" @@ -167,14 +170,6 @@ matrix: os: linux dist: bionic compiler: i686-w64-mingw32-gcc - addons: - apt: - packages: - - gcc-mingw-w64-base - - binutils-mingw-w64-i686 - - gcc-mingw-w64-i686 - - gcc-mingw-w64 - - gcc-multilib env: - BUILD_DIR=win - CFGOPT="--host=i686-w64-mingw32 --enable-threads" -- cgit v0.12 From f2b8a177a5bf8632da8429fc34ff5f16dbeef73a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 20 Apr 2020 11:30:31 +0000 Subject: Code cleanup (more typecasts), making it more compatible with -Wc++-compat. Less use of /* ARGUSED */. --- generic/tclEncoding.c | 116 ++++++++++++++++++++++---------------------------- generic/tclUtil.c | 61 +++++++++++++------------- generic/tclZlib.c | 64 ++++++++++++++-------------- win/tclWinPipe.c | 1 - 4 files changed, 114 insertions(+), 128 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index b879ec8..e012570 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -18,7 +18,7 @@ typedef size_t (LengthProc)(const char *src); * convert between various character sets and UTF-8. */ -typedef struct Encoding { +typedef struct { char *name; /* Name of encoding. Malloced because (1) hash * table entry that owns this encoding may be * freed prior to this encoding being freed, @@ -57,7 +57,7 @@ typedef struct Encoding { * encoding. */ -typedef struct TableEncodingData { +typedef struct { int fallback; /* Character (in this encoding) to substitute * when this encoding cannot represent a UTF-8 * character. */ @@ -91,7 +91,7 @@ typedef struct TableEncodingData { * for switching character sets. */ -typedef struct EscapeSubTable { +typedef struct { unsigned sequenceLen; /* Length of following string. */ char sequence[16]; /* Escape code that marks this encoding. */ char name[32]; /* Name for encoding. */ @@ -100,7 +100,7 @@ typedef struct EscapeSubTable { * yet. */ } EscapeSubTable; -typedef struct EscapeEncodingData { +typedef struct { int fallback; /* Character (in this encoding) to substitute * when this encoding cannot represent a UTF-8 * character. */ @@ -195,35 +195,25 @@ static unsigned short emptyPage[256]; * Functions used only in this module. */ -static int BinaryProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); -static void DupEncodingIntRep(Tcl_Obj *srcPtr, Tcl_Obj *dupPtr); -static void EscapeFreeProc(ClientData clientData); -static int EscapeFromUtfProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); -static int EscapeToUtfProc(ClientData clientData, - const char *src, int srcLen, int flags, - Tcl_EncodingState *statePtr, char *dst, int dstLen, - int *srcReadPtr, int *dstWrotePtr, - int *dstCharsPtr); -static void FillEncodingFileMap(void); -static void FreeEncoding(Tcl_Encoding encoding); -static void FreeEncodingIntRep(Tcl_Obj *objPtr); -static Encoding * GetTableEncoding(EscapeEncodingData *dataPtr, - int state); -static Tcl_Encoding LoadEncodingFile(Tcl_Interp *interp, const char *name); -static Tcl_Encoding LoadTableEncoding(const char *name, int type, - Tcl_Channel chan); -static Tcl_Encoding LoadEscapeEncoding(const char *name, Tcl_Channel chan); -static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp, - const char *name); -static void TableFreeProc(ClientData clientData); +static Tcl_EncodingConvertProc BinaryProc; +static Tcl_DupInternalRepProc DupEncodingIntRep; +static Tcl_EncodingFreeProc EscapeFreeProc; +static Tcl_EncodingConvertProc EscapeFromUtfProc; +static Tcl_EncodingConvertProc EscapeToUtfProc; +static void FillEncodingFileMap(void); +static void FreeEncoding(Tcl_Encoding encoding); +static Tcl_FreeInternalRepProc FreeEncodingIntRep; +static Encoding * GetTableEncoding(EscapeEncodingData *dataPtr, + int state); +static Tcl_Encoding LoadEncodingFile(Tcl_Interp *interp, + const char *name); +static Tcl_Encoding LoadTableEncoding(const char *name, int type, + Tcl_Channel chan); +static Tcl_Encoding LoadEscapeEncoding(const char *name, + Tcl_Channel chan); +static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp, + const char *name); +static Tcl_EncodingFreeProc TableFreeProc; static int TableFromUtfProc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, @@ -335,7 +325,7 @@ static void FreeEncodingIntRep( Tcl_Obj *objPtr) { - Tcl_FreeEncoding(objPtr->internalRep.twoPtrValue.ptr1); + Tcl_FreeEncoding((Tcl_Encoding)objPtr->internalRep.twoPtrValue.ptr1); objPtr->typePtr = NULL; } @@ -596,14 +586,14 @@ TclInitEncodingSubsystem(void) * code to duplicate the structure of a table encoding here. */ - dataPtr = ckalloc(sizeof(TableEncodingData)); + dataPtr = (TableEncodingData *)ckalloc(sizeof(TableEncodingData)); memset(dataPtr, 0, sizeof(TableEncodingData)); dataPtr->fallback = '?'; size = 256*(sizeof(unsigned short *) + sizeof(unsigned short)); - dataPtr->toUnicode = ckalloc(size); + dataPtr->toUnicode = (unsigned short **)ckalloc(size); memset(dataPtr->toUnicode, 0, size); - dataPtr->fromUnicode = ckalloc(size); + dataPtr->fromUnicode = (unsigned short **)ckalloc(size); memset(dataPtr->fromUnicode, 0, size); dataPtr->toUnicode[0] = (unsigned short *) (dataPtr->toUnicode + 256); @@ -669,7 +659,7 @@ TclFinalizeEncodingSubsystem(void) * cleaned up. */ - FreeEncoding(Tcl_GetHashValue(hPtr)); + FreeEncoding((Tcl_Encoding)Tcl_GetHashValue(hPtr)); hPtr = Tcl_FirstHashEntry(&encodingTable, &search); } @@ -778,7 +768,7 @@ Tcl_GetEncoding( hPtr = Tcl_FindHashEntry(&encodingTable, name); if (hPtr != NULL) { - encodingPtr = Tcl_GetHashValue(hPtr); + encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr); encodingPtr->refCount++; Tcl_MutexUnlock(&encodingMutex); return (Tcl_Encoding) encodingPtr; @@ -926,7 +916,7 @@ Tcl_GetEncodingNames( Tcl_MutexLock(&encodingMutex); for (hPtr = Tcl_FirstHashEntry(&encodingTable, &search); hPtr != NULL; hPtr = Tcl_NextHashEntry(&search)) { - Encoding *encodingPtr = Tcl_GetHashValue(hPtr); + Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr); Tcl_CreateHashEntry(&table, Tcl_NewStringObj(encodingPtr->name, -1), &dummy); @@ -1056,13 +1046,13 @@ Tcl_CreateEncoding( * reference goes away. */ - encodingPtr = Tcl_GetHashValue(hPtr); + encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr); encodingPtr->hPtr = NULL; } - name = ckalloc(strlen(typePtr->encodingName) + 1); + name = (char *)ckalloc(strlen(typePtr->encodingName) + 1); - encodingPtr = ckalloc(sizeof(Encoding)); + encodingPtr = (Encoding *)ckalloc(sizeof(Encoding)); encodingPtr->name = strcpy(name, typePtr->encodingName); encodingPtr->toUtfProc = typePtr->toUtfProc; encodingPtr->fromUtfProc = typePtr->fromUtfProc; @@ -1740,7 +1730,7 @@ LoadTableEncoding( #undef PAGESIZE #define PAGESIZE (256 * sizeof(unsigned short)) - dataPtr = ckalloc(sizeof(TableEncodingData)); + dataPtr = (TableEncodingData *)ckalloc(sizeof(TableEncodingData)); memset(dataPtr, 0, sizeof(TableEncodingData)); dataPtr->fallback = fallback; @@ -1752,7 +1742,7 @@ LoadTableEncoding( */ size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE; - dataPtr->toUnicode = ckalloc(size); + dataPtr->toUnicode = (unsigned short **)ckalloc(size); memset(dataPtr->toUnicode, 0, size); pageMemPtr = (unsigned short *) (dataPtr->toUnicode + 256); @@ -1813,7 +1803,7 @@ LoadTableEncoding( } } size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE; - dataPtr->fromUnicode = ckalloc(size); + dataPtr->fromUnicode = (unsigned short **)ckalloc(size); memset(dataPtr->fromUnicode, 0, size); pageMemPtr = (unsigned short *) (dataPtr->fromUnicode + 256); @@ -2051,7 +2041,7 @@ LoadEscapeEncoding( size = sizeof(EscapeEncodingData) - sizeof(EscapeSubTable) + Tcl_DStringLength(&escapeData); - dataPtr = ckalloc(size); + dataPtr = (EscapeEncodingData *)ckalloc(size); dataPtr->initLen = strlen(init); memcpy(dataPtr->init, init, dataPtr->initLen + 1); dataPtr->finalLen = strlen(final); @@ -2661,7 +2651,7 @@ TableToUtfProc( Tcl_UniChar ch = 0; const unsigned short *const *toUnicode; const unsigned short *pageZero; - TableEncodingData *dataPtr = clientData; + TableEncodingData *dataPtr = (TableEncodingData *)clientData; if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; @@ -2772,7 +2762,7 @@ TableFromUtfProc( const char *dstStart, *dstEnd, *prefixBytes; Tcl_UniChar ch = 0; int result, len, word, numChars; - TableEncodingData *dataPtr = clientData; + TableEncodingData *dataPtr = (TableEncodingData *)clientData; const unsigned short *const *fromUnicode; result = TCL_OK; @@ -2979,11 +2969,9 @@ Iso88591FromUtfProc( { const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; - int result, numChars; + int result = TCL_OK, numChars; Tcl_UniChar ch = 0; - result = TCL_OK; - srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; @@ -3067,7 +3055,7 @@ TableFreeProc( ClientData clientData) /* TableEncodingData that specifies * encoding. */ { - TableEncodingData *dataPtr = clientData; + TableEncodingData *dataPtr = (TableEncodingData *)clientData; /* * Make sure we aren't freeing twice on shutdown. [Bug 219314] @@ -3125,7 +3113,7 @@ EscapeToUtfProc( * correspond to the bytes stored in the * output buffer. */ { - EscapeEncodingData *dataPtr = clientData; + EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd; const unsigned short *const *tableToUnicode; const Encoding *encodingPtr; @@ -3136,8 +3124,8 @@ EscapeToUtfProc( charLimit = *dstCharsPtr; } result = TCL_OK; - tablePrefixBytes = NULL; /* lint. */ - tableToUnicode = NULL; /* lint. */ + tablePrefixBytes = NULL; + tableToUnicode = NULL; prefixBytes = dataPtr->prefixBytes; encodingPtr = NULL; @@ -3261,7 +3249,7 @@ EscapeToUtfProc( TableEncodingData *tableDataPtr; encodingPtr = GetTableEncoding(dataPtr, state); - tableDataPtr = encodingPtr->clientData; + tableDataPtr = (TableEncodingData *)encodingPtr->clientData; tablePrefixBytes = tableDataPtr->prefixBytes; tableToUnicode = (const unsigned short *const*) tableDataPtr->toUnicode; @@ -3339,7 +3327,7 @@ EscapeFromUtfProc( * correspond to the bytes stored in the * output buffer. */ { - EscapeEncodingData *dataPtr = clientData; + EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; const Encoding *encodingPtr; const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; @@ -3380,7 +3368,7 @@ EscapeFromUtfProc( } encodingPtr = GetTableEncoding(dataPtr, state); - tableDataPtr = encodingPtr->clientData; + tableDataPtr = (const TableEncodingData *)encodingPtr->clientData; tablePrefixBytes = tableDataPtr->prefixBytes; tableFromUnicode = (const unsigned short *const *) tableDataPtr->fromUnicode; @@ -3408,7 +3396,7 @@ EscapeFromUtfProc( oldState = state; for (state = 0; state < dataPtr->numSubTables; state++) { encodingPtr = GetTableEncoding(dataPtr, state); - tableDataPtr = encodingPtr->clientData; + tableDataPtr = (const TableEncodingData *)encodingPtr->clientData; word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xFF]; if (word != 0) { break; @@ -3422,7 +3410,7 @@ EscapeFromUtfProc( break; } encodingPtr = GetTableEncoding(dataPtr, state); - tableDataPtr = encodingPtr->clientData; + tableDataPtr = (const TableEncodingData *)encodingPtr->clientData; word = tableDataPtr->fallback; } @@ -3528,7 +3516,7 @@ EscapeFreeProc( ClientData clientData) /* EscapeEncodingData that specifies * encoding. */ { - EscapeEncodingData *dataPtr = clientData; + EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; EscapeSubTable *subTablePtr; int i; @@ -3693,8 +3681,8 @@ InitializeEncodingSearchPath( bytes = Tcl_GetStringFromObj(searchPathObj, &numBytes); *lengthPtr = numBytes; - *valuePtr = ckalloc(numBytes + 1); - memcpy(*valuePtr, bytes, (size_t) numBytes + 1); + *valuePtr = (char *)ckalloc(numBytes + 1); + memcpy(*valuePtr, bytes, numBytes + 1); Tcl_DecrRefCount(searchPathObj); } diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 4139804..13b0d55 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -577,7 +577,7 @@ FindElement( const char *limit; /* Points just after list/dict's last byte. */ int openBraces = 0; /* Brace nesting level during parse. */ int inQuotes = 0; - int size = 0; /* lint. */ + int size = 0; int numChars; int literal = 1; const char *p2; @@ -869,7 +869,7 @@ Tcl_SplitList( size = TclMaxListLength(list, -1, &end) + 1; length = end - list; - argv = ckalloc((size * sizeof(char *)) + length + 1); + argv = (const char **)ckalloc((size * sizeof(char *)) + length + 1); for (i = 0, p = ((char *) argv) + size*sizeof(char *); *list != 0; i++) { @@ -1577,7 +1577,7 @@ Tcl_Merge( */ if (argc == 0) { - result = ckalloc(1); + result = (char *)ckalloc(1); result[0] = '\0'; return result; } @@ -1589,7 +1589,7 @@ Tcl_Merge( if (argc <= LOCAL_SIZE) { flagPtr = localFlags; } else { - flagPtr = ckalloc(argc); + flagPtr = (char *)ckalloc(argc); } for (i = 0; i < argc; i++) { flagPtr[i] = ( i ? TCL_DONT_QUOTE_HASH : 0 ); @@ -1607,7 +1607,7 @@ Tcl_Merge( * Pass two: copy into the result area. */ - result = ckalloc(bytesNeeded); + result = (char *)ckalloc(bytesNeeded); dst = result; for (i = 0; i < argc; i++) { flagPtr[i] |= ( i ? TCL_DONT_QUOTE_HASH : 0 ); @@ -1931,7 +1931,7 @@ Tcl_Concat( * All element bytes + (argc - 1) spaces + 1 terminating NULL. */ - result = ckalloc(bytesNeeded + argc); + result = (char *)ckalloc(bytesNeeded + argc); for (p = result, i = 0; i < argc; i++) { int triml, trimr, elemLength; @@ -2666,7 +2666,7 @@ Tcl_DStringAppend( if (newSize >= dsPtr->spaceAvl) { dsPtr->spaceAvl = newSize * 2; if (dsPtr->string == dsPtr->staticSpace) { - char *newString = ckalloc(dsPtr->spaceAvl); + char *newString = (char *)ckalloc(dsPtr->spaceAvl); memcpy(newString, dsPtr->string, dsPtr->length); dsPtr->string = newString; @@ -2679,7 +2679,7 @@ Tcl_DStringAppend( offset = bytes - dsPtr->string; } - dsPtr->string = ckrealloc(dsPtr->string, dsPtr->spaceAvl); + dsPtr->string = (char *)ckrealloc(dsPtr->string, dsPtr->spaceAvl); if (offset >= 0) { bytes = dsPtr->string + offset; @@ -2797,7 +2797,7 @@ Tcl_DStringAppendElement( if (newSize >= dsPtr->spaceAvl) { dsPtr->spaceAvl = newSize * 2; if (dsPtr->string == dsPtr->staticSpace) { - char *newString = ckalloc(dsPtr->spaceAvl); + char *newString = (char *)ckalloc(dsPtr->spaceAvl); memcpy(newString, dsPtr->string, dsPtr->length); dsPtr->string = newString; @@ -2810,7 +2810,7 @@ Tcl_DStringAppendElement( offset = element - dsPtr->string; } - dsPtr->string = ckrealloc(dsPtr->string, dsPtr->spaceAvl); + dsPtr->string = (char *)ckrealloc(dsPtr->string, dsPtr->spaceAvl); if (offset >= 0) { element = dsPtr->string + offset; @@ -2884,12 +2884,12 @@ Tcl_DStringSetLength( dsPtr->spaceAvl = length + 1; } if (dsPtr->string == dsPtr->staticSpace) { - char *newString = ckalloc(dsPtr->spaceAvl); + char *newString = (char *)ckalloc(dsPtr->spaceAvl); memcpy(newString, dsPtr->string, dsPtr->length); dsPtr->string = newString; } else { - dsPtr->string = ckrealloc(dsPtr->string, dsPtr->spaceAvl); + dsPtr->string = (char *)ckrealloc(dsPtr->string, dsPtr->spaceAvl); } } dsPtr->length = length; @@ -3034,7 +3034,7 @@ Tcl_DStringGetResult( dsPtr->string = iPtr->result; dsPtr->spaceAvl = dsPtr->length+1; } else { - dsPtr->string = ckalloc(dsPtr->length+1); + dsPtr->string = (char *)ckalloc(dsPtr->length+1); memcpy(dsPtr->string, iPtr->result, dsPtr->length+1); iPtr->freeProc(iPtr->result); } @@ -3045,7 +3045,7 @@ Tcl_DStringGetResult( dsPtr->string = dsPtr->staticSpace; dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE; } else { - dsPtr->string = ckalloc(dsPtr->length+1); + dsPtr->string = (char *)ckalloc(dsPtr->length+1); dsPtr->spaceAvl = dsPtr->length + 1; } memcpy(dsPtr->string, iPtr->result, dsPtr->length+1); @@ -3203,7 +3203,7 @@ Tcl_PrintDouble( int signum; char *digits; char *end; - int *precisionPtr = Tcl_GetThreadData(&precisionKey, sizeof(int)); + int *precisionPtr = (int *)Tcl_GetThreadData(&precisionKey, sizeof(int)); /* * Handle NaN. @@ -3365,7 +3365,6 @@ Tcl_PrintDouble( *---------------------------------------------------------------------- */ - /* ARGSUSED */ char * TclPrecTraceProc( ClientData clientData, /* Not used. */ @@ -3376,7 +3375,7 @@ TclPrecTraceProc( { Tcl_Obj *value; int prec; - int *precisionPtr = Tcl_GetThreadData(&precisionKey, (int) sizeof(int)); + int *precisionPtr = (int *)Tcl_GetThreadData(&precisionKey, sizeof(int)); /* * If the variable is unset, then recreate the trace. @@ -3724,8 +3723,8 @@ UpdateStringOfEndOffset( buffer[len++] = '-'; len += TclFormatInt(buffer+len, -(objPtr->internalRep.longValue)); } - objPtr->bytes = ckalloc((unsigned) len+1); - memcpy(objPtr->bytes, buffer, (unsigned) len+1); + objPtr->bytes = (char *)ckalloc(len+1); + memcpy(objPtr->bytes, buffer, len+1); objPtr->length = len; } @@ -3734,14 +3733,14 @@ UpdateStringOfEndOffset( * * GetEndOffsetFromObj -- * - * Look for a string of the form "end[+-]offset" and convert it to an - * internal representation holding the offset. + * Look for a string of the form "end[+-]offset" and convert it to an + * internal representation holding the offset. * * Results: - * Tcl return code. + * Tcl return code. * * Side effects: - * May store a Tcl_ObjType. + * May store a Tcl_ObjType. * *---------------------------------------------------------------------- */ @@ -4086,7 +4085,7 @@ ClearHash( for (hPtr = Tcl_FirstHashEntry(tablePtr, &search); hPtr != NULL; hPtr = Tcl_NextHashEntry(&search)) { - Tcl_Obj *objPtr = Tcl_GetHashValue(hPtr); + Tcl_Obj *objPtr = (Tcl_Obj *)Tcl_GetHashValue(hPtr); Tcl_DecrRefCount(objPtr); Tcl_DeleteHashEntry(hPtr); @@ -4116,10 +4115,10 @@ GetThreadHash( Tcl_ThreadDataKey *keyPtr) { Tcl_HashTable **tablePtrPtr = - Tcl_GetThreadData(keyPtr, sizeof(Tcl_HashTable *)); + (Tcl_HashTable **)Tcl_GetThreadData(keyPtr, sizeof(Tcl_HashTable *)); if (NULL == *tablePtrPtr) { - *tablePtrPtr = ckalloc(sizeof(Tcl_HashTable)); + *tablePtrPtr = (Tcl_HashTable *)ckalloc(sizeof(Tcl_HashTable)); Tcl_CreateThreadExitHandler(FreeThreadHash, *tablePtrPtr); Tcl_InitHashTable(*tablePtrPtr, TCL_ONE_WORD_KEYS); } @@ -4144,7 +4143,7 @@ static void FreeThreadHash( ClientData clientData) { - Tcl_HashTable *tablePtr = clientData; + Tcl_HashTable *tablePtr = (Tcl_HashTable *)clientData; ClearHash(tablePtr); Tcl_DeleteHashTable(tablePtr); @@ -4166,7 +4165,7 @@ static void FreeProcessGlobalValue( ClientData clientData) { - ProcessGlobalValue *pgvPtr = clientData; + ProcessGlobalValue *pgvPtr = (ProcessGlobalValue *)clientData; pgvPtr->epoch++; pgvPtr->numBytes = 0; @@ -4214,7 +4213,7 @@ TclSetProcessGlobalValue( Tcl_CreateExitHandler(FreeProcessGlobalValue, pgvPtr); } bytes = Tcl_GetStringFromObj(newValue, &pgvPtr->numBytes); - pgvPtr->value = ckalloc(pgvPtr->numBytes + 1); + pgvPtr->value = (char *)ckalloc(pgvPtr->numBytes + 1); memcpy(pgvPtr->value, bytes, pgvPtr->numBytes + 1); if (pgvPtr->encoding) { Tcl_FreeEncoding(pgvPtr->encoding); @@ -4278,7 +4277,7 @@ TclGetProcessGlobalValue( Tcl_DStringLength(&native), &newValue); Tcl_DStringFree(&native); ckfree(pgvPtr->value); - pgvPtr->value = ckalloc(Tcl_DStringLength(&newValue) + 1); + pgvPtr->value = (char *)ckalloc(Tcl_DStringLength(&newValue) + 1); memcpy(pgvPtr->value, Tcl_DStringValue(&newValue), Tcl_DStringLength(&newValue) + 1); Tcl_DStringFree(&newValue); @@ -4328,7 +4327,7 @@ TclGetProcessGlobalValue( Tcl_SetHashValue(hPtr, value); Tcl_IncrRefCount(value); } - return Tcl_GetHashValue(hPtr); + return (Tcl_Obj *)Tcl_GetHashValue(hPtr); } /* diff --git a/generic/tclZlib.c b/generic/tclZlib.c index ed29ff9..86fda86 100644 --- a/generic/tclZlib.c +++ b/generic/tclZlib.c @@ -593,7 +593,7 @@ SetInflateDictionary( int length; unsigned char *bytes = Tcl_GetByteArrayFromObj(compDictObj, &length); - return inflateSetDictionary(strm, bytes, (unsigned) length); + return inflateSetDictionary(strm, bytes, length); } return Z_OK; } @@ -607,7 +607,7 @@ SetDeflateDictionary( int length; unsigned char *bytes = Tcl_GetByteArrayFromObj(compDictObj, &length); - return deflateSetDictionary(strm, bytes, (unsigned) length); + return deflateSetDictionary(strm, bytes, length); } return Z_OK; } @@ -623,7 +623,7 @@ Deflate( int e; strm->next_out = (Bytef *) bufferPtr; - strm->avail_out = (unsigned) bufferSize; + strm->avail_out = bufferSize; e = deflate(strm, flush); if (writtenPtr != NULL) { *writtenPtr = bufferSize - strm->avail_out; @@ -697,7 +697,7 @@ Tcl_ZlibStreamInit( case TCL_ZLIB_FORMAT_GZIP: wbits = WBITS_GZIP; if (dictObj) { - gzHeaderPtr = ckalloc(sizeof(GzipHeader)); + gzHeaderPtr = (GzipHeader *)ckalloc(sizeof(GzipHeader)); memset(gzHeaderPtr, 0, sizeof(GzipHeader)); if (GenerateHeader(interp, dictObj, gzHeaderPtr, NULL) != TCL_OK) { @@ -731,7 +731,7 @@ Tcl_ZlibStreamInit( break; case TCL_ZLIB_FORMAT_GZIP: wbits = WBITS_GZIP; - gzHeaderPtr = ckalloc(sizeof(GzipHeader)); + gzHeaderPtr = (GzipHeader *)ckalloc(sizeof(GzipHeader)); memset(gzHeaderPtr, 0, sizeof(GzipHeader)); gzHeaderPtr->header.name = (Bytef *) gzHeaderPtr->nativeFilenameBuf; @@ -757,7 +757,7 @@ Tcl_ZlibStreamInit( " TCL_ZLIB_STREAM_INFLATE"); } - zshPtr = ckalloc(sizeof(ZlibStreamHandle)); + zshPtr = (ZlibStreamHandle *)ckalloc(sizeof(ZlibStreamHandle)); zshPtr->interp = interp; zshPtr->mode = mode; zshPtr->format = format; @@ -884,7 +884,7 @@ static void ZlibStreamCmdDelete( ClientData cd) { - ZlibStreamHandle *zshPtr = cd; + ZlibStreamHandle *zshPtr = (ZlibStreamHandle *)cd; zshPtr->cmd = NULL; ZlibStreamCleanup(zshPtr); @@ -1231,7 +1231,7 @@ Tcl_ZlibStreamPut( if (outSize > BUFFER_SIZE_LIMIT) { outSize = BUFFER_SIZE_LIMIT; } - dataTmp = ckalloc(outSize); + dataTmp = (char *)ckalloc(outSize); while (1) { e = Deflate(&zshPtr->stream, dataTmp, outSize, flush, &toStore); @@ -1265,7 +1265,7 @@ Tcl_ZlibStreamPut( if (outSize < BUFFER_SIZE_LIMIT) { outSize = BUFFER_SIZE_LIMIT; /* There may be *lots* of data left to output... */ - dataTmp = ckrealloc(dataTmp, outSize); + dataTmp = (char *)ckrealloc(dataTmp, outSize); } } @@ -1750,10 +1750,10 @@ Tcl_ZlibInflate( if (gzipHeaderDictObj) { headerPtr = &header; memset(headerPtr, 0, sizeof(gz_header)); - nameBuf = ckalloc(MAXPATHLEN); + nameBuf = (char *)ckalloc(MAXPATHLEN); header.name = (Bytef *) nameBuf; header.name_max = MAXPATHLEN - 1; - commentBuf = ckalloc(MAX_COMMENT_LEN); + commentBuf = (char *)ckalloc(MAX_COMMENT_LEN); header.comment = (Bytef *) commentBuf; header.comm_max = MAX_COMMENT_LEN - 1; } @@ -1858,7 +1858,7 @@ Tcl_ZlibInflate( if (headerPtr != NULL) { ExtractHeader(&header, gzipHeaderDictObj); SetValue(gzipHeaderDictObj, "size", - Tcl_NewLongObj((long) stream.total_out)); + Tcl_NewLongObj(stream.total_out)); ckfree(nameBuf); ckfree(commentBuf); } @@ -1894,7 +1894,7 @@ Tcl_ZlibCRC32( int len) { /* Nothing much to do, just wrap the crc32(). */ - return crc32(crc, (Bytef *) buf, (unsigned) len); + return crc32(crc, (Bytef *) buf, len); } unsigned int @@ -1903,7 +1903,7 @@ Tcl_ZlibAdler32( const unsigned char *buf, int len) { - return adler32(adler, (Bytef *) buf, (unsigned) len); + return adler32(adler, (Bytef *) buf, len); } /* @@ -2519,7 +2519,7 @@ ZlibStreamCmd( int objc, Tcl_Obj *const objv[]) { - Tcl_ZlibStream zstream = cd; + Tcl_ZlibStream zstream = (Tcl_ZlibStream)cd; int command, count, code; Tcl_Obj *obj; static const char *const cmds[] = { @@ -2645,7 +2645,7 @@ ZlibStreamAddCmd( int objc, Tcl_Obj *const objv[]) { - Tcl_ZlibStream zstream = cd; + Tcl_ZlibStream zstream = (Tcl_ZlibStream)cd; int index, code, buffersize = -1, flush = -1, i; Tcl_Obj *obj, *compDictObj = NULL; static const char *const add_options[] = { @@ -2769,7 +2769,7 @@ ZlibStreamPutCmd( int objc, Tcl_Obj *const objv[]) { - Tcl_ZlibStream zstream = cd; + Tcl_ZlibStream zstream = (Tcl_ZlibStream)cd; int index, flush = -1, i; Tcl_Obj *compDictObj = NULL; static const char *const put_options[] = { @@ -2858,7 +2858,7 @@ ZlibStreamHeaderCmd( int objc, Tcl_Obj *const objv[]) { - ZlibStreamHandle *zshPtr = cd; + ZlibStreamHandle *zshPtr = (ZlibStreamHandle *)cd; Tcl_Obj *resultObj; if (objc != 2) { @@ -2895,7 +2895,7 @@ ZlibTransformClose( ClientData instanceData, Tcl_Interp *interp) { - ZlibChannelData *cd = instanceData; + ZlibChannelData *cd = (ZlibChannelData *)instanceData; int e, written, result = TCL_OK; /* @@ -2988,7 +2988,7 @@ ZlibTransformInput( int toRead, int *errorCodePtr) { - ZlibChannelData *cd = instanceData; + ZlibChannelData *cd = (ZlibChannelData *)instanceData; Tcl_DriverInputProc *inProc = Tcl_ChannelInputProc(Tcl_GetChannelType(cd->parent)); int readBytes, gotBytes, copied; @@ -3027,7 +3027,7 @@ ZlibTransformInput( * reading over the border. */ - readBytes = Tcl_ReadRaw(cd->parent, cd->inBuffer, + readBytes = Tcl_ReadRaw(cd->parent, cd->inBuffer, cd->readAheadLimit <= cd->inAllocated ? cd->readAheadLimit : cd->inAllocated); @@ -3104,7 +3104,7 @@ ZlibTransformOutput( int toWrite, int *errorCodePtr) { - ZlibChannelData *cd = instanceData; + ZlibChannelData *cd = (ZlibChannelData *)instanceData; Tcl_DriverOutputProc *outProc = Tcl_ChannelOutputProc(Tcl_GetChannelType(cd->parent)); int e, produced; @@ -3225,7 +3225,7 @@ ZlibTransformSetOption( /* not used */ const char *optionName, const char *value) { - ZlibChannelData *cd = instanceData; + ZlibChannelData *cd = (ZlibChannelData *)instanceData; Tcl_DriverSetOptionProc *setOptionProc = Tcl_ChannelSetOptionProc(Tcl_GetChannelType(cd->parent)); static const char *compressChanOptions = "dictionary flush"; @@ -3338,7 +3338,7 @@ ZlibTransformGetOption( const char *optionName, Tcl_DString *dsPtr) { - ZlibChannelData *cd = instanceData; + ZlibChannelData *cd = (ZlibChannelData *)instanceData; Tcl_DriverGetOptionProc *getOptionProc = Tcl_ChannelGetOptionProc(Tcl_GetChannelType(cd->parent)); static const char *compressChanOptions = "checksum dictionary"; @@ -3456,7 +3456,7 @@ ZlibTransformWatch( ClientData instanceData, int mask) { - ZlibChannelData *cd = instanceData; + ZlibChannelData *cd = (ZlibChannelData *)instanceData; Tcl_DriverWatchProc *watchProc; /* @@ -3479,7 +3479,7 @@ ZlibTransformEventHandler( ClientData instanceData, int interestMask) { - ZlibChannelData *cd = instanceData; + ZlibChannelData *cd = (ZlibChannelData *)instanceData; ZlibTransformEventTimerKill(cd); return interestMask; @@ -3499,7 +3499,7 @@ static void ZlibTransformTimerRun( ClientData clientData) { - ZlibChannelData *cd = clientData; + ZlibChannelData *cd = (ZlibChannelData *)clientData; cd->timer = NULL; Tcl_NotifyChannel(cd->chan, TCL_READABLE); @@ -3522,7 +3522,7 @@ ZlibTransformGetHandle( int direction, ClientData *handlePtr) { - ZlibChannelData *cd = instanceData; + ZlibChannelData *cd = (ZlibChannelData *)instanceData; return Tcl_GetChannelHandle(cd->parent, direction, handlePtr); } @@ -3542,7 +3542,7 @@ ZlibTransformBlockMode( ClientData instanceData, int mode) { - ZlibChannelData *cd = instanceData; + ZlibChannelData *cd = (ZlibChannelData *)instanceData; if (mode == TCL_MODE_NONBLOCKING) { cd->flags |= ASYNC; @@ -3592,7 +3592,7 @@ ZlibStackChannelTransform( * dictionary (not dictObj!) to use if * necessary. */ { - ZlibChannelData *cd = ckalloc(sizeof(ZlibChannelData)); + ZlibChannelData *cd = (ZlibChannelData *)ckalloc(sizeof(ZlibChannelData)); Tcl_Channel chan; int wbits = 0; @@ -3652,7 +3652,7 @@ ZlibStackChannelTransform( goto error; } cd->inAllocated = DEFAULT_BUFFER_SIZE; - cd->inBuffer = ckalloc(cd->inAllocated); + cd->inBuffer = (char *)ckalloc(cd->inAllocated); if (cd->flags & IN_HEADER) { if (inflateGetHeader(&cd->inStream, &cd->inHeader.header) != Z_OK) { goto error; @@ -3669,7 +3669,7 @@ ZlibStackChannelTransform( goto error; } cd->outAllocated = DEFAULT_BUFFER_SIZE; - cd->outBuffer = ckalloc(cd->outAllocated); + cd->outBuffer = (char *)ckalloc(cd->outAllocated); if (cd->flags & OUT_HEADER) { if (deflateSetHeader(&cd->outStream, &cd->outHeader.header) != Z_OK) { goto error; diff --git a/win/tclWinPipe.c b/win/tclWinPipe.c index 69c38d4..04c371e 100644 --- a/win/tclWinPipe.c +++ b/win/tclWinPipe.c @@ -2752,7 +2752,6 @@ TclWinAddProcess( *---------------------------------------------------------------------- */ - /* ARGSUSED */ int Tcl_PidObjCmd( ClientData dummy, /* Not used. */ -- cgit v0.12 From b835a76e5f2294a8b0d7eeea52d0af653c82f732 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 20 Apr 2020 11:56:27 +0000 Subject: (cherry-pick): Proposed fix for [27944a3661]: Taming test utf-6.88. --- generic/tclUtf.c | 38 ++++++++++++-------------------------- tests/utf.test | 22 +++++++++++++++++----- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 1ba474e..aa949ca 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -716,31 +716,11 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * The aim of this routine is to provide a way to iterate forward - * through a UTF-8 string. The caller is expected to pass a non-NULL - * pointer argument /src/ which points to a location within a string. - * (*src) will be read, so /src/ must not point to an unreadable - * location past the end of the string. If /src/ points to the - * beginning of a complete, well-formed and valid UTF_8 byte sequence - * of no more than TCL_UTF_MAX bytes, Tcl_UtfNext returns the pointer - * just past the end of that sequence. In any other circumstance, - * Tcl_UtfNext returns /src/+1. - * - * Because this routine always returns a value > /src/, it is useful - * as a forward iterator that will always make progress. If the string - * is NUL-terminated, Tcl_UtfNext will not read beyond the terminating - * NUL character. If it is not NUL-terminated, the caller must make - * use of the companion routine Tcl_UtfCharComplete to test whether - * there is risk that Tcl_UtfNext will read beyond the end of the string. - * Tcl_UtfNext will never read more than TCL_UTF_MAX bytes. - * - * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfNext calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext + * returns a pointer to the next UTF-8 character in the string. + * The caller must not ask for the next character after the last + * character in the string if the string is not terminated by a null + * character. * * Results: * A pointer to the start of the next character in the string (or to @@ -760,13 +740,19 @@ Tcl_UtfNext( int left = totalBytes[byte]; const char *next = src + 1; + if (((*src) & 0xC0) == 0x80) { + if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) { + ++src; + } + return src; + } + while (--left) { byte = *((unsigned char *) next); if ((byte & 0xC0) != 0x80) { /* * src points to non-trail byte; We ran out of trail bytes * before the needs of the lead byte were satisfied. - * Let the (malformed) lead byte alone be a character */ return src + 1; } diff --git a/tests/utf.test b/tests/utf.test index 0ba2b85..f56fabc 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -189,7 +189,7 @@ test utf-6.10 {Tcl_UtfNext} testutfnext { } 1 test utf-6.11 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xA0 -} 1 +} 2 test utf-6.12 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xD0 } 1 @@ -420,18 +420,30 @@ test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext} { } 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { testutfnext \xA0\xA0 -} 1 +} 2 test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { testutfnext \xE8\xA0\xA0 1 -} 2 +} 3 test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \x80\x80 -} 1 +} 2 test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 -} 2 +} 3 testConstraint testutfprev [llength [info commands testutfprev]] +test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { + testutfnext \xA0\xA0\xA0 +} 3 +test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { + testutfnext \xF2\xA0\xA0\xA0 1 +} 4 +test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { + testutfnext \x80\x80\x80 +} 3 +test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { + testutfnext \xF0\x80\x80\x80 1 +} 4 test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} -- cgit v0.12 From 96e37a5017d24affcdc530f901c09179291ba3ca Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 20 Apr 2020 15:20:55 +0000 Subject: Proposed fix for [c11e0c5ce4]: Regression in Tcl_UtfCharComplete. --- generic/tclUtf.c | 32 ++++++------- tests/utf.test | 136 ++++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 123 insertions(+), 45 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index aa949ca..842744d 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,17 +64,6 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 -}; - -static const unsigned char complete[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, #if TCL_UTF_MAX > 4 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -84,7 +73,11 @@ static const unsigned char complete[256] = { #endif 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 4,4,4,4,4, +#if TCL_UTF_MAX > 3 + 4,4,4,4,4, +#else + 1,1,1,1,1, +#endif 1,1,1,1,1,1,1,1,1,1,1 }; @@ -558,7 +551,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= complete[(unsigned char)*src]; + return length >= totalBytes[(unsigned char)*src]; } /* @@ -606,7 +599,7 @@ Tcl_NumUtfChars( src = next; } } else { - register const char *endPtr = src + length - /*TCL_UTF_MAX*/ 4; + register const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { next = TclUtfNext(src); @@ -617,7 +610,7 @@ Tcl_NumUtfChars( #endif src = next; } - endPtr += /*TCL_UTF_MAX*/ 4; + endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { next = TclUtfNext(src); #if TCL_UTF_MAX > 4 @@ -895,15 +888,18 @@ Tcl_UtfPrev( /* Continue the search backwards... */ look--; - } while (trailBytesSeen < /* was TCL_UTF_MAX */ 4); + } while (trailBytesSeen < TCL_UTF_MAX); /* - * We've seen 4 (was TCL_UTF_MAX) trail bytes, so we know there will not be a + * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a * properly formed byte sequence to find, and we can stop looking, * accepting the fallback. */ - +#if TCL_UTF_MAX < 4 + return src - TCL_UTF_MAX; +#else return fallback; +#endif } /* diff --git a/tests/utf.test b/tests/utf.test index f56fabc..3301dde 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -21,6 +21,7 @@ testConstraint testbytestring [llength [info commands testbytestring]] catch {unset x} # Some tests require support for 4-byte UTF-8 sequences +testConstraint smallutf [expr {[format %c 0x010000] == "\uFFFD"}] testConstraint fullutf [expr {[format %c 0x010000] != "\uFFFD"}] testConstraint tip389 [expr {[string length \U010000] == 2}] @@ -361,7 +362,10 @@ test utf-6.67 {Tcl_UtfNext} testutfnext { test utf-6.68 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0G } 1 -test utf-6.69 {Tcl_UtfNext} testutfnext { +test utf-6.69 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF4\xA0\xA0\xA0 +} 1 +test utf-6.69.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext \xF4\xA0\xA0\xA0 } 4 test utf-6.70 {Tcl_UtfNext} testutfnext { @@ -376,22 +380,40 @@ test utf-6.71 {Tcl_UtfNext} testutfnext { test utf-6.73 {Tcl_UtfNext} testutfnext { testutfnext \xF4\xA0\xA0\xF8 } 1 -test utf-6.74 {Tcl_UtfNext} testutfnext { +test utf-6.74 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF4\xA0\xA0\xA0G +} 1 +test utf-6.74.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext \xF4\xA0\xA0\xA0G } 4 -test utf-6.75 {Tcl_UtfNext} testutfnext { +test utf-6.75 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF4\xA0\xA0\xA0\xA0 +} 1 +test utf-6.75.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext \xF4\xA0\xA0\xA0\xA0 } 4 -test utf-6.76 {Tcl_UtfNext} testutfnext { +test utf-6.76 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF4\xA0\xA0\xA0\xD0 +} 1 +test utf-6.76.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext \xF4\xA0\xA0\xA0\xD0 } 4 -test utf-6.77 {Tcl_UtfNext} testutfnext { +test utf-6.77 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF4\xA0\xA0\xA0\xE8 +} 1 +test utf-6.77.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext \xF4\xA0\xA0\xA0\xE8 } 4 -test utf-6.78 {Tcl_UtfNext} testutfnext { +test utf-6.78 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF4\xA0\xA0\xA0\xF4 +} 1 +test utf-6.78.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext \xF4\xA0\xA0\xA0\xF4 } 4 -test utf-6.79 {Tcl_UtfNext} testutfnext { +test utf-6.79 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF4\xA0\xA0\xA0G\xF8 +} 1 +test utf-6.79.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext \xF4\xA0\xA0\xA0G\xF8 } 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { @@ -415,7 +437,10 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext} { +test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext smallutf} { + testutfnext \xF0\x90\x80\x80 +} 1 +test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} { testutfnext \xF0\x90\x80\x80 } 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { @@ -508,13 +533,22 @@ test utf-7.9.1 {Tcl_UtfPrev} testutfprev { test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 -test utf-7.10 {Tcl_UtfPrev} testutfprev { +test utf-7.10 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF4\xA0 +} 2 +test utf-7.10.1 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF4\xA0\xA0\xA0 3 +} 2 +test utf-7.10.2 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF4\xA0\xF8\xA0 3 +} 2 +test utf-7.10 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF4\xA0 } 1 -test utf-7.10.1 {Tcl_UtfPrev} testutfprev { +test utf-7.10.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF4\xA0\xA0\xA0 3 } 1 -test utf-7.10.2 {Tcl_UtfPrev} testutfprev { +test utf-7.10.2 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF4\xA0\xF8\xA0 3 } 1 test utf-7.11 {Tcl_UtfPrev} testutfprev { @@ -556,13 +590,22 @@ test utf-7.14.1 {Tcl_UtfPrev} testutfprev { test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 -test utf-7.15 {Tcl_UtfPrev} testutfprev { +test utf-7.15 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF4\xA0\xA0 +} 3 +test utf-7.15.1 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF4\xA0\xA0\xA0 4 +} 3 +test utf-7.15.2 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF4\xA0\xA0\xF8 4 +} 3 +test utf-7.15.3 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF4\xA0\xA0 } 1 -test utf-7.15.1 {Tcl_UtfPrev} testutfprev { +test utf-7.15.4 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF4\xA0\xA0\xA0 4 } 1 -test utf-7.15.2 {Tcl_UtfPrev} testutfprev { +test utf-7.15.5 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF4\xA0\xA0\xF8 4 } 1 test utf-7.16 {Tcl_UtfPrev} testutfprev { @@ -583,28 +626,52 @@ test utf-7.17.1 {Tcl_UtfPrev} testutfprev { test utf-7.17.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xD0\xA0\xA0\xF8 4 } 3 -test utf-7.18 {Tcl_UtfPrev} testutfprev { +test utf-7.18 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xA0\xA0\xA0 +} 1 +test utf-7.18.1 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xA0\xA0\xA0\xA0 4 +} 1 +test utf-7.18.2 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xA0\xA0\xA0\xF8 4 +} 1 +test utf-7.18.3 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xA0\xA0\xA0 } 3 -test utf-7.18.1 {Tcl_UtfPrev} testutfprev { +test utf-7.18.4 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xA0\xA0\xA0\xA0 4 } 3 -test utf-7.18.2 {Tcl_UtfPrev} testutfprev { +test utf-7.18.5 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xA0\xA0\xA0\xF8 4 } 3 -test utf-7.19 {Tcl_UtfPrev} testutfprev { +test utf-7.19 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF8\xA0\xA0\xA0 +} 2 +test utf-7.19.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF8\xA0\xA0\xA0 } 4 -test utf-7.20 {Tcl_UtfPrev} testutfprev { +test utf-7.20 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF4\xA0\xA0\xA0 +} 2 +test utf-7.20.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF4\xA0\xA0\xA0 } 1 -test utf-7.21 {Tcl_UtfPrev} testutfprev { +test utf-7.21 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xE8\xA0\xA0\xA0 +} 2 +test utf-7.21.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xE8\xA0\xA0\xA0 } 4 -test utf-7.22 {Tcl_UtfPrev} testutfprev { +test utf-7.22 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xD0\xA0\xA0\xA0 +} 2 +test utf-7.22.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xD0\xA0\xA0\xA0 } 4 -test utf-7.23 {Tcl_UtfPrev} testutfprev { +test utf-7.23 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xA0\xA0\xA0\xA0 +} 2 +test utf-7.23.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xA0\xA0\xA0\xA0 } 4 test utf-7.24 {Tcl_UtfPrev -- overlong sequence} testutfprev { @@ -628,7 +695,10 @@ test utf-7.28 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.28.1 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\x80\x80 2 } 1 -test utf-7.29 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.29 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { + testutfprev A\xF0\x80\x80\x80 +} 2 +test utf-7.29.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { testutfprev A\xF0\x80\x80\x80 } 4 test utf-7.30 {Tcl_UtfPrev -- overlong sequence} testutfprev { @@ -658,13 +728,22 @@ test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { +test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { + testutfprev A\xF0\x90\x80\x80 +} 2 +test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { testutfprev A\xF0\x90\x80\x80 } 1 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { +test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { + testutfprev A\xF0\x90\x80\x80 4 +} 3 +test utf-7.40.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { testutfprev A\xF0\x90\x80\x80 4 } 1 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev} { +test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { + testutfprev A\xF0\x90\x80\x80 3 +} 2 +test utf-7.41.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { testutfprev A\xF0\x90\x80\x80 3 } 1 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { @@ -679,7 +758,10 @@ test utf-7.44 {Tcl_UtfPrev -- no lead byte at start} testutfprev { test utf-7.45 {Tcl_UtfPrev -- no lead byte at start} testutfprev { testutfprev \xA0\xA0\xA0 } 2 -test utf-7.46 {Tcl_UtfPrev -- no lead byte at start} testutfprev { +test utf-7.46 {Tcl_UtfPrev -- no lead byte at start} {testutfprev smallutf} { + testutfprev \xA0\xA0\xA0\xA0 +} 1 +test utf-7.46 {Tcl_UtfPrev -- no lead byte at start} {testutfprev fullutf} { testutfprev \xA0\xA0\xA0\xA0 } 3 test utf-7.47 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {testutfprev} { -- cgit v0.12 From 874352f8cbfe523aea14bff022306b9d72699818 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 22:35:39 +0000 Subject: Tie together the TCL_UTF_MAX=4 and TCL_UTF_MAX=6 builds to mean the same thing on the 8.5 branch -- use internal UCS-4 storage. --- generic/regcustom.h | 2 +- generic/tcl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/regcustom.h b/generic/regcustom.h index 57a2d47..ac33087 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ #define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ diff --git a/generic/tcl.h b/generic/tcl.h index 7378a8f..d7d064c 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2148,7 +2148,7 @@ typedef struct Tcl_Parse { * reflected in regcustom.h. */ -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 /* * unsigned int isn't 100% accurate as it should be a strict 4-byte value * (perhaps wchar_t). 64-bit systems may have troubles. The size of this -- cgit v0.12 From d1c59c52ba951ba4c0155f419e462cd923e08883 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 23:00:30 +0000 Subject: Pair every compat85 test with a fullutf test so that we cover all variants. --- tests/utf.test | 137 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 107 insertions(+), 30 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 1ca3647..1c79f32 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -13,7 +13,9 @@ if {[lsearch [namespace children] ::tcltest] == -1} { namespace import -force ::tcltest::* } -testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] +testConstraint compat85 [expr {[format %c 0x010000] eq "\uFFFD"}] +testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] + testConstraint testbytestring [llength [info commands testbytestring]] testConstraint testfindfirst [llength [info commands testfindfirst]] testConstraint testfindlast [llength [info commands testfindlast]] @@ -64,12 +66,18 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestrin test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring { string length [testbytestring "\xE4\xB9\x8E"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { +test utf-2.8.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF0\x90\x80\x80"] } -result {4} -test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { +test utf-2.8.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring fullutf} -body { + string length [testbytestring "\xF0\x90\x80\x80"] +} -result {1} +test utf-2.9.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] } -result {4} +test utf-2.9.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring fullutf} -body { + string length [testbytestring "\xF4\x8F\xBF\xBF"] +} -result {1} test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { string length [testbytestring "\xF0\x8F\xBF\xBF"] } {4} @@ -118,9 +126,12 @@ test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 3 } {3} -test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring compat85} { +test utf-4.12.0 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring compat85} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4 } {4} +test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring fullutf} { + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4 +} {1} test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { testfindfirst [testbytestring "abcbc"] 98 @@ -335,9 +346,12 @@ test utf-6.67 {Tcl_UtfNext} testutfnext { test utf-6.68 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0G } 1 -test utf-6.69 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.69.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0 } 1 +test utf-6.69.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0 +} 4 test utf-6.70 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xD0 } 1 @@ -350,24 +364,42 @@ test utf-6.71 {Tcl_UtfNext} testutfnext { test utf-6.73 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xF8 } 1 -test utf-6.74 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.74.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G } 1 -test utf-6.75 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.74.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0G +} 4 +test utf-6.75.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 -test utf-6.76 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.75.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0\xA0 +} 4 +test utf-6.76.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 -test utf-6.77 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.76.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0\xD0 +} 4 +test utf-6.77.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 -test utf-6.78 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.77.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0\xE8 +} 4 +test utf-6.78.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 -test utf-6.79 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.78.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0\xF2 +} 4 +test utf-6.79.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 +test utf-6.79.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0G\xF8 +} 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xC0\x80 } 2 @@ -389,9 +421,12 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { +test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { testutfnext \xF0\x90\x80\x80 } 1 +test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} { + testutfnext \xF0\x90\x80\x80 +} 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { testutfnext \xA0\xA0 } 1 @@ -404,9 +439,12 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {te test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 } 2 -test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { +test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { testutfnext \xF4\x8F\xBF\xBF } 1 +test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { + testutfnext \xF4\x8F\xBF\xBF +} 4 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { testutfnext \xF4\x90\x80\x80 } 1 @@ -474,15 +512,24 @@ test utf-7.9.1 {Tcl_UtfPrev} testutfprev { test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 -test utf-7.10 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.10.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0 } 2 -test utf-7.10.1 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.10.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0 +} 1 +test utf-7.10.1.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 3 } 2 -test utf-7.10.2 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.10.1.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xA0 3 +} 1 +test utf-7.10.2.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xF8\xA0 3 } 2 +test utf-7.10.2.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xF8\xA0 3 +} 1 test utf-7.11 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0 } 1 @@ -522,15 +569,24 @@ test utf-7.14.1 {Tcl_UtfPrev} testutfprev { test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 -test utf-7.15 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.15.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0 } 3 -test utf-7.15.1 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.15.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0 +} 1 +test utf-7.15.1.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 4 } 3 -test utf-7.15.2 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.15.1.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xA0 4 +} 1 +test utf-7.15.2.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xF8 4 } 3 +test utf-7.15.2.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xF8 4 +} 1 test utf-7.16 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0 } 1 @@ -561,9 +617,12 @@ test utf-7.18.2 {Tcl_UtfPrev} testutfprev { test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 -test utf-7.20 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.20.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 } 4 +test utf-7.20.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xA0 +} 1 test utf-7.21 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 } 4 @@ -624,15 +683,24 @@ test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.39.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 } 4 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { + testutfprev A\xF0\x90\x80\x80 +} 1 +test utf-7.40.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 4 } 3 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.40.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { + testutfprev A\xF0\x90\x80\x80 4 +} 1 +test utf-7.41.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 3 } 2 +test utf-7.41.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { + testutfprev A\xF0\x90\x80\x80 3 +} 1 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xF0\x90\x80\x80 2 } 1 @@ -657,15 +725,24 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { testutfprev \xE8\xA0\x00 2 } 0 -test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { +test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF } 4 -test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x8F\xBF\xBF +} 1 +test utf-7.48.1.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 4 } 3 -test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { +test utf-7.48.1.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x8F\xBF\xBF 4 +} 1 +test utf-7.48.2.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 3 } 2 +test utf-7.48.2.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x8F\xBF\xBF 3 +} 1 test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { testutfprev A\xF4\x8F\xBF\xBF 2 } 1 @@ -700,10 +777,10 @@ test utf-8.5 {Tcl_UniCharAtIndex: high surrogate} { test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { string index \uDC42 0 } "\uDC42" -test utf-8.7 {Tcl_UniCharAtIndex: Emoji} compat85 { +test utf-8.7 {Tcl_UniCharAtIndex: Emoji} { string index \uD83D\uDE00 0 } "\uD83D" -test utf-8.8 {Tcl_UniCharAtIndex: Emoji} compat85 { +test utf-8.8 {Tcl_UniCharAtIndex: Emoji} { string index \uD83D\uDE00 1 } "\uDE00" @@ -713,10 +790,10 @@ test utf-9.1 {Tcl_UtfAtIndex: index = 0} { test utf-9.2 {Tcl_UtfAtIndex: index > 0} { string range \u4E4E\u25A\xFF\u543klmnop 1 5 } "\u25A\xFF\u543kl" -test utf-9.3 {Tcl_UtfAtIndex: index = 0, Emoji} compat85 { +test utf-9.3 {Tcl_UtfAtIndex: index = 0, Emoji} { string range \uD83D\uDE00G 0 0 } "\uD83D" -test utf-9.4 {Tcl_UtfAtIndex: index > 0, Emoji} compat85 { +test utf-9.4 {Tcl_UtfAtIndex: index > 0, Emoji} { string range \uD83D\uDE00G 1 1 } "\uDE00" -- cgit v0.12 From f827a222243d80d22ab660b6956f0627cd9952c0 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 02:50:23 +0000 Subject: Revert the backport to tclEncoding.c that seems to redefine the "unicode" encoding to mean UTF-16. Don't want that behavior change in 8.5. --- generic/tclEncoding.c | 240 +++++++++++++++++++++++--------------------------- 1 file changed, 112 insertions(+), 128 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index da03055..5a9d2d5 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -83,7 +83,7 @@ typedef struct TableEncodingData { } TableEncodingData; /* - * Each of the following structures is the clientData for a dynamically-loaded + * The following structures is the clientData for a dynamically-loaded, * escape-driven encoding that is itself comprised of other simpler encodings. * An example is "iso-2022-jp", which uses escape sequences to switch between * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven" @@ -117,8 +117,8 @@ typedef struct EscapeEncodingData { * 0. */ int numSubTables; /* Length of following array. */ EscapeSubTable subTables[1];/* Information about each EscapeSubTable used - * by this encoding type. The actual size is - * as large as necessary to hold all + * by this encoding type. The actual size will + * be as large as necessary to hold all * EscapeSubTables. */ } EscapeEncodingData; @@ -156,7 +156,7 @@ static ProcessGlobalValue encodingFileMap = { * A list of directories making up the "library path". Historically this * search path has served many uses, but the only one remaining is a base for * the encodingSearchPath above. If the application does not explicitly set - * the encodingSearchPath, then it is initialized by appending /encoding + * the encodingSearchPath, then it will be initialized by appending /encoding * to each directory in this "libraryPath". */ @@ -177,7 +177,7 @@ TCL_DECLARE_MUTEX(encodingMutex) /* * The following are used to hold the default and current system encodings. * If NULL is passed to one of the conversion routines, the current setting of - * the system encoding is used to perform the conversion. + * the system encoding will be used to perform the conversion. */ static Tcl_Encoding defaultEncoding; @@ -429,8 +429,9 @@ TclGetLibraryPath(void) * Keeps the per-thread copy of the library path current with changes to * the global copy. * - * Since the result of this routine is void, if searchPath is not a valid - * list this routine silently does nothing. + * NOTE: this routine returns void, so there's no way to report the error + * that searchPath is not a valid list. In that case, this routine will + * silently do nothing. * *---------------------------------------------------------------------- */ @@ -452,16 +453,17 @@ TclSetLibraryPath( * * FillEncodingFileMap -- * - * Called to update the encoding file map with the current value - * of the encoding search path. + * Called to bring the encoding file map in sync with the current value + * of the encoding search path. * - * Finds *.end files in the directories on the encoding search path and - * stores the found pathnames in a map associated with the encoding name. + * Scan the directories on the encoding search path, find the *.enc + * files, and store the found pathnames in a map associated with the + * encoding name. * - * If $dir is on the encoding search path and the file $dir/foo.enc is - * found, stores a "foo" -> $dir entry in the map. if the "foo" encoding - * is needed later, the $dir/foo.enc name can be quickly constructed in - * order to read the encoding data. + * In particular, if $dir is on the encoding search path, and the file + * $dir/foo.enc is found, then store a "foo" -> $dir entry in the map. + * Later, any need for the "foo" encoding will quickly * be able to + * construct the $dir/foo.enc pathname for reading the encoding data. * * Results: * None. @@ -542,24 +544,19 @@ void TclInitEncodingSubsystem(void) { Tcl_EncodingType type; - union { - char c; - short s; - } isLe; if (encodingsInitialized) { return; } - isLe.s = 1; Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); /* - * Create a few initial encodings. UTF-8 to UTF-8 translation is not a - * no-op because it turns a stream of improperly formed UTF-8 into a - * properly formed stream. + * Create a few initial encodings. Note that the UTF-8 to UTF-8 + * translation is not a no-op, because it will turn a stream of improperly + * formed UTF-8 into a properly formed stream. */ type.encodingName = "identity"; @@ -586,7 +583,7 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUnicodeProc; type.freeProc = NULL; type.nullSize = 2; - type.clientData = INT2PTR(isLe.c); + type.clientData = NULL; Tcl_CreateEncoding(&type); /* @@ -758,7 +755,11 @@ Tcl_SetDefaultEncodingDir( * interp was NULL. * * Side effects: - * LoadEncodingFile is called if necessary. + * The new encoding type is entered into a table visible to all + * interpreters, keyed off the encoding's name. For each call to this + * function, there should eventually be a call to Tcl_FreeEncoding, so + * that the database can be cleaned up when encodings aren't needed + * anymore. * *------------------------------------------------------------------------- */ @@ -796,15 +797,15 @@ Tcl_GetEncoding( * * Tcl_FreeEncoding -- * - * Releases an encoding allocated by Tcl_CreateEncoding() or - * Tcl_GetEncoding(). + * This function is called to release an encoding allocated by + * Tcl_CreateEncoding() or Tcl_GetEncoding(). * * Results: * None. * * Side effects: * The reference count associated with the encoding is decremented and - * the encoding is deleted if nothing is using it anymore. + * the encoding may be deleted if nothing is using it anymore. * *--------------------------------------------------------------------------- */ @@ -823,14 +824,13 @@ Tcl_FreeEncoding( * * FreeEncoding -- * - * Decrements the reference count of an encoding. The caller must hold - * encodingMutes. + * This function is called to release an encoding by functions that + * already have the encodingMutex. * * Results: * None. * * Side effects: - * Releases the resource for an encoding if it is now unused. * The reference count associated with the encoding is decremented and * the encoding may be deleted if nothing is using it anymore. * @@ -850,17 +850,16 @@ FreeEncoding( if (encodingPtr->refCount<=0) { Tcl_Panic("FreeEncoding: refcount problem !!!"); } - if (encodingPtr->refCount-- <= 1) { + encodingPtr->refCount--; + if (encodingPtr->refCount == 0) { if (encodingPtr->freeProc != NULL) { (*encodingPtr->freeProc)(encodingPtr->clientData); } if (encodingPtr->hPtr != NULL) { Tcl_DeleteHashEntry(encodingPtr->hPtr); } - if (encodingPtr->name) { - ckfree((char *)encodingPtr->name); - } - ckfree((char *)encodingPtr); + ckfree((char *) encodingPtr->name); + ckfree((char *) encodingPtr); } } @@ -1021,22 +1020,23 @@ Tcl_SetSystemEncoding( * * Tcl_CreateEncoding -- * - * Defines a new encoding, along with the functions that are used to - * convert to and from Unicode. + * This function is called to define a new encoding and the functions + * that are used to convert between the specified encoding and Unicode. * * Results: * Returns a token that represents the encoding. If an encoding with the * same name already existed, the old encoding token remains valid and - * continues to behave as it used to, and is eventually garbage collected - * when the last reference to it goes away. Any subsequent calls to - * Tcl_GetEncoding with the specified name retrieve the most recent - * encoding token. + * continues to behave as it used to, and will eventually be garbage + * collected when the last reference to it goes away. Any subsequent + * calls to Tcl_GetEncoding with the specified name will retrieve the + * most recent encoding token. * * Side effects: - * A new record having the name of the encoding is entered into a table of - * encodings visible to all interpreters. For each call to this function, - * there should eventually be a call to Tcl_FreeEncoding, which cleans - * deletes the record in the table when an encoding is no longer needed. + * The new encoding type is entered into a table visible to all + * interpreters, keyed off the encoding's name. For each call to this + * function, there should eventually be a call to Tcl_FreeEncoding, so + * that the database can be cleaned up when encodings aren't needed + * anymore. * *--------------------------------------------------------------------------- */ @@ -1258,9 +1258,10 @@ Tcl_ExternalToUtf( * * Tcl_UtfToExternalDString -- * - * Convert a source buffer from UTF-8 to the specified encoding. If any + * Convert a source buffer from UTF-8 into the specified encoding. If any * of the bytes in the source buffer are invalid or cannot be represented - * in the target encoding, a default fallback character is substituted. + * in the target encoding, a default fallback character will be + * substituted. * * Results: * The converted bytes are stored in the DString, which is then NULL @@ -1569,13 +1570,13 @@ OpenEncodingFileChannel( * the data. * * Results: - * The return value is the newly loaded Tcl_Encoding or NULL if the file - * didn't exist or could not be processed. If NULL is returned and interp - * is not NULL, an error message is left in interp's result object. + * The return value is the newly loaded Encoding, or NULL if the file + * didn't exist of was in the incorrect format. If NULL was returned, an + * error message is left in interp's result object, unless interp was + * NULL. * * Side effects: - * A corresponding encoding file might be read from persistent storage, in - * which case LoadTableEncoding is called. + * File read from disk. * *--------------------------------------------------------------------------- */ @@ -1583,8 +1584,8 @@ OpenEncodingFileChannel( static Tcl_Encoding LoadEncodingFile( Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ - const char *name) /* The name of both the encoding file - * and the new encoding. */ + const char *name) /* The name of the encoding file on disk and + * also the name for new encoding. */ { Tcl_Channel chan = NULL; Tcl_Encoding encoding = NULL; @@ -1636,27 +1637,27 @@ LoadEncodingFile( * * LoadTableEncoding -- * - * Helper function for LoadEncodingFile(). Creates a Tcl_EncodingType - * structure along with its corresponding TableEncodingData structure, and - * passes it to Tcl_Createncoding. + * Helper function for LoadEncodingTable(). Loads a table to that + * converts between Unicode and some other encoding and creates an + * encoding (using a TableEncoding structure) from that information. * - * The file contains binary data but begins with a marker to indicate - * byte-ordering so a single binary file can be read on big or - * little-endian systems. + * File contains binary data, but begins with a marker to indicate + * byte-ordering, so that same binary file can be read on either endian + * platforms. * * Results: - * Returns the new Tcl_Encoding, or NULL if it could could - * not be created because the file contained invalid data. + * The return value is the new encoding, or NULL if the encoding could + * not be created (because the file contained invalid data). * * Side effects: - * See Tcl_CreateEncoding(). + * None. * *------------------------------------------------------------------------- */ static Tcl_Encoding LoadTableEncoding( - const char *name, /* Name of the new encoding. */ + const char *name, /* Name for new encoding. */ int type, /* Type of encoding (ENCODING_?????). */ Tcl_Channel chan) /* File containing new encoding. */ { @@ -1768,10 +1769,10 @@ LoadTableEncoding( } /* - * Invert the toUnicode array to produce the fromUnicode array. Performs a + * Invert toUnicode array to produce the fromUnicode array. Performs a * single malloc to get the memory for the array and all the pages needed - * by the array. While reading in the toUnicode array remember what - * pages are needed for the fromUnicode array. + * by the array. While reading in the toUnicode array, we remembered what + * pages that would be needed for the fromUnicode array. */ if (symbol) { @@ -1813,8 +1814,8 @@ LoadTableEncoding( if (type == ENCODING_MULTIBYTE) { /* * If multibyte encodings don't have a backslash character, define - * one. Otherwise, on Windows, native file names don't work because - * the backslash in the file name maps to the unknown character + * one. Otherwise, on Windows, native file names won't work because + * the backslash in the file name will map to the unknown character * (question mark) when converting from UTF-8 to external encoding. */ @@ -1828,13 +1829,13 @@ LoadTableEncoding( unsigned short *page; /* - * Make a special symbol encoding that maps each symbol character from - * its Unicode code point down into page 0, and also ensure that each - * characters on page 0 maps to itself so that a symbol font can be - * used to display a simple string like "abcd" and have alpha, beta, - * chi, delta show up, rather than have "unknown" chars show up because - * strictly speaking the symbol font doesn't have glyphs for those low - * ASCII chars. + * Make a special symbol encoding that not only maps the symbol + * characters from their Unicode code points down into page 0, but + * also ensure that the characters on page 0 map to themselves. This + * is so that a symbol font can be used to display a simple string + * like "abcd" and have alpha, beta, chi, delta show up, rather than + * have "unknown" chars show up because strictly speaking the symbol + * font doesn't have glyphs for those low ascii chars. */ page = dataPtr->fromUnicode[0]; @@ -1938,7 +1939,7 @@ LoadTableEncoding( static Tcl_Encoding LoadEscapeEncoding( - const char *name, /* Name of the new encoding. */ + const char *name, /* Name for new encoding. */ Tcl_Channel chan) /* File containing new encoding. */ { int i; @@ -2317,7 +2318,7 @@ UtfToUtfProc( * * UnicodeToUtfProc -- * - * Convert from UTF-16 to UTF-8. + * Convert from Unicode to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2330,7 +2331,7 @@ UtfToUtfProc( static int UnicodeToUtfProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* Not used. */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2358,19 +2359,13 @@ UnicodeToUtfProc( const char *srcStart, *srcEnd; char *dstEnd, *dstStart; int result, numChars; - unsigned short ch; + Tcl_UniChar ch; result = TCL_OK; - - /* check alignment with utf-16 (2 == sizeof(UTF-16)) */ - if ((srcLen % 2) != 0) { - result = TCL_CONVERT_MULTIBYTE; - srcLen--; - } - /* If last code point is a high surrogate, we cannot handle that yet */ - if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { + if ((srcLen % sizeof(Tcl_UniChar)) != 0) { result = TCL_CONVERT_MULTIBYTE; - srcLen-= 2; + srcLen /= sizeof(Tcl_UniChar); + srcLen *= sizeof(Tcl_UniChar); } srcStart = src; @@ -2384,21 +2379,17 @@ UnicodeToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (clientData) { - ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); - } else { - ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); - } /* - * Special case for 1-byte utf chars for speed. Make sure we work with - * unsigned short-size data. + * Special case for 1-byte utf chars for speed. Make sure we + * work with Tcl_UniChar-size data. */ + ch = *(Tcl_UniChar *)src; if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned short); + src += sizeof(Tcl_UniChar); } *srcReadPtr = src - srcStart; @@ -2412,7 +2403,7 @@ UnicodeToUtfProc( * * UtfToUnicodeProc -- * - * Convert from UTF-8 to UTF-16. + * Convert from UTF-8 to Unicode. * * Results: * Returns TCL_OK if conversion was successful. @@ -2425,7 +2416,8 @@ UnicodeToUtfProc( static int UtfToUnicodeProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* TableEncodingData that specifies + * encoding. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2452,7 +2444,7 @@ UtfToUnicodeProc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar ch = 0; + Tcl_UniChar ch; srcStart = src; srcEnd = src + srcLen; @@ -2484,37 +2476,27 @@ UtfToUnicodeProc( * Need to handle this in a way that won't cause misalignment * by casting dst to a Tcl_UniChar. [Bug 1122671] */ - if (clientData) { +#ifdef WORDS_BIGENDIAN #if TCL_UTF_MAX > 4 - if (ch <= 0xFFFF) { - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); - } else { - *dst++ = (((ch - 0x10000) >> 10) & 0xFF); - *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (ch & 0xFF); - *dst++ = ((ch & 0x3) >> 8) | 0xDC; - } + *dst++ = (ch >> 24); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = (ch & 0xFF); #else - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); #endif - } else { +#else #if TCL_UTF_MAX > 4 - if (ch <= 0xFFFF) { - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); - } else { - *dst++ = ((ch & 0x3) >> 8) | 0xDC; - *dst++ = (ch & 0xFF); - *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((ch - 0x10000) >> 10) & 0xFF); - } + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = (ch >> 24); #else - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); +#endif #endif - } } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; @@ -2917,6 +2899,7 @@ Iso88591FromUtfProc( result = TCL_CONVERT_UNKNOWN; break; } + /* * Plunge on, using '?' as a fallback character. */ @@ -3404,13 +3387,14 @@ EscapeFromUtfProc( * * EscapeFreeProc -- * - * Frees resources used by the encoding. + * This function is invoked when an EscapeEncodingData encoding is + * deleted. It deletes the memory used by the encoding. * * Results: * None. * * Side effects: - * Memory is freed. + * Memory freed. * *--------------------------------------------------------------------------- */ -- cgit v0.12 From 5e03b5279c2b3c38271707b2d3ad0e83be9f35d7 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 02:57:41 +0000 Subject: We've settled on using (TCL_UTF_MAX > 3) to indicate 4-byte Tcl_UniChar. --- generic/tclEncoding.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 5a9d2d5..66bec44 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2477,7 +2477,7 @@ UtfToUnicodeProc( * by casting dst to a Tcl_UniChar. [Bug 1122671] */ #ifdef WORDS_BIGENDIAN -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 *dst++ = (ch >> 24); *dst++ = ((ch >> 16) & 0xFF); *dst++ = ((ch >> 8) & 0xFF); @@ -2487,7 +2487,7 @@ UtfToUnicodeProc( *dst++ = (ch & 0xFF); #endif #else -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 *dst++ = (ch & 0xFF); *dst++ = ((ch >> 8) & 0xFF); *dst++ = ((ch >> 16) & 0xFF); -- cgit v0.12 From 32d22e5d8a39136b9fc5c63c923a1407f5bf8430 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 21 Apr 2020 07:03:46 +0000 Subject: Add more test-cases for TCL_UTF_MAX>3 --- generic/tclUtf.c | 2 +- tests/utf.test | 341 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 210 insertions(+), 133 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 842744d..35a98a1 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -74,7 +74,7 @@ static const unsigned char totalBytes[256] = { 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, #if TCL_UTF_MAX > 3 - 4,4,4,4,4, + 4,4,4,4,4, #else 1,1,1,1,1, #endif diff --git a/tests/utf.test b/tests/utf.test index 3301dde..35772ae 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -16,15 +16,20 @@ if {[lsearch [namespace children] ::tcltest] == -1} { ::tcltest::loadTestedCommands catch [list package require -exact Tcltest [info patchlevel]] -testConstraint testbytestring [llength [info commands testbytestring]] - -catch {unset x} - -# Some tests require support for 4-byte UTF-8 sequences testConstraint smallutf [expr {[format %c 0x010000] == "\uFFFD"}] testConstraint fullutf [expr {[format %c 0x010000] != "\uFFFD"}] testConstraint tip389 [expr {[string length \U010000] == 2}] +testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint testfindfirst [llength [info commands testfindfirst]] +testConstraint testfindlast [llength [info commands testfindlast]] +testConstraint testnumutfchars [llength [info commands testnumutfchars]] +testConstraint teststringobj [llength [info commands teststringobj]] +testConstraint testutfnext [llength [info commands testutfnext]] +testConstraint testutfprev [llength [info commands testutfprev]] + +catch {unset x} + test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} testbytestring { expr {"\x01" eq [testbytestring "\x01"]} } 1 @@ -96,6 +101,7 @@ test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} t string length [testbytestring "\xF0\x8F\xBF\xBF"] } {4} test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, overflow} {testbytestring knownBug} {# Doesn't work with any TCL_UTF_MAX value + # Would decode to U+110000 but that is outside the Unicode range. string length [testbytestring "\xF4\x90\x80\x80"] } {4} test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestring { @@ -105,10 +111,6 @@ test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestr test utf-3.1 {Tcl_UtfCharComplete} { } {} -testConstraint testnumutfchars [llength [info commands testnumutfchars]] -testConstraint testfindfirst [llength [info commands testfindfirst]] -testConstraint testfindlast [llength [info commands testfindlast]] - test utf-4.1 {Tcl_NumUtfChars: zero length} testnumutfchars { testnumutfchars "" } {0} @@ -116,7 +118,7 @@ test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} { testnumutfchars [testbytestring "\xC2\xA2"] } {1} test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E"] } {7} test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { testnumutfchars [testbytestring "\xC0\x80"] @@ -125,13 +127,13 @@ test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars { testnumutfchars "" 0 } {0} test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "\xC2\xA2"] 2 + testnumutfchars [testbytestring "\xC2\xA2"] 1 } {1} test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] 10 + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E"] 10 } {7} test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "\xC0\x80"] 2 + testnumutfchars [testbytestring "\xC0\x80"] 1 } {1} # Bug [2738427]: Tcl_NumUtfChars(...) no overflow check test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {testnumutfchars testbytestring} { @@ -141,10 +143,13 @@ test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testnumutfchars [testbytestring "\x00"] 2 } {2} test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring \xf0\x9f\x92\xa9] 3 + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 3 } {3} -test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring tip389} { - testnumutfchars [testbytestring \xf0\x9f\x92\xa9] 4 +test utf-4.12.0 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring smallutf} { + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4 +} {4} +test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring tip389} { + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4 } {2} test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { @@ -154,8 +159,6 @@ test utf-5.2 {Tcl_UtfFindLast} {testfindlast testbytestring} { testfindlast [testbytestring "abcbc"] 98 } {bc} -testConstraint testutfnext [llength [info commands testutfnext]] - test utf-6.1 {Tcl_UtfNext} testutfnext { # This takes the pointer one past the terminating NUL. # This is really an invalid call. @@ -177,7 +180,7 @@ test utf-6.6 {Tcl_UtfNext} testutfnext { testutfnext A\xE8 } 1 test utf-6.7 {Tcl_UtfNext} testutfnext { - testutfnext A\xF4 + testutfnext A\xF2 } 1 test utf-6.8 {Tcl_UtfNext} testutfnext { testutfnext A\xF8 @@ -198,7 +201,7 @@ test utf-6.13 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xE8 } 1 test utf-6.14 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xF4 + testutfnext \xA0\xF2 } 1 test utf-6.15 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xF8 @@ -207,7 +210,7 @@ test utf-6.16 {Tcl_UtfNext} testutfnext { testutfnext \xD0 } 1 test utf-6.17 {Tcl_UtfNext} testutfnext { - testutfnext \xD0A + testutfnext \xD0G } 1 test utf-6.18 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xA0 @@ -219,7 +222,7 @@ test utf-6.20 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xE8 } 1 test utf-6.21 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xF4 + testutfnext \xD0\xF2 } 1 test utf-6.22 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xF8 @@ -228,7 +231,7 @@ test utf-6.23 {Tcl_UtfNext} testutfnext { testutfnext \xE8 } 1 test utf-6.24 {Tcl_UtfNext} testutfnext { - testutfnext \xE8A + testutfnext \xE8G } 1 test utf-6.25 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0 @@ -240,37 +243,37 @@ test utf-6.27 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xE8 } 1 test utf-6.28 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xF4 + testutfnext \xE8\xF2 } 1 test utf-6.29 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xF8 } 1 test utf-6.30 {Tcl_UtfNext} testutfnext { - testutfnext \xF4 + testutfnext \xF2 } 1 test utf-6.31 {Tcl_UtfNext} testutfnext { - testutfnext \xF4A + testutfnext \xF2G } 1 test utf-6.32 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0 + testutfnext \xF2\xA0 } 1 test utf-6.33 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xD0 + testutfnext \xF2\xD0 } 1 test utf-6.34 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xE8 + testutfnext \xF2\xE8 } 1 test utf-6.35 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xF4 + testutfnext \xF2\xF2 } 1 test utf-6.36 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xF8 + testutfnext \xF2\xF8 } 1 test utf-6.37 {Tcl_UtfNext} testutfnext { testutfnext \xF8 } 1 test utf-6.38 {Tcl_UtfNext} testutfnext { - testutfnext \xF8A + testutfnext \xF8G } 1 test utf-6.39 {Tcl_UtfNext} testutfnext { testutfnext \xF8\xA0 @@ -282,7 +285,7 @@ test utf-6.41 {Tcl_UtfNext} testutfnext { testutfnext \xF8\xE8 } 1 test utf-6.42 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xF4 + testutfnext \xF8\xF2 } 1 test utf-6.43 {Tcl_UtfNext} testutfnext { testutfnext \xF8\xF8 @@ -300,7 +303,7 @@ test utf-6.47 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xA0\xE8 } 2 test utf-6.48 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xF4 + testutfnext \xD0\xA0\xF2 } 2 test utf-6.49 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xA0\xF8 @@ -318,28 +321,28 @@ test utf-6.53 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xE8 } 1 test utf-6.54 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xF4 + testutfnext \xE8\xA0\xF2 } 1 test utf-6.55 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xF8 } 1 test utf-6.56 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0G + testutfnext \xF2\xA0G } 1 test utf-6.57 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0 + testutfnext \xF2\xA0\xA0 } 1 test utf-6.58 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xD0 + testutfnext \xF2\xA0\xD0 } 1 test utf-6.59 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xE8 + testutfnext \xF2\xA0\xE8 } 1 test utf-6.60 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xF4 + testutfnext \xF2\xA0\xF2 } 1 test utf-6.61 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xF8 + testutfnext \xF2\xA0\xF8 } 1 test utf-6.62 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xA0G @@ -354,67 +357,67 @@ test utf-6.65 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xA0\xE8 } 3 test utf-6.66 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xF4 + testutfnext \xE8\xA0\xA0\xF2 } 3 test utf-6.67 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xA0\xF8 } 3 test utf-6.68 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0G + testutfnext \xF2\xA0\xA0G } 1 -test utf-6.69 {Tcl_UtfNext} {testutfnext smallutf} { - testutfnext \xF4\xA0\xA0\xA0 +test utf-6.69.0 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF2\xA0\xA0\xA0 } 1 test utf-6.69.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF4\xA0\xA0\xA0 + testutfnext \xF2\xA0\xA0\xA0 } 4 test utf-6.70 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xD0 + testutfnext \xF2\xA0\xA0\xD0 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xE8 + testutfnext \xF2\xA0\xA0\xE8 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xF4 + testutfnext \xF2\xA0\xA0\xF2 } 1 test utf-6.73 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xF8 + testutfnext \xF2\xA0\xA0\xF8 } 1 -test utf-6.74 {Tcl_UtfNext} {testutfnext smallutf} { - testutfnext \xF4\xA0\xA0\xA0G +test utf-6.74.0 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF2\xA0\xA0\xA0G } 1 test utf-6.74.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF4\xA0\xA0\xA0G + testutfnext \xF2\xA0\xA0\xA0G } 4 -test utf-6.75 {Tcl_UtfNext} {testutfnext smallutf} { - testutfnext \xF4\xA0\xA0\xA0\xA0 +test utf-6.75.0 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 test utf-6.75.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF4\xA0\xA0\xA0\xA0 + testutfnext \xF2\xA0\xA0\xA0\xA0 } 4 -test utf-6.76 {Tcl_UtfNext} {testutfnext smallutf} { - testutfnext \xF4\xA0\xA0\xA0\xD0 +test utf-6.76.0 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 test utf-6.76.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF4\xA0\xA0\xA0\xD0 + testutfnext \xF2\xA0\xA0\xA0\xD0 } 4 -test utf-6.77 {Tcl_UtfNext} {testutfnext smallutf} { - testutfnext \xF4\xA0\xA0\xA0\xE8 +test utf-6.77.0 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 test utf-6.77.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF4\xA0\xA0\xA0\xE8 + testutfnext \xF2\xA0\xA0\xA0\xE8 } 4 -test utf-6.78 {Tcl_UtfNext} {testutfnext smallutf} { - testutfnext \xF4\xA0\xA0\xA0\xF4 +test utf-6.78.0 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 test utf-6.78.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF4\xA0\xA0\xA0\xF4 + testutfnext \xF2\xA0\xA0\xA0\xF2 } 4 -test utf-6.79 {Tcl_UtfNext} {testutfnext smallutf} { - testutfnext \xF4\xA0\xA0\xA0G\xF8 +test utf-6.79.0 {Tcl_UtfNext} {testutfnext smallutf} { + testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.79.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF4\xA0\xA0\xA0G\xF8 + testutfnext \xF2\xA0\xA0\xA0G\xF8 } 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xC0\x80 @@ -437,10 +440,10 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext smallutf} { +test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext smallutf} { testutfnext \xF0\x90\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} { +test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} { testutfnext \xF0\x90\x80\x80 } 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { @@ -455,8 +458,18 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {te test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 } 3 - -testConstraint testutfprev [llength [info commands testutfprev]] +test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext smallutf} { + testutfnext \xF4\x8F\xBF\xBF +} 1 +test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { + testutfnext \xF4\x8F\xBF\xBF +} 4 +test utf-6.91.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext smallutf} { + testutfnext \xF4\x90\x80\x80 +} 1 +test utf-6.91.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { + testutfnext \xF4\x90\x80\x80 +} 4 test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testutfnext { testutfnext \xA0\xA0\xA0 } 3 @@ -489,13 +502,13 @@ test utf-7.4.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xF8\xA0\xA0 2 } 1 test utf-7.5 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4 + testutfprev A\xF2 } 1 test utf-7.5.1 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xA0 2 + testutfprev A\xF2\xA0\xA0\xA0 2 } 1 test utf-7.5.2 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xF8\xA0\xA0 2 + testutfprev A\xF2\xF8\xA0\xA0 2 } 1 test utf-7.6 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8 @@ -533,23 +546,23 @@ test utf-7.9.1 {Tcl_UtfPrev} testutfprev { test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 -test utf-7.10 {Tcl_UtfPrev} {testutfprev smallutf} { - testutfprev A\xF4\xA0 +test utf-7.10.0 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF2\xA0 } 2 -test utf-7.10.1 {Tcl_UtfPrev} {testutfprev smallutf} { - testutfprev A\xF4\xA0\xA0\xA0 3 -} 2 -test utf-7.10.2 {Tcl_UtfPrev} {testutfprev smallutf} { - testutfprev A\xF4\xA0\xF8\xA0 3 -} 2 -test utf-7.10 {Tcl_UtfPrev} {testutfprev fullutf} { - testutfprev A\xF4\xA0 -} 1 test utf-7.10.1 {Tcl_UtfPrev} {testutfprev fullutf} { - testutfprev A\xF4\xA0\xA0\xA0 3 + testutfprev A\xF2\xA0 } 1 -test utf-7.10.2 {Tcl_UtfPrev} {testutfprev fullutf} { - testutfprev A\xF4\xA0\xF8\xA0 3 +test utf-7.10.1.0 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF2\xA0\xA0\xA0 3 +} 2 +test utf-7.10.1.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xA0 3 +} 1 +test utf-7.10.2.0 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF2\xA0\xF8\xA0 3 +} 2 +test utf-7.10.2.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xF8\xA0 3 } 1 test utf-7.11 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0 @@ -590,23 +603,23 @@ test utf-7.14.1 {Tcl_UtfPrev} testutfprev { test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 -test utf-7.15 {Tcl_UtfPrev} {testutfprev smallutf} { - testutfprev A\xF4\xA0\xA0 -} 3 -test utf-7.15.1 {Tcl_UtfPrev} {testutfprev smallutf} { - testutfprev A\xF4\xA0\xA0\xA0 4 +test utf-7.15.0 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF2\xA0\xA0 } 3 -test utf-7.15.2 {Tcl_UtfPrev} {testutfprev smallutf} { - testutfprev A\xF4\xA0\xA0\xF8 4 -} 3 -test utf-7.15.3 {Tcl_UtfPrev} {testutfprev fullutf} { - testutfprev A\xF4\xA0\xA0 +test utf-7.15.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0 } 1 -test utf-7.15.4 {Tcl_UtfPrev} {testutfprev fullutf} { - testutfprev A\xF4\xA0\xA0\xA0 4 +test utf-7.15.1.0 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF2\xA0\xA0\xA0 4 +} 3 +test utf-7.15.1.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xA0 4 } 1 -test utf-7.15.5 {Tcl_UtfPrev} {testutfprev fullutf} { - testutfprev A\xF4\xA0\xA0\xF8 4 +test utf-7.15.2.0 {Tcl_UtfPrev} {testutfprev smallutf} { + testutfprev A\xF2\xA0\xA0\xF8 4 +} 3 +test utf-7.15.2.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xF8 4 } 1 test utf-7.16 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0 @@ -728,19 +741,19 @@ test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { +test utf-7.39.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { testutfprev A\xF0\x90\x80\x80 } 2 test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { testutfprev A\xF0\x90\x80\x80 } 1 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { +test utf-7.40.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { testutfprev A\xF0\x90\x80\x80 4 } 3 test utf-7.40.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { testutfprev A\xF0\x90\x80\x80 4 } 1 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { +test utf-7.41.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev smallutf} { testutfprev A\xF0\x90\x80\x80 3 } 2 test utf-7.41.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { @@ -773,6 +786,48 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { testutfprev \xE8\xA0\x00 2 } 0 +test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev smallutf} { + testutfprev A\xF4\x8F\xBF\xBF +} 2 +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x8F\xBF\xBF +} 1 +test utf-7.48.1.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev smallutf} { + testutfprev A\xF4\x8F\xBF\xBF 4 +} 3 +test utf-7.48.1.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x8F\xBF\xBF 4 +} 1 +test utf-7.48.2.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev smallutf} { + testutfprev A\xF4\x8F\xBF\xBF 3 +} 2 +test utf-7.48.2.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x8F\xBF\xBF 3 +} 1 +test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x8F\xBF\xBF 2 +} 1 +test utf-7.49.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev smallutf} { + testutfprev A\xF4\x90\x80\x80 +} 2 +test utf-7.49.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x90\x80\x80 +} 1 +test utf-7.49.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev smallutf} { + testutfprev A\xF4\x90\x80\x80 4 +} 3 +test utf-7.49.3 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x90\x80\x80 4 +} 1 +test utf-7.49.4 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev smallutf} { + testutfprev A\xF4\x90\x80\x80 3 +} 2 +test utf-7.49.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x90\x80\x80 3 +} 1 +test utf-7.49.6 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x90\x80\x80 2 +} 1 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 @@ -786,6 +841,18 @@ test utf-8.3 {Tcl_UniCharAtIndex: index > 0} { test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { string index \u4E4E\u25A\xFF\u543 2 } "\uFF" +test utf-8.5 {Tcl_UniCharAtIndex: high surrogate} smallutf { + string index \uD842 0 +} "\uD842" +test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { + string index \uDC42 0 +} "\uDC42" +test utf-8.7 {Tcl_UniCharAtIndex: Emoji} smallutf { + string index \uD83D\uDE00 0 +} "\uD83D" +test utf-8.8 {Tcl_UniCharAtIndex: Emoji} { + string index \uD83D\uDE00 1 +} "\uDE00" test utf-9.1 {Tcl_UtfAtIndex: index = 0} { string range abcd 0 2 @@ -793,6 +860,12 @@ test utf-9.1 {Tcl_UtfAtIndex: index = 0} { test utf-9.2 {Tcl_UtfAtIndex: index > 0} { string range \u4E4E\u25A\xFF\u543klmnop 1 5 } "\u25A\xFF\u543kl" +test utf-9.3 {Tcl_UtfAtIndex: index = 0, Emoji} smallutf { + string range \uD83D\uDE00G 0 0 +} "\uD83D" +test utf-9.4 {Tcl_UtfAtIndex: index > 0, Emoji} smallutf { + string range \uD83D\uDE00G 1 1 +} "\uDE00" test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { @@ -891,11 +964,11 @@ test utf-11.2 {Tcl_UtfToUpper} { string toupper abc } ABC test utf-11.3 {Tcl_UtfToUpper} { - string toupper \u00E3AB -} \u00C3AB + string toupper \xE3gh +} \xC3GH test utf-11.4 {Tcl_UtfToUpper} { - string toupper \u01E3AB -} \u01E2AB + string toupper \u01E3gh +} \u01E2GH test utf-11.5 {Tcl_UtfToUpper Georgian (new in Unicode 11)} { string toupper \u10D0\u1C90 } \u1C90\u1C90 @@ -907,14 +980,17 @@ test utf-12.2 {Tcl_UtfToLower} { string tolower ABC } abc test utf-12.3 {Tcl_UtfToLower} { - string tolower \u00C3AB -} \u00E3ab + string tolower \xC3GH +} \xE3gh test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01E2AB -} \u01E3ab + string tolower \u01E2GH +} \u01E3gh test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} { string tolower \u10D0\u1C90 } \u10D0\u10D0 +test utf-12.6 {Tcl_UtfToUpper low/high surrogate)} smallutf { + string tolower \uDC24\uD824 +} \uDC24\uD824 test utf-13.1 {Tcl_UtfToTitle} { string totitle {} @@ -923,8 +999,8 @@ test utf-13.2 {Tcl_UtfToTitle} { string totitle abc } Abc test utf-13.3 {Tcl_UtfToTitle} { - string totitle \u00E3AB -} \u00C3ab + string totitle \xE3GH +} \xC3gh test utf-13.4 {Tcl_UtfToTitle} { string totitle \u01F3AB } \u01F2ab @@ -934,6 +1010,9 @@ test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { string totitle \u1C90\u10D0 } \u1C90\u10D0 +test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} smallutf { + string totitle \uDC24\uD824 +} \uDC24\uD824 test utf-14.1 {Tcl_UtfNcasecmp} { string compare -nocase a b @@ -952,7 +1031,7 @@ test utf-15.1 {Tcl_UniCharToUpper, negative delta} { string toupper aA } AA test utf-15.2 {Tcl_UniCharToUpper, positive delta} { - string toupper \u0178\u00FF + string toupper \u0178\xFF } \u0178\u0178 test utf-15.3 {Tcl_UniCharToUpper, no delta} { string toupper ! @@ -962,8 +1041,8 @@ test utf-16.1 {Tcl_UniCharToLower, negative delta} { string tolower aA } aa test utf-16.2 {Tcl_UniCharToLower, positive delta} { - string tolower \u0178\u00FF\uA78D\u01C5 -} \u00FF\u00FF\u0265\u01C6 + string tolower \u0178\xFF\uA78D\u01C5 +} \xFF\xFF\u0265\u01C6 test utf-17.1 {Tcl_UniCharToLower, no delta} { string tolower ! @@ -977,9 +1056,9 @@ test utf-18.2 {Tcl_UniCharToTitle, subtract one for title} { } \u01C5 test utf-18.3 {Tcl_UniCharToTitle, subtract delta for title (positive)} { string totitle \u017F -} \u0053 +} \x53 test utf-18.4 {Tcl_UniCharToTitle, subtract delta for title (negative)} { - string totitle \u00FF + string totitle \xFF } \u0178 test utf-18.5 {Tcl_UniCharToTitle, no delta} { string totitle ! @@ -1016,31 +1095,31 @@ test utf-21.5 {unicode graph char in regc_locale.c} { } {1} test utf-21.6 {TclUniCharIsGraph} { # [Bug 3464428] - string is graph \u00A0 + string is graph \xA0 } {0} test utf-21.7 {unicode graph char in regc_locale.c} { # [Bug 3464428] - regexp {[[:graph:]]} \u0020\u00A0\u2028\u2029 + regexp {[[:graph:]]} \x20\xA0\u2028\u2029 } {0} test utf-21.8 {TclUniCharIsPrint} { # [Bug 3464428] - string is print \u0009 + string is print \x09 } {0} test utf-21.9 {unicode print char in regc_locale.c} { # [Bug 3464428] - regexp {[[:print:]]} \u0009 + regexp {[[:print:]]} \x09 } {0} test utf-21.10 {unicode print char in regc_locale.c} { # [Bug 3464428] - regexp {[[:print:]]} \u0009 + regexp {[[:print:]]} \x09 } {0} test utf-21.11 {TclUniCharIsControl} { # [Bug 3464428] - string is control \u0000\u001F\u00AD\u0605\u061C\u180E\u2066\uFEFF + string is control \x00\x1F\xAD\u0605\u061C\u180E\u2066\uFEFF } {1} test utf-21.12 {unicode control char in regc_locale.c} { # [Bug 3464428], [Bug a876646efe] - regexp {^[[:cntrl:]]*$} \u0000\u001F\u00AD\u0605\u061C\u180E\u2066\uFEFF + regexp {^[[:cntrl:]]*$} \x00\x1F\xAD\u0605\u061C\u180E\u2066\uFEFF } {1} test utf-22.1 {TclUniCharIsWordChar} { @@ -1070,15 +1149,13 @@ test utf-24.2 {unicode digit char in regc_locale.c} { test utf-24.3 {TclUniCharIsSpace} { # this returns 1 with Unicode 7/TIP 413 compliance - string is space \u0085\u1680\u180E\u200B\u202F\u2060 + string is space \x85\u1680\u180E\u200B\u202F\u2060 } {1} test utf-24.4 {unicode space char in regc_locale.c} { # this returns 1 with Unicode 7/TIP 413 compliance - list [regexp {^[[:space:]]+$} \u0085\u1680\u180E\u200B\u202F\u2060] [regexp {^\s+$} \u0085\u1680\u180E\u200B\u202F\u2060] + list [regexp {^[[:space:]]+$} \x85\u1680\u180E\u200B\u202F\u2060] [regexp {^\s+$} \x85\u1680\u180E\u200B\u202F\u2060] } {1 1} -testConstraint teststringobj [llength [info commands teststringobj]] - test utf-25.1 {Tcl_UniCharNcasecmp} -constraints teststringobj \ -setup { testobj freeallvars -- cgit v0.12 From 4bd49520dbb738edab80937a9204984f77838e76 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 14:52:48 +0000 Subject: Revert the other encoding system backport. The blocking and failing tests are illustrations of existing tickets [1004065] and [1122671], recording that the encoding machinery hardcodes assumptions in multiple places that sizeof(Tcl_UniChar) == 2. Closing the segfault bug fix should not be hostage to fixing those old bugs. --- generic/tclEncoding.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 66bec44..6c16827 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2470,33 +2470,20 @@ UtfToUnicodeProc( if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; - } + } src += TclUtfToUniChar(src, &ch); /* * Need to handle this in a way that won't cause misalignment * by casting dst to a Tcl_UniChar. [Bug 1122671] + * XXX: This hard-codes the assumed size of Tcl_UniChar as 2. */ #ifdef WORDS_BIGENDIAN -#if TCL_UTF_MAX > 3 - *dst++ = (ch >> 24); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = (ch & 0xFF); -#else *dst++ = (ch >> 8); *dst++ = (ch & 0xFF); -#endif -#else -#if TCL_UTF_MAX > 3 - *dst++ = (ch & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = (ch >> 24); #else *dst++ = (ch & 0xFF); *dst++ = (ch >> 8); #endif -#endif } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; -- cgit v0.12 From 58bd16e955784870b23c693902c7422ee7a0a7f4 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 15:41:01 +0000 Subject: Move testing command [testsize] from Windows to generic. Extend it to report sizeof(Tcl_UniChar). --- generic/tclTest.c | 31 +++++++++++++++++++++++++++++++ win/tclWinTest.c | 28 ---------------------------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index 8c29aa7..b9fd204 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -281,6 +281,7 @@ static Tcl_CmdProc Testset2Cmd; static Tcl_CmdProc TestseterrorcodeCmd; static Tcl_ObjCmdProc TestsetobjerrorcodeCmd; static Tcl_CmdProc TestsetplatformCmd; +static Tcl_ObjCmdProc TestsizeCmd; static Tcl_CmdProc TeststaticpkgCmd; static Tcl_CmdProc TesttranslatefilenameCmd; static Tcl_CmdProc TestupvarCmd; @@ -592,6 +593,7 @@ Tcltest_Init( TestFindLastCmd, NULL, NULL); Tcl_CreateCommand(interp, "testsetplatform", TestsetplatformCmd, NULL, NULL); + Tcl_CreateObjCommand(interp, "testsize", TestsizeCmd, NULL, NULL); Tcl_CreateCommand(interp, "teststaticpkg", TeststaticpkgCmd, NULL, NULL); Tcl_CreateCommand(interp, "testtranslatefilename", @@ -4122,6 +4124,35 @@ TestsetplatformCmd( return TCL_OK; } +static int +TestsizeCmd( + ClientData clientData, /* Unused */ + Tcl_Interp* interp, /* Tcl interpreter */ + int objc, /* Parameter count */ + Tcl_Obj *const * objv) /* Parameter vector */ +{ + if (objc != 2) { + goto syntax; + } + if (strcmp(Tcl_GetString(objv[1]), "time_t") == 0) { + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(time_t))); + return TCL_OK; + } + if (strcmp(Tcl_GetString(objv[1]), "st_mtime") == 0) { + Tcl_StatBuf *statPtr; + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(statPtr->st_mtime))); + return TCL_OK; + } + if (strcmp(Tcl_GetString(objv[1]), "unichar") == 0) { + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(Tcl_UniChar))); + return TCL_OK; + } + +syntax: + Tcl_WrongNumArgs(interp, 1, objv, "time_t|st_mtime|unichar"); + return TCL_ERROR; +} + /* *---------------------------------------------------------------------- * diff --git a/win/tclWinTest.c b/win/tclWinTest.c index 04878fe..7f49b63 100644 --- a/win/tclWinTest.c +++ b/win/tclWinTest.c @@ -39,8 +39,6 @@ static int TestwinclockCmd(ClientData dummy, Tcl_Interp* interp, int objc, Tcl_Obj *const objv[]); static int TestwinsleepCmd(ClientData dummy, Tcl_Interp* interp, int objc, Tcl_Obj *const objv[]); -static int TestSizeCmd(ClientData dummy, Tcl_Interp* interp, - int objc, Tcl_Obj *const objv[]); static Tcl_ObjCmdProc TestExceptionCmd; static int TestplatformChmod(const char *nativePath, int pmode); static int TestchmodCmd(ClientData dummy, @@ -78,7 +76,6 @@ TclplatformtestInit( Tcl_CreateObjCommand(interp, "testwinclock", TestwinclockCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testwinsleep", TestwinsleepCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testexcept", TestExceptionCmd, NULL, NULL); - Tcl_CreateObjCommand(interp, "testsize", TestSizeCmd, NULL, NULL); return TCL_OK; } @@ -312,31 +309,6 @@ TestwinsleepCmd( return TCL_OK; } -static int -TestSizeCmd( - ClientData clientData, /* Unused */ - Tcl_Interp* interp, /* Tcl interpreter */ - int objc, /* Parameter count */ - Tcl_Obj *const * objv) /* Parameter vector */ -{ - if (objc != 2) { - goto syntax; - } - if (strcmp(Tcl_GetString(objv[1]), "time_t") == 0) { - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(time_t))); - return TCL_OK; - } - if (strcmp(Tcl_GetString(objv[1]), "st_mtime") == 0) { - Tcl_StatBuf *statPtr; - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(statPtr->st_mtime))); - return TCL_OK; - } - -syntax: - Tcl_WrongNumArgs(interp, 1, objv, "time_t|st_mtime"); - return TCL_ERROR; -} - /* *---------------------------------------------------------------------- * -- cgit v0.12 From 3658973ee93cccd5d084b6c77c58d269a7299683 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 15:52:03 +0000 Subject: Use new testing command to constrain tests to (sizeof(Tcl_UniChar) == 2) until bugs are fixed when (sizeof(Tcl_UniChar == 4). --- tests/chanio.test | 4 +++- tests/encoding.test | 6 +++++- tests/io.test | 5 ++++- tests/source.test | 5 ++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/chanio.test b/tests/chanio.test index 5fae431..c2f561b 100644 --- a/tests/chanio.test +++ b/tests/chanio.test @@ -29,6 +29,8 @@ namespace eval ::tcl::test::io { variable msg variable expected + testConstraint ucs2 [expr { [llength [info commands testsize]] && + ([testsize unichar] == 2) }] testConstraint testchannel [llength [info commands testchannel]] testConstraint exec [llength [info commands exec]] testConstraint openpipe 1 @@ -875,7 +877,7 @@ test chan-io-6.44 {Tcl_GetsObj: input saw cr, not followed by cr} {stdio testcha chan close $f set x } [list "bbbbbbbbbbbbbbb" 15 "123456789abcdef" 1 4 "abcd" 0 3 "efg"] -test chan-io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent} { +test chan-io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent ucs2} { # Tcl_ExternalToUtf() set f [open "|[list [interpreter] $path(cat)]" w+] diff --git a/tests/encoding.test b/tests/encoding.test index 8722a93..ad55e26 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -32,6 +32,10 @@ proc runtests {} { testConstraint testencoding [llength [info commands testencoding]] testConstraint exec [llength [info commands exec]] +testConstraint ucs2 [expr { [llength [info commands testsize]] && + ([testsize unichar] == 2) }] + + # TclInitEncodingSubsystem is tested by the rest of this file # TclFinalizeEncodingSubsystem is not currently tested @@ -316,7 +320,7 @@ test encoding-15.3 {UtfToUtfProc null character input} { list [string bytelength $x] [string bytelength $y] $z } {1 2 c080} -test encoding-16.1 {UnicodeToUtfProc} { +test encoding-16.1 {UnicodeToUtfProc} ucs2 { set val [encoding convertfrom unicode NN] list $val [format %x [scan $val %c]] } "\u4e4e 4e4e" diff --git a/tests/io.test b/tests/io.test index 04fa1d2..1c18576 100644 --- a/tests/io.test +++ b/tests/io.test @@ -29,6 +29,9 @@ namespace eval ::tcl::test::io { variable msg variable expected +testConstraint ucs2 [expr { [llength [info commands testsize]] && + ([testsize unichar] == 2) }] + testConstraint testchannel [llength [info commands testchannel]] testConstraint exec [llength [info commands exec]] testConstraint openpipe 1 @@ -910,7 +913,7 @@ test io-6.44 {Tcl_GetsObj: input saw cr, not followed by cr} {stdio testchannel close $f set x } [list "bbbbbbbbbbbbbbb" 15 "123456789abcdef" 1 4 "abcd" 0 3 "efg"] -test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent} { +test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent ucs2} { # Tcl_ExternalToUtf() set f [open "|[list [interpreter] $path(cat)]" w+] diff --git a/tests/source.test b/tests/source.test index dc3c2d8..8511004 100644 --- a/tests/source.test +++ b/tests/source.test @@ -20,6 +20,9 @@ if {[catch {package require tcltest 2.1}]} { namespace eval ::tcl::test::source { namespace import ::tcltest::* +testConstraint ucs2 [expr { [llength [info commands testsize]] && + ([testsize unichar] == 2) }] + test source-1.1 {source command} -setup { set x "old x value" set y "old y value" @@ -232,7 +235,7 @@ test source-7.1 {source -encoding test} -setup { } -cleanup { removeFile source.file } -result correct -test source-7.2 {source -encoding test} -setup { +test source-7.2 {source -encoding test} -constraints ucs2 -setup { # This tests for bad interactions between [source -encoding] # and use of the Control-Z character (\u001A) as a cross-platform # EOF character by [source]. Here we write out and the [source] a -- cgit v0.12 From 2edabddfa00c43f7dcb636294060014c31aa6332 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 15:56:34 +0000 Subject: remove merge litter --- tests/util.test | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/util.test b/tests/util.test index a483de1..85c06dd 100644 --- a/tests/util.test +++ b/tests/util.test @@ -15,7 +15,6 @@ if {[lsearch [namespace children] ::tcltest] == -1} { testConstraint testdstring [llength [info commands testdstring]] testConstraint testconcatobj [llength [info commands testconcatobj]] testConstraint testdoubledigits [llength [info commands testdoubledigits]] -testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] # Big test for correct ordering of data in [expr] -- cgit v0.12 From a4abe0e0385a822e1de6e6c700ce1adf47548966 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 21 Apr 2020 19:45:34 +0000 Subject: Improve the "testutfnext" command. It can now accept both bytes and strings, and it will test whether src[-1] is read without needing test-variations for it. --- generic/tclTest.c | 48 +++++++------ tests/utf.test | 208 +++++++++++++++++++++++++++--------------------------- 2 files changed, 131 insertions(+), 125 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index b9fd204..7a531b4 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -7113,7 +7113,7 @@ SimpleListVolumes(void) /* * Used to check operations of Tcl_UtfNext. * - * Usage: testutfnext $bytes $offset + * Usage: testutfnext -bytestring $bytes */ static int @@ -7123,37 +7123,43 @@ TestUtfNextCmd( int objc, Tcl_Obj *const objv[]) { - int numBytes, offset = 0; + int numBytes; char *bytes; - const char *result; - Tcl_Obj *copy; + const char *result, *first; + char buffer[32]; + static const char tobetested[] = "\xFF\xFE\xF4\xF2\xF0\xEF\xE8\xE3\xE2\xE1\xE0\xC2\xC1\xC0\x82"; + const char *p = tobetested; + + if (objc != 3 || strcmp(Tcl_GetString(objv[1]), "-bytestring")) { + if (objc != 2) { + Tcl_WrongNumArgs(interp, 1, objv, "?-bytestring? bytes"); + return TCL_ERROR; + } + bytes = Tcl_GetStringFromObj(objv[1], &numBytes); + } else { + bytes = (char *) Tcl_GetByteArrayFromObj(objv[2], &numBytes); + } - if (objc < 2 || objc > 3) { - Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?"); + if (numBytes > sizeof(buffer)-2) { + Tcl_AppendResult(interp, "\"testutfnext\" can only handle 30 bytes", NULL); return TCL_ERROR; } - bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); + memcpy(buffer + 1, bytes, numBytes); + buffer[0] = buffer[numBytes + 1] = '\x00'; - if (objc == 3) { - if (TCL_OK != TclGetIntForIndex(interp, objv[2], numBytes, &offset)) { + first = Tcl_UtfNext(buffer + 1); + while ((buffer[0] = *p++) != '\0') { + /* Run Tcl_UtfNext with many more possible bytes at src[-1], all should give the same result */ + result = Tcl_UtfNext(buffer + 1); + if (first != result) { + Tcl_AppendResult(interp, "Tcl_UtfNext is not supposed to read src[-1]", NULL); return TCL_ERROR; } - if (offset < 0) { - offset = 0; - } - if (offset > numBytes) { - offset = numBytes; - } } - copy = Tcl_DuplicateObj(objv[1]); - bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); - bytes[numBytes] = '\0'; - result = Tcl_UtfNext(bytes + offset); - Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - buffer - 1)); - Tcl_DecrRefCount(copy); return TCL_OK; } /* diff --git a/tests/utf.test b/tests/utf.test index 1c79f32..0a81ae3 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -143,7 +143,7 @@ test utf-5.2 {Tcl_UtfFindLast} {testfindlast testbytestring} { test utf-6.1 {Tcl_UtfNext} testutfnext { # This takes the pointer one past the terminating NUL. # This is really an invalid call. - testutfnext {} + testutfnext -bytestring {} } 1 test utf-6.2 {Tcl_UtfNext} testutfnext { testutfnext A @@ -152,301 +152,301 @@ test utf-6.3 {Tcl_UtfNext} testutfnext { testutfnext AA } 1 test utf-6.4 {Tcl_UtfNext} testutfnext { - testutfnext A\xA0 + testutfnext -bytestring A\xA0 } 1 test utf-6.5 {Tcl_UtfNext} testutfnext { - testutfnext A\xD0 + testutfnext -bytestring A\xD0 } 1 test utf-6.6 {Tcl_UtfNext} testutfnext { - testutfnext A\xE8 + testutfnext -bytestring A\xE8 } 1 test utf-6.7 {Tcl_UtfNext} testutfnext { - testutfnext A\xF2 + testutfnext -bytestring A\xF2 } 1 test utf-6.8 {Tcl_UtfNext} testutfnext { - testutfnext A\xF8 + testutfnext -bytestring A\xF8 } 1 test utf-6.9 {Tcl_UtfNext} testutfnext { - testutfnext \xA0 + testutfnext -bytestring \xA0 } 1 test utf-6.10 {Tcl_UtfNext} testutfnext { - testutfnext \xA0G + testutfnext -bytestring \xA0G } 1 test utf-6.11 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xA0 + testutfnext -bytestring \xA0\xA0 } 1 test utf-6.12 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xD0 + testutfnext -bytestring \xA0\xD0 } 1 test utf-6.13 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xE8 + testutfnext -bytestring \xA0\xE8 } 1 test utf-6.14 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xF2 + testutfnext -bytestring \xA0\xF2 } 1 test utf-6.15 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xF8 + testutfnext -bytestring \xA0\xF8 } 1 test utf-6.16 {Tcl_UtfNext} testutfnext { - testutfnext \xD0 + testutfnext -bytestring \xD0 } 1 test utf-6.17 {Tcl_UtfNext} testutfnext { - testutfnext \xD0G + testutfnext -bytestring \xD0G } 1 test utf-6.18 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0 + testutfnext -bytestring \xD0\xA0 } 2 test utf-6.19 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xD0 + testutfnext -bytestring \xD0\xD0 } 1 test utf-6.20 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xE8 + testutfnext -bytestring \xD0\xE8 } 1 test utf-6.21 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xF2 + testutfnext -bytestring \xD0\xF2 } 1 test utf-6.22 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xF8 + testutfnext -bytestring \xD0\xF8 } 1 test utf-6.23 {Tcl_UtfNext} testutfnext { - testutfnext \xE8 + testutfnext -bytestring \xE8 } 1 test utf-6.24 {Tcl_UtfNext} testutfnext { - testutfnext \xE8G + testutfnext -bytestring \xE8G } 1 test utf-6.25 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0 + testutfnext -bytestring \xE8\xA0 } 1 test utf-6.26 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xD0 + testutfnext -bytestring \xE8\xD0 } 1 test utf-6.27 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xE8 + testutfnext -bytestring \xE8\xE8 } 1 test utf-6.28 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xF2 + testutfnext -bytestring \xE8\xF2 } 1 test utf-6.29 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xF8 + testutfnext -bytestring \xE8\xF8 } 1 test utf-6.30 {Tcl_UtfNext} testutfnext { - testutfnext \xF2 + testutfnext -bytestring \xF2 } 1 test utf-6.31 {Tcl_UtfNext} testutfnext { - testutfnext \xF2G + testutfnext -bytestring \xF2G } 1 test utf-6.32 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0 + testutfnext -bytestring \xF2\xA0 } 1 test utf-6.33 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xD0 + testutfnext -bytestring \xF2\xD0 } 1 test utf-6.34 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xE8 + testutfnext -bytestring \xF2\xE8 } 1 test utf-6.35 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xF2 + testutfnext -bytestring \xF2\xF2 } 1 test utf-6.36 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xF8 + testutfnext -bytestring \xF2\xF8 } 1 test utf-6.37 {Tcl_UtfNext} testutfnext { - testutfnext \xF8 + testutfnext -bytestring \xF8 } 1 test utf-6.38 {Tcl_UtfNext} testutfnext { - testutfnext \xF8G + testutfnext -bytestring \xF8G } 1 test utf-6.39 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xA0 + testutfnext -bytestring \xF8\xA0 } 1 test utf-6.40 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xD0 + testutfnext -bytestring \xF8\xD0 } 1 test utf-6.41 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xE8 + testutfnext -bytestring \xF8\xE8 } 1 test utf-6.42 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xF2 + testutfnext -bytestring \xF8\xF2 } 1 test utf-6.43 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xF8 + testutfnext -bytestring \xF8\xF8 } 1 test utf-6.44 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0G + testutfnext -bytestring \xD0\xA0G } 2 test utf-6.45 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xA0 + testutfnext -bytestring \xD0\xA0\xA0 } 2 test utf-6.46 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xD0 + testutfnext -bytestring \xD0\xA0\xD0 } 2 test utf-6.47 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xE8 + testutfnext -bytestring \xD0\xA0\xE8 } 2 test utf-6.48 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xF2 + testutfnext -bytestring \xD0\xA0\xF2 } 2 test utf-6.49 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xF8 + testutfnext -bytestring \xD0\xA0\xF8 } 2 test utf-6.50 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0G + testutfnext -bytestring \xE8\xA0G } 1 test utf-6.51 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0 + testutfnext -bytestring \xE8\xA0\xA0 } 3 test utf-6.52 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xD0 + testutfnext -bytestring \xE8\xA0\xD0 } 1 test utf-6.53 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xE8 + testutfnext -bytestring \xE8\xA0\xE8 } 1 test utf-6.54 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xF2 + testutfnext -bytestring \xE8\xA0\xF2 } 1 test utf-6.55 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xF8 + testutfnext -bytestring \xE8\xA0\xF8 } 1 test utf-6.56 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0G + testutfnext -bytestring \xF2\xA0G } 1 test utf-6.57 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0 } 1 test utf-6.58 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xD0 + testutfnext -bytestring \xF2\xA0\xD0 } 1 test utf-6.59 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xE8 + testutfnext -bytestring \xF2\xA0\xE8 } 1 test utf-6.60 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xF2 + testutfnext -bytestring \xF2\xA0\xF2 } 1 test utf-6.61 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xF8 + testutfnext -bytestring \xF2\xA0\xF8 } 1 test utf-6.62 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0G + testutfnext -bytestring \xE8\xA0\xA0G } 3 test utf-6.63 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xA0 + testutfnext -bytestring \xE8\xA0\xA0\xA0 } 3 test utf-6.64 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xD0 + testutfnext -bytestring \xE8\xA0\xA0\xD0 } 3 test utf-6.65 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xE8 + testutfnext -bytestring \xE8\xA0\xA0\xE8 } 3 test utf-6.66 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xF2 + testutfnext -bytestring \xE8\xA0\xA0\xF2 } 3 test utf-6.67 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xF8 + testutfnext -bytestring \xE8\xA0\xA0\xF8 } 3 test utf-6.68 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0G + testutfnext -bytestring \xF2\xA0\xA0G } 1 test utf-6.69.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0\xA0 } 1 test utf-6.69.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0\xA0 } 4 test utf-6.70 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0\xD0 + testutfnext -bytestring \xF2\xA0\xA0\xD0 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0\xE8 + testutfnext -bytestring \xF2\xA0\xA0\xE8 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0\xF2 + testutfnext -bytestring \xF2\xA0\xA0\xF2 } 1 test utf-6.73 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0\xF8 + testutfnext -bytestring \xF2\xA0\xA0\xF8 } 1 test utf-6.74.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0G + testutfnext -bytestring \xF2\xA0\xA0\xA0G } 1 test utf-6.74.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0G + testutfnext -bytestring \xF2\xA0\xA0\xA0G } 4 test utf-6.75.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xA0 } 1 test utf-6.75.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xA0 } 4 test utf-6.76.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0\xD0 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xD0 } 1 test utf-6.76.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0\xD0 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xD0 } 4 test utf-6.77.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0\xE8 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xE8 } 1 test utf-6.77.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0\xE8 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xE8 } 4 test utf-6.78.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0\xF2 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xF2 } 1 test utf-6.78.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0\xF2 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xF2 } 4 test utf-6.79.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0G\xF8 + testutfnext -bytestring \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.79.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0G\xF8 + testutfnext -bytestring \xF2\xA0\xA0\xA0G\xF8 } 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xC0\x80 + testutfnext -bytestring \xC0\x80 } 2 test utf-6.81 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xC0\x81 + testutfnext -bytestring \xC0\x81 } 1 test utf-6.82 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xC1\x80 + testutfnext -bytestring \xC1\x80 } 1 test utf-6.83 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xC2\x80 + testutfnext -bytestring \xC2\x80 } 2 test utf-6.84 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xE0\x80\x80 + testutfnext -bytestring \xE0\x80\x80 } 1 test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xE0\xA0\x80 + testutfnext -bytestring \xE0\xA0\x80 } 3 test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xF0\x80\x80\x80 + testutfnext -bytestring \xF0\x80\x80\x80 } 1 test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { - testutfnext \xF0\x90\x80\x80 + testutfnext -bytestring \xF0\x90\x80\x80 } 1 test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} { - testutfnext \xF0\x90\x80\x80 + testutfnext -bytestring \xF0\x90\x80\x80 } 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { - testutfnext \xA0\xA0 + testutfnext -bytestring \xA0\xA0 } 1 -test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { - testutfnext \xE8\xA0\xA0 1 -} 2 test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { - testutfnext \x80\x80 + testutfnext -bytestring \x80\x80 } 1 -test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { - testutfnext \xF0\x80\x80 1 -} 2 test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { - testutfnext \xF4\x8F\xBF\xBF + testutfnext -bytestring \xF4\x8F\xBF\xBF } 1 test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { - testutfnext \xF4\x8F\xBF\xBF + testutfnext -bytestring \xF4\x8F\xBF\xBF } 4 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { - testutfnext \xF4\x90\x80\x80 + testutfnext -bytestring \xF4\x90\x80\x80 +} 1 +test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { + testutfnext -bytestring \xA0\xA0\xA0 +} 1 +test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { + testutfnext -bytestring \x80\x80\x80 } 1 test utf-7.1 {Tcl_UtfPrev} testutfprev { -- cgit v0.12 From df4b16195a9f1fb761209983176716a72ac96cc0 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 22 Apr 2020 10:02:06 +0000 Subject: Determine "testConstraint ucs2" without the need for a testcommand. Rename "compat85" testConstraint to "ucs2", because that's what it actually is. --- tests/chanio.test | 3 +-- tests/encoding.test | 3 +-- tests/io.test | 3 +-- tests/source.test | 3 +-- tests/utf.test | 52 ++++++++++++++++++++++++++-------------------------- 5 files changed, 30 insertions(+), 34 deletions(-) diff --git a/tests/chanio.test b/tests/chanio.test index c2f561b..db4544c 100644 --- a/tests/chanio.test +++ b/tests/chanio.test @@ -29,8 +29,7 @@ namespace eval ::tcl::test::io { variable msg variable expected - testConstraint ucs2 [expr { [llength [info commands testsize]] && - ([testsize unichar] == 2) }] + testConstraint ucs2 [expr {[format %c 0x010000] eq "\uFFFD"}] testConstraint testchannel [llength [info commands testchannel]] testConstraint exec [llength [info commands exec]] testConstraint openpipe 1 diff --git a/tests/encoding.test b/tests/encoding.test index ad55e26..af325c1 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -32,8 +32,7 @@ proc runtests {} { testConstraint testencoding [llength [info commands testencoding]] testConstraint exec [llength [info commands exec]] -testConstraint ucs2 [expr { [llength [info commands testsize]] && - ([testsize unichar] == 2) }] +testConstraint ucs2 [expr {[format %c 0x010000] eq "\uFFFD"}] # TclInitEncodingSubsystem is tested by the rest of this file diff --git a/tests/io.test b/tests/io.test index 1c18576..06be982 100644 --- a/tests/io.test +++ b/tests/io.test @@ -29,8 +29,7 @@ namespace eval ::tcl::test::io { variable msg variable expected -testConstraint ucs2 [expr { [llength [info commands testsize]] && - ([testsize unichar] == 2) }] +testConstraint ucs2 [expr {[format %c 0x010000] eq "\uFFFD"}] testConstraint testchannel [llength [info commands testchannel]] testConstraint exec [llength [info commands exec]] diff --git a/tests/source.test b/tests/source.test index 8511004..877921e 100644 --- a/tests/source.test +++ b/tests/source.test @@ -20,8 +20,7 @@ if {[catch {package require tcltest 2.1}]} { namespace eval ::tcl::test::source { namespace import ::tcltest::* -testConstraint ucs2 [expr { [llength [info commands testsize]] && - ([testsize unichar] == 2) }] +testConstraint ucs2 [expr {[format %c 0x010000] eq "\uFFFD"}] test source-1.1 {source command} -setup { set x "old x value" diff --git a/tests/utf.test b/tests/utf.test index 0a81ae3..fd4e396 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -13,7 +13,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { namespace import -force ::tcltest::* } -testConstraint compat85 [expr {[format %c 0x010000] eq "\uFFFD"}] +testConstraint ucs2 [expr {[format %c 0x010000] eq "\uFFFD"}] testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] testConstraint testbytestring [llength [info commands testbytestring]] @@ -66,13 +66,13 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestrin test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring { string length [testbytestring "\xE4\xB9\x8E"] } {1} -test utf-2.8.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { +test utf-2.8.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring ucs2} -body { string length [testbytestring "\xF0\x90\x80\x80"] } -result {4} test utf-2.8.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring fullutf} -body { string length [testbytestring "\xF0\x90\x80\x80"] } -result {1} -test utf-2.9.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { +test utf-2.9.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring ucs2} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] } -result {4} test utf-2.9.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring fullutf} -body { @@ -126,7 +126,7 @@ test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 3 } {3} -test utf-4.12.0 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring compat85} { +test utf-4.12.0 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring ucs2} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4 } {4} test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring fullutf} { @@ -346,7 +346,7 @@ test utf-6.67 {Tcl_UtfNext} testutfnext { test utf-6.68 {Tcl_UtfNext} testutfnext { testutfnext -bytestring \xF2\xA0\xA0G } 1 -test utf-6.69.0 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.69.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext -bytestring \xF2\xA0\xA0\xA0 } 1 test utf-6.69.1 {Tcl_UtfNext} {testutfnext fullutf} { @@ -364,37 +364,37 @@ test utf-6.71 {Tcl_UtfNext} testutfnext { test utf-6.73 {Tcl_UtfNext} testutfnext { testutfnext -bytestring \xF2\xA0\xA0\xF8 } 1 -test utf-6.74.0 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.74.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext -bytestring \xF2\xA0\xA0\xA0G } 1 test utf-6.74.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext -bytestring \xF2\xA0\xA0\xA0G } 4 -test utf-6.75.0 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.75.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext -bytestring \xF2\xA0\xA0\xA0\xA0 } 1 test utf-6.75.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext -bytestring \xF2\xA0\xA0\xA0\xA0 } 4 -test utf-6.76.0 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.76.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext -bytestring \xF2\xA0\xA0\xA0\xD0 } 1 test utf-6.76.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext -bytestring \xF2\xA0\xA0\xA0\xD0 } 4 -test utf-6.77.0 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.77.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext -bytestring \xF2\xA0\xA0\xA0\xE8 } 1 test utf-6.77.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext -bytestring \xF2\xA0\xA0\xA0\xE8 } 4 -test utf-6.78.0 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.78.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext -bytestring \xF2\xA0\xA0\xA0\xF2 } 1 test utf-6.78.1 {Tcl_UtfNext} {testutfnext fullutf} { testutfnext -bytestring \xF2\xA0\xA0\xA0\xF2 } 4 -test utf-6.79.0 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.79.0 {Tcl_UtfNext} {testutfnext ucs2} { testutfnext -bytestring \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.79.1 {Tcl_UtfNext} {testutfnext fullutf} { @@ -421,7 +421,7 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext -bytestring \xF0\x80\x80\x80 } 1 -test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { +test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext ucs2} { testutfnext -bytestring \xF0\x90\x80\x80 } 1 test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} { @@ -433,7 +433,7 @@ test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {test test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext -bytestring \x80\x80 } 1 -test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { +test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext ucs2} { testutfnext -bytestring \xF4\x8F\xBF\xBF } 1 test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { @@ -512,19 +512,19 @@ test utf-7.9.1 {Tcl_UtfPrev} testutfprev { test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 -test utf-7.10.0 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.10.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0 } 2 test utf-7.10.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF2\xA0 } 1 -test utf-7.10.1.0 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.10.1.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xA0\xA0 3 } 2 test utf-7.10.1.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF2\xA0\xA0\xA0 3 } 1 -test utf-7.10.2.0 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.10.2.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xF8\xA0 3 } 2 test utf-7.10.2.1 {Tcl_UtfPrev} {testutfprev fullutf} { @@ -569,19 +569,19 @@ test utf-7.14.1 {Tcl_UtfPrev} testutfprev { test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 -test utf-7.15.0 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.15.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xA0 } 3 test utf-7.15.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF2\xA0\xA0 } 1 -test utf-7.15.1.0 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.15.1.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xA0\xA0 4 } 3 test utf-7.15.1.1 {Tcl_UtfPrev} {testutfprev fullutf} { testutfprev A\xF2\xA0\xA0\xA0 4 } 1 -test utf-7.15.2.0 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.15.2.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xA0\xF8 4 } 3 test utf-7.15.2.1 {Tcl_UtfPrev} {testutfprev fullutf} { @@ -617,7 +617,7 @@ test utf-7.18.2 {Tcl_UtfPrev} testutfprev { test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 -test utf-7.20.0 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.20.0 {Tcl_UtfPrev} {testutfprev ucs2} { testutfprev A\xF2\xA0\xA0\xA0 } 4 test utf-7.20.1 {Tcl_UtfPrev} {testutfprev fullutf} { @@ -683,19 +683,19 @@ test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.39.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev ucs2} { testutfprev A\xF0\x90\x80\x80 } 4 test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { testutfprev A\xF0\x90\x80\x80 } 1 -test utf-7.40.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.40.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev ucs2} { testutfprev A\xF0\x90\x80\x80 4 } 3 test utf-7.40.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { testutfprev A\xF0\x90\x80\x80 4 } 1 -test utf-7.41.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.41.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev ucs2} { testutfprev A\xF0\x90\x80\x80 3 } 2 test utf-7.41.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { @@ -725,19 +725,19 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { testutfprev \xE8\xA0\x00 2 } 0 -test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { +test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs2} { testutfprev A\xF4\x8F\xBF\xBF } 4 test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { testutfprev A\xF4\x8F\xBF\xBF } 1 -test utf-7.48.1.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { +test utf-7.48.1.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs2} { testutfprev A\xF4\x8F\xBF\xBF 4 } 3 test utf-7.48.1.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { testutfprev A\xF4\x8F\xBF\xBF 4 } 1 -test utf-7.48.2.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { +test utf-7.48.2.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev ucs2} { testutfprev A\xF4\x8F\xBF\xBF 3 } 2 test utf-7.48.2.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { -- cgit v0.12 From 0e3e26bcbc60ce9f698551e85e799ed934ba8f21 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 22 Apr 2020 22:13:53 +0000 Subject: (cherry-pick): Update documentation of Tcl_UtfPrev/Tcl_UtfNext back to how it was. Will be updated later, when implementation is ready and agreed upon. --- doc/Utf.3 | 21 +++++------------ generic/tclUtf.c | 70 +++++++++++++++++++------------------------------------- 2 files changed, 30 insertions(+), 61 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index 55ef80d..e9bfaa7 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -141,7 +141,7 @@ source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first -byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and +byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0080 and 0x00FF and return 1. .PP \fBTcl_UniCharToUtfDString\fR converts the given Unicode string @@ -217,20 +217,11 @@ returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is considered part of the UTF-8 string. .PP -\fBTcl_UtfNext\fR is used to step forward through a UTF-8 string. -If the UTF-8 string is made up entirely of complete, well-formed, and -valid character byte sequences, and \fIsrc\fR points to the lead byte -of one of those sequences, then repeated calls of \fBTcl_UtfNext\fR will -return pointers to the lead bytes of each character in the string, one -character at a time. In any other circumstance, \fBTcl_UtfNext\fR -returns \fIsrc\fR+1. \fBTcl_UtfNext\fR will always read \fIsrc[0]\fR -and may read as many following bytes (up to a total of \fBTCL_UTF_MAX\fR) -as needed to find the end of the byte sequence. If the string is -\fBNUL\fR-terminated, \fBTcl_UtfNext\fR will not read beyond the terminating -\fBNUL\fR byte. If not, the caller must use the companion routine -\fBTcl_UtfCharComplete\fR to determine whether there is any risk -\fBTcl_UtfNext\fR might read beyond the readable memory occupied -by the string. +Given \fIsrc\fR, a pointer to some location in a UTF-8 string, +\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the +string. The caller must not ask for the next character after the last +character in the string if the string is not terminated by a null +character. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made diff --git a/generic/tclUtf.c b/generic/tclUtf.c index a5e4fd4..e7048ee 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -717,35 +717,13 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * The aim of this routine is to provide a way to iterate forward - * through a UTF-8 string. The caller is expected to pass a non-NULL - * pointer argument /src/ which points to a location within a string. - * (*src) will be read, so /src/ must not point to an unreadable - * location past the end of the string. If /src/ points to the - * beginning of a complete, well-formed and valid UTF_8 byte sequence - * of no more than TCL_UTF_MAX bytes, Tcl_UtfNext returns the pointer - * just past the end of that sequence. In any other circumstance, - * Tcl_UtfNext returns /src/+1. - * - * Because this routine always returns a value > /src/, it is useful - * as a forward iterator that will always make progress. If the string - * is NUL-terminated, Tcl_UtfNext will not read beyond the terminating - * NUL character. If it is not NUL-terminated, the caller must make - * use of the companion routine Tcl_UtfCharComplete to test whether - * there is risk that Tcl_UtfNext will read beyond the end of the string. - * Tcl_UtfNext will never read more than TCL_UTF_MAX bytes. - * - * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfNext calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * Given a pointer to some current location in a UTF-8 string, move + * forward one character. The caller must ensure that they are not asking + * for the next character after the last character in the string. * * Results: - * A pointer to the start of the next character in the string (or to - * the end of the string) as described above. + * The return value is the pointer to the next character in the UTF-8 + * string. * * Side effects: * None. @@ -786,37 +764,37 @@ Tcl_UtfNext( * * The aim of this routine is to provide a way to move backward * through a UTF-8 string. The caller is expected to pass non-NULL - * pointer arguments /start/ and /src/. /start/ points to the beginning - * of a string, and /src/ (>= /start/) points to a location within (or - * just past the end) of the string. This routine always returns a - * pointer within the string (>= /start/). When (/src/ == /start/), - * it returns /start/. When (/src/ > /start/), it returns a pointer - * (< /src/) and (>= /src/ - TCL_UTF_MAX). Subject to these constraints, - * the routine returns a pointer to the earliest byte in the string that - * starts a character when characters are read starting at /start/ and + * pointer arguments start and src. start points to the beginning + * of a string, and src >= start points to a location within (or just + * past the end) of the string. This routine always returns a + * pointer within the string (>= start). When (src == start), it + * returns start. When (src > start), it returns a pointer (< src) + * and (>= src - TCL_UTF_MAX). Subject to these constraints, the + * routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at start and * that character might include the byte src[-1]. The routine will * examine only those bytes in the range that might be returned. - * It will not examine the byte (*src), and because of that cannot + * It will not examine the byte *src, and because of that cannot * determine for certain in all circumstances whether the character * that begins with the returned pointer will or will not include - * the byte src[-1]. In the scenario where /src/ points to the end of - * a buffer being filled, the returned pointer points to either the + * the byte src[-1]. In the scenario, where src points to the end of + * a buffer being filled, the returned pointer point to either the * final complete character in the string or to the earliest byte * that might start an incomplete character waiting for more bytes to * complete. * - * Because this routine always returns a value < /src/ until the point - * it is forced to return /start/, it is useful as a backward iterator + * Because this routine always returns a value < src until the point + * it is forced to return start, it is useful as a backward iterator * through a string that will always make progress and always be * prevented from running past the beginning of the string. * * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfPrev calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * and the value of src points to the first byte of a character, + * repeated Tcl_UtfPrev calls will step to the starting bytes of + * characters, one character at a time. Within those limitations, + * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot + * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and + * the caller will have to take greater care. * * Results: * A pointer to the start of a character in the string as described -- cgit v0.12 From 1e2b38eb6c196d06eea10c27806f201c03c44020 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 23 Apr 2020 12:19:42 +0000 Subject: Testcase cleanup --- tests/utf.test | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 3af70c4..cb650f4 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -16,9 +16,9 @@ if {[lsearch [namespace children] ::tcltest] == -1} { ::tcltest::loadTestedCommands catch [list package require -exact Tcltest [info patchlevel]] -testConstraint ucs2 [expr {[format %c 0x010000] == "\uFFFD"}] -testConstraint fullutf [expr {[format %c 0x010000] != "\uFFFD"}] -testConstraint tip389 [expr {[string length \U010000] == 2}] +testConstraint ucs2 [expr {[format %c 0x010000] eq "\uFFFD"}] +testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] +testConstraint tip389 [expr {[string length \U010000] eq 2}] testConstraint testbytestring [llength [info commands testbytestring]] testConstraint testfindfirst [llength [info commands testfindfirst]] @@ -377,7 +377,7 @@ test utf-6.70 {Tcl_UtfNext} testutfnext { test utf-6.71 {Tcl_UtfNext} testutfnext { testutfnext -bytestring \xF2\xA0\xA0\xE8 } 1 -test utf-6.71 {Tcl_UtfNext} testutfnext { +test utf-6.72 {Tcl_UtfNext} testutfnext { testutfnext -bytestring \xF2\xA0\xA0\xF2 } 1 test utf-6.73 {Tcl_UtfNext} testutfnext { @@ -1064,7 +1064,7 @@ test utf-20.1 {TclUniCharNcmp} { test utf-21.1 {TclUniCharIsAlnum} { # this returns 1 with Unicode 7 compliance string is alnum \u1040\u021F\u0220 -} {1} +} 1 test utf-21.2 {unicode alnum char in regc_locale.c} { # this returns 1 with Unicode 7 compliance list [regexp {^[[:alnum:]]+$} \u1040\u021F\u0220] [regexp {^\w+$} \u1040\u021F\u0220_\u203F\u2040\u2054\uFE33\uFE34\uFE4D\uFE4E\uFE4F\uFF3F] -- cgit v0.12 From b26879ed7753285181237335897881134b86de1b Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 23 Apr 2020 13:53:54 +0000 Subject: Fix TclUtfNext() macro. Use it in tclTest.c, so such a mistake can be detected next time. --- generic/tclInt.h | 2 +- generic/tclTest.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/generic/tclInt.h b/generic/tclInt.h index 11c9ec8..780ea30 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -4472,7 +4472,7 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file, Tcl_UtfPrev(src, start)) #define TclUtfNext(src) \ - ((((unsigned char) *(src)) < 0xC0) ? src + 1 : Tcl_UtfNext(src)) + ((((unsigned char) *(src)) < 0x80) ? src + 1 : Tcl_UtfNext(src)) /* *---------------------------------------------------------------- diff --git a/generic/tclTest.c b/generic/tclTest.c index 539d188..856e9ea 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -6739,10 +6739,10 @@ TestUtfNextCmd( memcpy(buffer + 1, bytes, numBytes); buffer[0] = buffer[numBytes + 1] = '\x00'; - first = Tcl_UtfNext(buffer + 1); + first = TclUtfNext(buffer + 1); while ((buffer[0] = *p++) != '\0') { /* Run Tcl_UtfNext with many more possible bytes at src[-1], all should give the same result */ - result = Tcl_UtfNext(buffer + 1); + result = TclUtfNext(buffer + 1); if (first != result) { Tcl_AppendResult(interp, "Tcl_UtfNext is not supposed to read src[-1]", NULL); return TCL_ERROR; @@ -6795,7 +6795,7 @@ TestUtfPrevCmd( bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); bytes[numBytes] = '\0'; - result = Tcl_UtfPrev(bytes + offset, bytes); + result = TclUtfPrev(bytes + offset, bytes); Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); Tcl_DecrRefCount(copy); -- cgit v0.12