diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-04-14 10:17:31 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-04-14 10:17:31 (GMT) |
commit | e59db7e00e94f016d7c222aea7603dbbc8eecb4e (patch) | |
tree | 49eea3f1d82a1ac023889575a2e07d7643ad4b41 | |
parent | 2f98c2ea4d9b29dc3a797522a457585ac5865388 (diff) | |
parent | 920063dce71227734c3cd38eea46fd644ec37ded (diff) | |
download | tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.zip tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.gz tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.bz2 |
Merge 8.6
-rw-r--r-- | doc/Utf.3 | 28 | ||||
-rw-r--r-- | generic/tcl.h | 2 | ||||
-rw-r--r-- | generic/tclCmdAH.c | 2 | ||||
-rw-r--r-- | generic/tclCmdMZ.c | 16 | ||||
-rw-r--r-- | generic/tclDate.c | 2 | ||||
-rw-r--r-- | generic/tclInt.h | 11 | ||||
-rw-r--r-- | generic/tclParse.c | 2 | ||||
-rw-r--r-- | generic/tclResult.c | 11 | ||||
-rw-r--r-- | generic/tclStrToD.c | 6 | ||||
-rw-r--r-- | generic/tclStringObj.c | 24 | ||||
-rw-r--r-- | generic/tclStringTrim.h | 2 | ||||
-rw-r--r-- | generic/tclTest.c | 52 | ||||
-rw-r--r-- | generic/tclUtf.c | 65 | ||||
-rw-r--r-- | generic/tclUtil.c | 404 | ||||
-rw-r--r-- | tests/dstring.test | 64 | ||||
-rw-r--r-- | tests/string.test | 28 | ||||
-rw-r--r-- | tests/utf.test | 163 | ||||
-rw-r--r-- | tests/util.test | 59 | ||||
-rw-r--r-- | unix/tclUnixFile.c | 2 |
19 files changed, 657 insertions, 286 deletions
@@ -259,13 +259,27 @@ string. The caller must not ask for the next character after the last character in the string if the string is not terminated by a null character. .PP -Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a -null byte immediately following such a string), \fBTcl_UtfPrev\fR -returns a pointer to the closest preceding byte that starts a UTF-8 -character. -This function will not back up to a position before \fIstart\fR, -the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the -return value will be \fIstart\fR. +\fBTcl_UtfPrev\fR is used to step backward through but not beyond the +UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made +up entirely of complete and well-formed characters, and \fIsrc\fR points +to the lead byte of one of those characters (or to the location one byte +past the end of the string), then repeated calls of \fBTcl_UtfPrev\fR will +return pointers to the lead bytes of each character in the string, one +character at a time, terminating when it returns \fIstart\fR. +.PP +When the conditions of completeness and well-formedness may not be satisfied, +a more precise description of the function of \fBTcl_UtfPrev\fR is necessary. +It always returns a pointer greater than or equal to \fIstart\fR; that is, +always a pointer to a location in the string. It always returns a pointer to +a byte that begins a character when scanning for characters beginning +from \fIstart\fR. When \fIsrc\fR is greater than \fIstart\fR, it +always returns a pointer less than \fIsrc\fR and greater than or +equal to (\fIsrc\fR - \fBTCL_UTF_MAX\fR). The character that begins +at the returned pointer is the first one that either includes the +byte \fIsrc[-1]\fR, or might include it if the right trail bytes are +present at \fIsrc\fR and greater. \fBTcl_UtfPrev\fR never reads the +byte \fIsrc[0]\fR nor the byte \fIstart[-1]\fR nor the byte +\fIsrc[-\fBTCL_UTF_MAX\fI-1]\fR. .PP \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the Pascal Ord() function. It returns the Tcl_UniChar represented at the diff --git a/generic/tcl.h b/generic/tcl.h index 369a894..51bd4aa 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2120,7 +2120,7 @@ typedef struct Tcl_EncodingType { */ #ifndef TCL_UTF_MAX -#define TCL_UTF_MAX 3 +#define TCL_UTF_MAX 4 #endif /* diff --git a/generic/tclCmdAH.c b/generic/tclCmdAH.c index ffe0088..056bd47 100644 --- a/generic/tclCmdAH.c +++ b/generic/tclCmdAH.c @@ -211,7 +211,7 @@ Tcl_CaseObjCmd( pat = TclGetString(caseObjv[i]); for (p = pat; *p != '\0'; p++) { - if (TclIsSpaceProc(*p) || (*p == '\\')) { + if (TclIsSpaceProcM(*p) || (*p == '\\')) { break; } } diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 1ddae0f..935d42f 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1845,7 +1845,7 @@ StringIsCmd( * if it is the first "element" that has the failure. */ - while (TclIsSpaceProc(*p)) { + while (TclIsSpaceProcM(*p)) { p++; } TclNewStringObj(tmpStr, string1, p-string1); @@ -2519,12 +2519,22 @@ StringStartCmd( cur = 0; if (index > 0) { p = Tcl_UtfAtIndex(string, index); + + TclUtfToUCS4(p, &ch); for (cur = index; cur >= 0; cur--) { - TclUtfToUCS4(p, &ch); + int delta = 0; + const char *next; + if (!Tcl_UniCharIsWordChar(ch)) { break; } - p = Tcl_UtfPrev(p, string); + + next = Tcl_UtfPrev(p, string); + do { + next += delta; + delta = TclUtfToUCS4(next, &ch); + } while (next + delta < p); + p = next; } if (cur != index) { cur += 1; diff --git a/generic/tclDate.c b/generic/tclDate.c index 341dd0f..294d4fe 100644 --- a/generic/tclDate.c +++ b/generic/tclDate.c @@ -2681,7 +2681,7 @@ TclDatelex( location->first_column = yyInput - info->dateStart; for ( ; ; ) { - while (TclIsSpaceProc(UCHAR(*yyInput))) { + while (TclIsSpaceProcM(*yyInput)) { yyInput++; } diff --git a/generic/tclInt.h b/generic/tclInt.h index 16e1aa8..8d2e68a 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3080,7 +3080,6 @@ MODULE_SCOPE void TclInitNamespaceSubsystem(void); MODULE_SCOPE void TclInitNotifier(void); MODULE_SCOPE void TclInitObjSubsystem(void); MODULE_SCOPE int TclInterpReady(Tcl_Interp *interp); -MODULE_SCOPE int TclIsSpaceProc(int byte); MODULE_SCOPE int TclIsDigitProc(int byte); MODULE_SCOPE int TclIsBareword(int byte); MODULE_SCOPE Tcl_Obj * TclJoinPath(int elements, Tcl_Obj * const objv[], @@ -3306,6 +3305,16 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); /* + * Many parsing tasks need a common definition of whitespace. + * Use this routine and macro to achieve that and place + * optimization (fragile on changes) in one place. + */ + +MODULE_SCOPE int TclIsSpaceProc(int byte); +# define TclIsSpaceProcM(byte) \ + (((byte) > 0x20) ? 0 : TclIsSpaceProc(byte)) + +/* *---------------------------------------------------------------- * Command procedures in the generic core: *---------------------------------------------------------------- diff --git a/generic/tclParse.c b/generic/tclParse.c index dbffed0..adc559d 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -1761,7 +1761,7 @@ Tcl_ParseBraces( openBrace = 0; break; case '#' : - if (openBrace && TclIsSpaceProc(src[-1])) { + if (openBrace && TclIsSpaceProcM(src[-1])) { Tcl_AppendToObj(Tcl_GetObjResult(parsePtr->interp), ": possible unbalanced brace in comment", -1); goto error; diff --git a/generic/tclResult.c b/generic/tclResult.c index 2336aad..baecf46 100644 --- a/generic/tclResult.c +++ b/generic/tclResult.c @@ -730,6 +730,7 @@ Tcl_AppendElement( char *dst; int size; int flags; + int quoteHash = 1; /* * If the string result is empty, move the object result to the string @@ -766,9 +767,17 @@ Tcl_AppendElement( * then this element will not lead a list, and need not have it's * leading '#' quoted. */ - + quoteHash = 0; + } else { + while ((--dst >= iPtr->appendResult) && TclIsSpaceProcM(*dst)) { + } + quoteHash = !TclNeedSpace(iPtr->appendResult, dst+1); + } + dst = iPtr->appendResult + iPtr->appendUsed; + if (!quoteHash) { flags |= TCL_DONT_QUOTE_HASH; } + iPtr->appendUsed += Tcl_ConvertElement(element, dst, flags); #endif /* !TCL_NO_DEPRECATED */ } diff --git a/generic/tclStrToD.c b/generic/tclStrToD.c index c4b9211..7ef2c60 100644 --- a/generic/tclStrToD.c +++ b/generic/tclStrToD.c @@ -576,7 +576,7 @@ TclParseNumber( * I, N, and whitespace. */ - if (TclIsSpaceProc(c)) { + if (TclIsSpaceProcM(c)) { if (flags & TCL_PARSE_NO_WHITESPACE) { goto endgame; } @@ -1134,7 +1134,7 @@ TclParseNumber( } /* FALLTHROUGH */ case sNANPAREN: - if (TclIsSpaceProc(c)) { + if (TclIsSpaceProcM(c)) { break; } if (numSigDigs < 13) { @@ -1188,7 +1188,7 @@ TclParseNumber( * Accept trailing whitespace. */ - while (len != 0 && TclIsSpaceProc(*p)) { + while (len != 0 && TclIsSpaceProcM(*p)) { p++; len--; } diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index c6d5323..2025674 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1151,10 +1151,7 @@ Tcl_AppendLimitedToObj( { String *stringPtr; int toCopy = 0; - - if (Tcl_IsShared(objPtr)) { - Tcl_Panic("%s called with shared object", "Tcl_AppendLimitedToObj"); - } + int eLen = 0; if (length < 0) { length = (bytes ? strlen(bytes) : 0); @@ -1162,6 +1159,9 @@ Tcl_AppendLimitedToObj( if (length == 0) { return; } + if (limit <= 0) { + return; + } if (length <= limit) { toCopy = length; @@ -1169,8 +1169,12 @@ Tcl_AppendLimitedToObj( if (ellipsis == NULL) { ellipsis = "..."; } - toCopy = (bytes == NULL) ? limit - : Tcl_UtfPrev(bytes+limit+1-strlen(ellipsis), bytes) - bytes; + eLen = strlen(ellipsis); + while (eLen > limit) { + eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis; + } + + toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes; } /* @@ -1179,6 +1183,10 @@ Tcl_AppendLimitedToObj( * objPtr's string rep. */ + if (Tcl_IsShared(objPtr)) { + Tcl_Panic("%s called with shared object", "Tcl_AppendLimitedToObj"); + } + SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); @@ -1194,9 +1202,9 @@ Tcl_AppendLimitedToObj( stringPtr = GET_STRING(objPtr); if (stringPtr->hasUnicode && stringPtr->numChars > 0) { - AppendUtfToUnicodeRep(objPtr, ellipsis, strlen(ellipsis)); + AppendUtfToUnicodeRep(objPtr, ellipsis, eLen); } else { - AppendUtfToUtfRep(objPtr, ellipsis, strlen(ellipsis)); + AppendUtfToUtfRep(objPtr, ellipsis, eLen); } } diff --git a/generic/tclStringTrim.h b/generic/tclStringTrim.h index 030e4ec..7067428 100644 --- a/generic/tclStringTrim.h +++ b/generic/tclStringTrim.h @@ -28,6 +28,8 @@ MODULE_SCOPE const char tclDefaultTrimSet[]; /* * The whitespace trimming set used when [concat]enating. This is a subset of * the above, and deliberately so. + * + * TODO: Find a reasonable way to guarantee in sync with TclIsSpaceProc() */ #define CONCAT_TRIM_SET " \f\v\r\t\n" diff --git a/generic/tclTest.c b/generic/tclTest.c index 3d300cd..157f0c1 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -316,6 +316,7 @@ static Tcl_FSListVolumesProc SimpleListVolumes; static Tcl_FSPathInFilesystemProc SimplePathInFilesystem; static Tcl_Obj * SimpleRedirect(Tcl_Obj *pathPtr); static Tcl_FSMatchInDirectoryProc SimpleMatchInDirectory; +static Tcl_ObjCmdProc TestUtfPrevCmd; static Tcl_ObjCmdProc TestNumUtfCharsCmd; static Tcl_ObjCmdProc TestFindFirstCmd; static Tcl_ObjCmdProc TestFindLastCmd; @@ -577,6 +578,8 @@ Tcltest_Init( NULL, NULL); Tcl_CreateObjCommand(interp, "testsetobjerrorcode", TestsetobjerrorcodeCmd, NULL, NULL); + Tcl_CreateObjCommand(interp, "testutfprev", + TestUtfPrevCmd, (ClientData) 0, NULL); Tcl_CreateObjCommand(interp, "testnumutfchars", TestNumUtfCharsCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testfindfirst", @@ -6808,6 +6811,55 @@ SimpleListVolumes(void) } /* + * Used to check operations of Tcl_UtfPrev. + * + * Usage: testutfprev $bytes $offset + */ + +static int +TestUtfPrevCmd( + TCL_UNUSED(void *), + Tcl_Interp *interp, + int objc, + Tcl_Obj *const objv[]) +{ + int numBytes, offset; + char *bytes; + const char *result; + Tcl_Obj *copy; + + if (objc < 2 || objc > 3) { + Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?"); + return TCL_ERROR; + } + + bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); + + if (objc == 3) { + if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { + return TCL_ERROR; + } + if (offset < 0) { + offset = 0; + } + if (offset > numBytes) { + offset = numBytes; + } + } else { + offset = numBytes; + } + copy = Tcl_DuplicateObj(objv[1]); + bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); + bytes[numBytes] = '\0'; + + result = Tcl_UtfPrev(bytes + offset, bytes); + + Tcl_DecrRefCount(copy); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); + return TCL_OK; +} + +/* * Used to check correct string-length determining in Tcl_NumUtfChars */ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 8d1371a..5908f36 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,6 +64,17 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 +}; + +static const unsigned char complete[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, @@ -697,7 +708,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= totalBytes[(unsigned char)*src]; + return length >= complete[(unsigned char)*src]; } /* @@ -875,15 +886,43 @@ Tcl_UtfNext( * * Tcl_UtfPrev -- * - * Given a pointer to some current location in a UTF-8 string, move - * backwards one character. This works correctly when the pointer is in - * the middle of a UTF-8 character. + * The aim of this routine is to provide a way to move backward + * through a UTF-8 string. The caller is expected to pass non-NULL + * pointer arguments start and src. start points to the beginning + * of a string, and src >= start points to a location within (or just + * past the end) of the string. This routine always returns a + * pointer within the string (>= start). When (src == start), it + * returns start. When (src > start), it returns a pointer (< src) + * and (>= src - TCL_UTF_MAX). Subject to these constraints, the + * routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at start and + * that character might include the byte src[-1]. The routine will + * examine only those bytes in the range that might be returned. + * It will not examine the byte *src, and because of that cannot + * determine for certain in all circumstances whether the character + * that begins with the returned pointer will or will not include + * the byte src[-1]. In the scenario, where src points to the end of + * a buffer being filled, the returned pointer point to either the + * final complete character in the string or to the earliest byte + * that might start an incomplete character waiting for more bytes to + * complete. + * + * Because this routine always returns a value < src until the point + * it is forced to return start, it is useful as a backward iterator + * through a string that will always make progress and always be + * prevented from running past the beginning of the string. + * + * In a string where all characters are complete and properly formed, + * and the value of src points to the first byte of a character, + * repeated Tcl_UtfPrev calls will step to the starting bytes of + * characters, one character at a time. Within those limitations, + * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot + * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and + * the caller will have to take greater care. * * Results: - * The return value is a pointer to the previous character in the UTF-8 - * string. If the current location was already at the beginning of the - * string, the return value will also be a pointer to the beginning of - * the string. + * A pointer to the start of a character in the string as described + * above. * * Side effects: * None. @@ -893,9 +932,8 @@ Tcl_UtfNext( const char * Tcl_UtfPrev( - const char *src, /* The current location in the string. */ - const char *start) /* Pointer to the beginning of the string, to - * avoid going backwards too far. */ + const char *src, /* A location in a UTF-8 string. */ + const char *start) /* Pointer to the beginning of the string */ { const char *look; int i, byte; @@ -913,6 +951,9 @@ Tcl_UtfPrev( break; } if (byte >= 0xC0) { + if (totalBytes[byte] <= i) { + break; + } return look; } look--; @@ -1975,7 +2016,7 @@ Tcl_UniCharIsSpace( */ if (ch < 0x80) { - return TclIsSpaceProc((char) ch); + return TclIsSpaceProcM((char) ch); } else if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 89b9f90..6dbccba 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -408,7 +408,7 @@ TclMaxListLength( * No list element before leading white space. */ - count += 1 - TclIsSpaceProc(*bytes); + count += 1 - TclIsSpaceProcM(*bytes); /* * Count white space runs as potential element separators. @@ -418,7 +418,7 @@ TclMaxListLength( if ((numBytes == -1) && (*bytes == '\0')) { break; } - if (TclIsSpaceProc(*bytes)) { + if (TclIsSpaceProcM(*bytes)) { /* * Space run started; bump count. */ @@ -427,7 +427,7 @@ TclMaxListLength( do { bytes++; numBytes -= (numBytes != -1); - } while (numBytes && TclIsSpaceProc(*bytes)); + } while (numBytes && TclIsSpaceProcM(*bytes)); if ((numBytes == 0) || ((numBytes == -1) && (*bytes == '\0'))) { break; } @@ -444,7 +444,7 @@ TclMaxListLength( * No list element following trailing white space. */ - count -= TclIsSpaceProc(bytes[-1]); + count -= TclIsSpaceProcM(bytes[-1]); done: if (endPtr) { @@ -593,7 +593,7 @@ FindElement( */ limit = (string + stringLength); - while ((p < limit) && (TclIsSpaceProc(*p))) { + while ((p < limit) && (TclIsSpaceProcM(*p))) { p++; } if (p == limit) { /* no element found */ @@ -638,7 +638,7 @@ FindElement( } else if (openBraces == 1) { size = (p - elemStart); p++; - if ((p >= limit) || TclIsSpaceProc(*p)) { + if ((p >= limit) || TclIsSpaceProcM(*p)) { goto done; } @@ -648,7 +648,7 @@ FindElement( if (interp != NULL) { p2 = p; - while ((p2 < limit) && (!TclIsSpaceProc(*p2)) + while ((p2 < limit) && (!TclIsSpaceProcM(*p2)) && (p2 < p+20)) { p2++; } @@ -683,23 +683,6 @@ FindElement( break; /* - * Space: ignore if element is in braces or quotes; otherwise - * terminate element. - */ - - case ' ': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': - if ((openBraces == 0) && !inQuotes) { - size = (p - elemStart); - goto done; - } - break; - - /* * Double-quote: if element is in quotes then terminate it. */ @@ -707,7 +690,7 @@ FindElement( if (inQuotes) { size = (p - elemStart); p++; - if ((p >= limit) || TclIsSpaceProc(*p)) { + if ((p >= limit) || TclIsSpaceProcM(*p)) { goto done; } @@ -717,7 +700,7 @@ FindElement( if (interp != NULL) { p2 = p; - while ((p2 < limit) && (!TclIsSpaceProc(*p2)) + while ((p2 < limit) && (!TclIsSpaceProcM(*p2)) && (p2 < p+20)) { p2++; } @@ -730,6 +713,20 @@ FindElement( return TCL_ERROR; } break; + + default: + if (TclIsSpaceProcM(*p)) { + /* + * Space: ignore if element is in braces or quotes; + * otherwise terminate element. + */ + if ((openBraces == 0) && !inQuotes) { + size = (p - elemStart); + goto done; + } + } + break; + } p++; } @@ -760,7 +757,7 @@ FindElement( } done: - while ((p < limit) && (TclIsSpaceProc(*p))) { + while ((p < limit) && (TclIsSpaceProcM(*p))) { p++; } *elementPtr = elemStart; @@ -1116,12 +1113,6 @@ TclScanElement( case '[': /* TYPE_SUBS */ case '$': /* TYPE_SUBS */ case ';': /* TYPE_COMMAND_END */ - case ' ': /* TYPE_SPACE */ - case '\f': /* TYPE_SPACE */ - case '\n': /* TYPE_COMMAND_END */ - case '\r': /* TYPE_SPACE */ - case '\t': /* TYPE_SPACE */ - case '\v': /* TYPE_SPACE */ forbidNone = 1; extra++; /* Escape sequences all one byte longer. */ #if COMPAT @@ -1166,6 +1157,15 @@ TclScanElement( } /* TODO: Panic on improper encoding? */ break; + default: + if (TclIsSpaceProcM(*p)) { + forbidNone = 1; + extra++; /* Escape sequences all one byte longer. */ +#if COMPAT + preferBrace = 1; +#endif + } + break; } } length -= (length > 0); @@ -1666,42 +1666,6 @@ Tcl_Backslash( /* *---------------------------------------------------------------------- * - * UtfWellFormedEnd -- - * Checks the end of utf string is malformed, if yes - wraps bytes - * to the given buffer (as well-formed NTS string). The buffer - * argument should be initialized by the caller and ready to use. - * - * Results: - * The bytes with well-formed end of the string. - * - * Side effects: - * Buffer (DString) may be allocated, so must be released. - * - *---------------------------------------------------------------------- - */ - -static inline const char* -UtfWellFormedEnd( - Tcl_DString *buffer, /* Buffer used to hold well-formed string. */ - const char *bytes, /* Pointer to the beginning of the string. */ - int length) /* Length of the string. */ -{ - const char *l = bytes + length; - const char *p = Tcl_UtfPrev(l, bytes); - - if (Tcl_UtfCharComplete(p, l - p)) { - return bytes; - } - /* - * Malformed utf-8 end, be sure we've NTS to safe compare of end-character, - * avoid segfault by access violation out of range. - */ - Tcl_DStringAppend(buffer, bytes, length); - return Tcl_DStringValue(buffer); -} -/* - *---------------------------------------------------------------------- - * * TclTrimRight -- * Takes two counted strings in the Tcl encoding. Conceptually * finds the sub string (offset) to trim from the right side of the @@ -1716,15 +1680,23 @@ UtfWellFormedEnd( *---------------------------------------------------------------------- */ -static inline int -TrimRight( - const char *bytes, /* String to be trimmed... */ - int numBytes, /* ...and its length in bytes */ - const char *trim, /* String of trim characters... */ - int numTrim) /* ...and its length in bytes */ +int +TclTrimRight( + const char *bytes, /* String to be trimmed... */ + int numBytes, /* ...and its length in bytes */ + /* Calls to TclUtfToUniChar() in this routine + * rely on (bytes[numBytes] == '\0'). */ + const char *trim, /* String of trim characters... */ + int numTrim) /* ...and its length in bytes */ + /* Calls to TclUtfToUniChar() in this routine + * rely on (trim[numTrim] == '\0'). */ { - const char *p = bytes + numBytes; - int pInc; + const char *pp, *p = bytes + numBytes; + + /* Empty strings -> nothing to do */ + if ((numBytes == 0) || (numTrim == 0)) { + return 0; + } int ch1, ch2; /* @@ -1733,10 +1705,13 @@ TrimRight( do { const char *q = trim; - int bytesLeft = numTrim; + int pInc = 0, bytesLeft = numTrim; - p = Tcl_UtfPrev(p, bytes); - pInc = TclUtfToUCS4(p, &ch1); + pp = Tcl_UtfPrev(p, bytes); + do { + pp += pInc; + pInc = TclUtfToUCS4(pp, &ch1); + } while (pp + pInc < p); /* * Inner loop: scan trim string for match to current character. @@ -1758,44 +1733,13 @@ TrimRight( * No match; trim task done; *p is last non-trimmed char. */ - p += pInc; break; } + p = pp; } while (p > bytes); return numBytes - (p - bytes); } - -int -TclTrimRight( - const char *bytes, /* String to be trimmed... */ - int numBytes, /* ...and its length in bytes */ - const char *trim, /* String of trim characters... */ - int numTrim) /* ...and its length in bytes */ -{ - int res; - Tcl_DString bytesBuf, trimBuf; - - /* Empty strings -> nothing to do */ - if ((numBytes == 0) || (numTrim == 0)) { - return 0; - } - - Tcl_DStringInit(&bytesBuf); - Tcl_DStringInit(&trimBuf); - bytes = UtfWellFormedEnd(&bytesBuf, bytes, numBytes); - trim = UtfWellFormedEnd(&trimBuf, trim, numTrim); - - res = TrimRight(bytes, numBytes, trim, numTrim); - if (res > numBytes) { - res = numBytes; - } - - Tcl_DStringFree(&bytesBuf); - Tcl_DStringFree(&trimBuf); - - return res; -} /* *---------------------------------------------------------------------- @@ -1815,16 +1759,25 @@ TclTrimRight( *---------------------------------------------------------------------- */ -static inline int -TrimLeft( - const char *bytes, /* String to be trimmed... */ - int numBytes, /* ...and its length in bytes */ - const char *trim, /* String of trim characters... */ - int numTrim) /* ...and its length in bytes */ +int +TclTrimLeft( + const char *bytes, /* String to be trimmed... */ + int numBytes, /* ...and its length in bytes */ + /* Calls to TclUtfToUniChar() in this routine + * rely on (bytes[numBytes] == '\0'). */ + const char *trim, /* String of trim characters... */ + int numTrim) /* ...and its length in bytes */ + /* Calls to TclUtfToUniChar() in this routine + * rely on (trim[numTrim] == '\0'). */ { const char *p = bytes; int ch1, ch2; + /* Empty strings -> nothing to do */ + if ((numBytes == 0) || (numTrim == 0)) { + return 0; + } + /* * Outer loop: iterate over string to be trimmed. */ @@ -1863,37 +1816,6 @@ TrimLeft( return p - bytes; } - -int -TclTrimLeft( - const char *bytes, /* String to be trimmed... */ - int numBytes, /* ...and its length in bytes */ - const char *trim, /* String of trim characters... */ - int numTrim) /* ...and its length in bytes */ -{ - int res; - Tcl_DString bytesBuf, trimBuf; - - /* Empty strings -> nothing to do */ - if ((numBytes == 0) || (numTrim == 0)) { - return 0; - } - - Tcl_DStringInit(&bytesBuf); - Tcl_DStringInit(&trimBuf); - bytes = UtfWellFormedEnd(&bytesBuf, bytes, numBytes); - trim = UtfWellFormedEnd(&trimBuf, trim, numTrim); - - res = TrimLeft(bytes, numBytes, trim, numTrim); - if (res > numBytes) { - res = numBytes; - } - - Tcl_DStringFree(&bytesBuf); - Tcl_DStringFree(&trimBuf); - - return res; -} /* *---------------------------------------------------------------------- @@ -1915,41 +1837,38 @@ int TclTrim( const char *bytes, /* String to be trimmed... */ int numBytes, /* ...and its length in bytes */ + /* Calls in this routine + * rely on (bytes[numBytes] == '\0'). */ const char *trim, /* String of trim characters... */ int numTrim, /* ...and its length in bytes */ - int *trimRight) /* Offset from the end of the string. */ + /* Calls in this routine + * rely on (trim[numTrim] == '\0'). */ + int *trimRightPtr) /* Offset from the end of the string. */ { - int trimLeft; - Tcl_DString bytesBuf, trimBuf; + int trimLeft = 0, trimRight = 0; - *trimRight = 0; /* Empty strings -> nothing to do */ - if ((numBytes == 0) || (numTrim == 0)) { - return 0; - } - - Tcl_DStringInit(&bytesBuf); - Tcl_DStringInit(&trimBuf); - bytes = UtfWellFormedEnd(&bytesBuf, bytes, numBytes); - trim = UtfWellFormedEnd(&trimBuf, trim, numTrim); - - trimLeft = TrimLeft(bytes, numBytes, trim, numTrim); - if (trimLeft > numBytes) { - trimLeft = numBytes; - } - numBytes -= trimLeft; - /* have to trim yet (first char was already verified within TrimLeft) */ - if (numBytes > 1) { - bytes += trimLeft; - *trimRight = TrimRight(bytes, numBytes, trim, numTrim); - if (*trimRight > numBytes) { - *trimRight = numBytes; + if ((numBytes > 0) && (numTrim > 0)) { + + /* When bytes is NUL-terminated, returns 0 <= trimLeft <= numBytes */ + trimLeft = TclTrimLeft(bytes, numBytes, trim, numTrim); + numBytes -= trimLeft; + + /* If we did not trim the whole string, it starts with a character + * that we will not trim. Skip over it. */ + if (numBytes > 0) { + const char *first = bytes + trimLeft; + bytes = Tcl_UtfNext(first); + numBytes -= (bytes - first); + + if (numBytes > 0) { + /* When bytes is NUL-terminated, returns + * 0 <= trimRight <= numBytes */ + trimRight = TclTrimRight(bytes, numBytes, trim, numTrim); + } } } - - Tcl_DStringFree(&bytesBuf); - Tcl_DStringFree(&trimBuf); - + *trimRightPtr = trimRight; return trimLeft; } @@ -2242,7 +2161,6 @@ Tcl_StringCaseMatch( int nocase) /* 0 for case sensitive, 1 for insensitive */ { int p, charLen; - const char *pstart = pattern; Tcl_UniChar ch1 = 0, ch2 = 0; while (1) { @@ -2407,10 +2325,13 @@ Tcl_StringCaseMatch( break; } } + /* If we reach here, we matched. Need to move past closing ] */ while (*pattern != ']') { if (*pattern == '\0') { - pattern = Tcl_UtfPrev(pattern, pstart); - break; + /* We ran out of pattern after matching something in + * (unclosed!) brackets. So long as we ran out of string + * at the same time, we have a match. Otherwise, not. */ + return (*str == '\0'); } pattern++; } @@ -2839,9 +2760,37 @@ Tcl_DStringAppendElement( { char *dst = dsPtr->string + dsPtr->length; int needSpace = TclNeedSpace(dsPtr->string, dst); - char flags = needSpace ? TCL_DONT_QUOTE_HASH : 0; - int newSize = dsPtr->length + needSpace - + TclScanElement(element, -1, &flags); + char flags = 0; + int quoteHash = 1, newSize; + + if (needSpace) { + /* + * If we need a space to separate the new element from something + * already ending the string, we're not appending the first element + * of any list, so we need not quote any leading hash character. + */ + quoteHash = 0; + } else { + /* + * We don't need a space, maybe because there's some already there. + * Checking whether we might be appending a first element is a bit + * more involved. + * + * Backtrack over all whitespace. + */ + while ((--dst >= dsPtr->string) && TclIsSpaceProcM(*dst)) { + } + + /* Call again without whitespace to confound things. */ + quoteHash = !TclNeedSpace(dsPtr->string, dst+1); + } + if (!quoteHash) { + flags |= TCL_DONT_QUOTE_HASH; + } + newSize = dsPtr->length + needSpace + TclScanElement(element, -1, &flags); + if (!quoteHash) { + flags |= TCL_DONT_QUOTE_HASH; + } /* * Allocate a larger buffer for the string if the current one isn't large @@ -2873,8 +2822,8 @@ Tcl_DStringAppendElement( element = dsPtr->string + offset; } } - dst = dsPtr->string + dsPtr->length; } + dst = dsPtr->string + dsPtr->length; /* * Convert the new string to a list element and copy it into the buffer at @@ -2885,15 +2834,8 @@ Tcl_DStringAppendElement( *dst = ' '; dst++; dsPtr->length++; - - /* - * If we need a space to separate this element from preceding stuff, - * then this element will not lead a list, and need not have it's - * leading '#' quoted. - */ - - flags |= TCL_DONT_QUOTE_HASH; } + dsPtr->length += TclConvertElement(element, -1, dst, flags); dsPtr->string[dsPtr->length] = '\0'; return dsPtr->string; @@ -3520,63 +3462,69 @@ TclNeedSpace( /* * A space is needed unless either: * (a) we're at the start of the string, or - */ + * + * (NOTE: This check is now absorbed into the loop below.) + * if (end == start) { return 0; } + * + */ + /* * (b) we're at the start of a nested list-element, quoted with an open * curly brace; we can be nested arbitrarily deep, so long as the * first curly brace starts an element, so backtrack over open curly * braces that are trailing characters of the string; and - */ + * + * (NOTE: Every character our parser is looking for is a proper + * single-byte encoding of an ASCII value. It does not accept + * overlong encodings. Given that, there's no benefit using + * Tcl_UtfPrev. If it would find what we seek, so would byte-by-byte + * backward scan. Save routine call overhead and risk of wrong + * results should the behavior of Tcl_UtfPrev change in unexpected ways. + * Reconsider this if we ever start treating non-ASCII Unicode + * characters as meaningful list syntax, expanded Unicode spaces as + * element separators, for example.) + * end = Tcl_UtfPrev(end, start); while (*end == '{') { - if (end == start) { - return 0; - } - end = Tcl_UtfPrev(end, start); + if (end == start) { + return 0; + } + end = Tcl_UtfPrev(end, start); + } + + * + */ + + while ((--end >= start) && (*end == '{')) { + } + if (end < start) { + return 0; } /* * (c) the trailing character of the string is already a list-element - * separator (according to TclFindElement); that is, one of these - * characters: - * \u0009 \t TAB - * \u000A \n NEWLINE - * \u000B \v VERTICAL TAB - * \u000C \f FORM FEED - * \u000D \r CARRIAGE RETURN - * \u0020 SPACE - * with the condition that the penultimate character is not a - * backslash. + * separator, Use the same testing routine as TclFindElement to + * enforce consistency. */ - if (*end > 0x20) { + if (TclIsSpaceProcM(*end)) { + int result = 0; + /* - * Performance tweak. All ASCII spaces are <= 0x20. So get a quick - * answer for most characters before comparing against all spaces in - * the switch below. - * - * NOTE: Remove this if other Unicode spaces ever get accepted as - * list-element separators. + * Trailing whitespace might be part of a backslash escape + * sequence. Handle that possibility. */ - return 1; - } - switch (*end) { - case ' ': - case '\t': - case '\n': - case '\r': - case '\v': - case '\f': - if ((end == start) || (end[-1] != '\\')) { - return 0; + while ((--end >= start) && (*end == '\\')) { + result = !result; } + return result; } return 1; } @@ -4172,7 +4120,7 @@ TclCheckBadOctal( * zero. Try to generate a meaningful error message. */ - while (TclIsSpaceProc(*p)) { + while (TclIsSpaceProcM(*p)) { p++; } if (*p == '+' || *p == '-') { @@ -4185,7 +4133,7 @@ TclCheckBadOctal( while (isdigit(UCHAR(*p))) { /* INTL: digit. */ p++; } - while (TclIsSpaceProc(*p)) { + while (TclIsSpaceProcM(*p)) { p++; } if (*p == '\0') { diff --git a/tests/dstring.test b/tests/dstring.test index 06121a3..5feb355 100644 --- a/tests/dstring.test +++ b/tests/dstring.test @@ -180,16 +180,37 @@ test dstring-2.12 {appending list elements} -constraints testdstring -setup { } -cleanup { testdstring free } -result {x #} -test dstring-2.13 {appending list elements} -constraints testdstring -body { - # This test shows lack of sophistication in Tcl_DStringAppendElement's - # decision about whether #-quoting can be disabled. +test dstring-2.13 {appending list elements} -constraints testdstring -setup { testdstring free +} -body { + # This test checks the sophistication in Tcl_DStringAppendElement's + # decision about whether #-quoting can be disabled. testdstring append "x " -1 testdstring element # testdstring get } -cleanup { testdstring free -} -result {x {#}} +} -result {x #} +test dstring-2.14 {appending list elements} -constraints testdstring -setup { + testdstring free +} -body { + testdstring append " " -1 + testdstring element # + testdstring get +} -cleanup { + testdstring free +} -result { {#}} +test dstring-2.15 {appending list elements} -constraints testdstring -setup { + testdstring free +} -body { + # This test checks the sophistication in Tcl_DStringAppendElement's + # decision about whether #-quoting can be disabled. + testdstring append "x " -1 + testdstring element # + testdstring get +} -cleanup { + testdstring free +} -result {x #} test dstring-3.1 {nested sublists} -constraints testdstring -setup { testdstring free @@ -306,10 +327,11 @@ test dstring-3.9 {appending list elements} -constraints testdstring -setup { } -cleanup { testdstring free } -result {x {x #}} -test dstring-3.10 {appending list elements} -constraints testdstring -body { - # This test shows lack of sophistication in Tcl_DStringAppendElement's - # decision about whether #-quoting can be disabled. +test dstring-3.10 {appending list elements} -constraints testdstring -setup { testdstring free +} -body { + # This test checks the sophistication in Tcl_DStringAppendElement's + # decision about whether #-quoting can be disabled. testdstring append x -1 testdstring start testdstring append "x " -1 @@ -318,7 +340,33 @@ test dstring-3.10 {appending list elements} -constraints testdstring -body { testdstring get } -cleanup { testdstring free -} -result {x {x {#}}} +} -result {x {x #}} +test dstring-3.11 {appending list elements} -constraints testdstring -setup { + testdstring free +} -body { + testdstring append x -1 + testdstring start + testdstring append " " -1 + testdstring element # + testdstring end + testdstring get +} -cleanup { + testdstring free +} -result {x { {#}}} +test dstring-3.12 {appending list elements} -constraints testdstring -setup { + testdstring free +} -body { + # This test checks the sophistication in Tcl_DStringAppendElement's + # decision about whether #-quoting can be disabled. + testdstring append x -1 + testdstring start + testdstring append "x " -1 + testdstring element # + testdstring end + testdstring get +} -cleanup { + testdstring free +} -result {x {x #}} test dstring-4.1 {truncation} -constraints testdstring -setup { testdstring free diff --git a/tests/string.test b/tests/string.test index aca3570..420b996 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1813,6 +1813,34 @@ test string-20.5.$noComp {string trimright} { test string-20.6.$noComp {string trimright, unicode default} { run {string trimright ABC\u1361\x85\x00\xA0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u2028\u2029\u202F\u205F\u3000} } ABC\u1361 +test string-20.7 {string trim on not valid utf-8 sequence (consider NTS as continuation char), bug [c61818e4c9]} knownBug { + set result {} + set a [testbytestring \xc0\x80\x88] + set b foo$a + set m [list \u0000 U \x88 V [testbytestring \x88] W] + lappend result [string map $m $b] + lappend result [string map $m [string trimright $b x]] + lappend result [string map $m [string trimright $b \u0000]] + lappend result [string map $m [string trimleft $b fox]] + lappend result [string map $m [string trimleft $b fo\u0000]] + lappend result [string map $m [string trim $b fox]] + lappend result [string map $m [string trim $b fo\u0000]] +} [list {*}[lrepeat 3 fooUV] {*}[lrepeat 2 UV V]] +test string-20.8 {[c61818e4c9] [string trimright] fails when UtfPrev is ok} knownBug { + set result {} + set a [testbytestring \xE8\x80] + set b foo$a + set m [list \xE8 U \x80 V [testbytestring \xE8] W [testbytestring \x80] X]] + lappend result [string map $m $b] + lappend result [string map $m [string trimright $b x]] + lappend result [string map $m [string trimright $b \xE8]] + lappend result [string map $m [string trimright $b [testbytestring \xE8]]] + lappend result [string map $m [string trimright $b \x80]] + lappend result [string map $m [string trimright $b [testbytestring \x80]]] + lappend result [string map $m [string trimright $b \xE8\x80]] + lappend result [string map $m [string trimright $b [testbytestring \xE8\x80]]] + lappend result [string map $m [string trimright $b \u0000]] +} [list {*}[lrepeat 4 fooUV] {*}[lrepeat 2 fooU] {*}[lrepeat 2 foo] fooUV] test string-21.1.$noComp {string wordend} -body { list [catch {run {string wordend a}} msg] $msg diff --git a/tests/utf.test b/tests/utf.test index 507c6f9..9744703 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -155,8 +155,167 @@ test utf-5.2 {Tcl_UtfFindLast} {testfindlast testbytestring} { test utf-6.1 {Tcl_UtfNext} { } {} -test utf-7.1 {Tcl_UtfPrev} { -} {} +testConstraint testutfprev [llength [info commands testutfprev]] + +test utf-7.1 {Tcl_UtfPrev} testutfprev { + testutfprev {} +} 0 +test utf-7.2 {Tcl_UtfPrev} testutfprev { + testutfprev A +} 0 +test utf-7.3 {Tcl_UtfPrev} testutfprev { + testutfprev AA +} 1 +test utf-7.4 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8 +} 1 +test utf-7.4.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 2 +} 1 +test utf-7.4.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xF8\xA0\xA0 2 +} 1 +test utf-7.5 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4 +} 1 +test utf-7.5.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 2 +} 1 +test utf-7.5.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xF8\xA0\xA0 2 +} 1 +test utf-7.6 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8 +} 1 +test utf-7.6.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 2 +} 1 +test utf-7.6.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xF8\xA0\xA0 2 +} 1 +test utf-7.7 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0 +} 1 +test utf-7.7.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 2 +} 1 +test utf-7.7.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xF8\xA0\xA0 2 +} 1 +test utf-7.8 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0 +} 1 +test utf-7.8.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 2 +} 1 +test utf-7.8.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xF8\xA0\xA0 2 +} 1 +test utf-7.9 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0 +} 2 +test utf-7.9.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 3 +} 2 +test utf-7.9.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xF8\xA0 3 +} 2 +test utf-7.10 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0 +} 1 +test utf-7.10.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 3 +} 1 +test utf-7.10.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xF8\xA0 3 +} 1 +test utf-7.11 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0 +} 1 +test utf-7.11.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 3 +} 1 +test utf-7.11.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xF8\xA0 3 +} 1 +test utf-7.12 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0 +} 1 +test utf-7.12.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 3 +} 1 +test utf-7.12.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xF8\xA0 3 +} 1 +test utf-7.13 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0 +} 2 +test utf-7.13.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 3 +} 2 +test utf-7.13.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xF8\xA0 3 +} 2 +test utf-7.14 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0 +} 3 +test utf-7.14.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 4 +} 3 +test utf-7.14.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xF8 4 +} 3 +test utf-7.15 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0 +} 1 +test utf-7.15.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 4 +} 1 +test utf-7.15.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xF8 4 +} 1 +test utf-7.16 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0 +} 1 +test utf-7.16.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 4 +} 1 +test utf-7.16.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xF8 4 +} 1 +test utf-7.17 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0 +} 3 +test utf-7.17.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 4 +} 3 +test utf-7.17.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xF8 4 +} 3 +test utf-7.18 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0 +} 3 +test utf-7.18.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 4 +} 3 +test utf-7.18.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xF8 4 +} 3 +test utf-7.19 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 +} 4 +test utf-7.20 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 +} 1 +test utf-7.21 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 +} 4 +test utf-7.22 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 +} 4 +test utf-7.23 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 +} 4 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 diff --git a/tests/util.test b/tests/util.test index 35bb12d..1d8162c 100644 --- a/tests/util.test +++ b/tests/util.test @@ -387,6 +387,10 @@ test util-5.50 {Tcl_StringMatch} { test util-5.51 {Tcl_StringMatch} { Wrapper_Tcl_StringMatch "" "" } 1 +test util-5.52 {Tcl_StringMatch} { + Wrapper_Tcl_StringMatch \[a\u0000 a\x80 +} 0 + test util-6.1 {Tcl_PrintDouble - using tcl_precision} -constraints precision -setup { set old_precision $::tcl_precision @@ -512,25 +516,64 @@ test util-8.4 {TclNeedSpace - correct UTF8 handling} testdstring { llength [testdstring get] } 2 test util-8.5 {TclNeedSpace - correct UTF8 handling} testdstring { - # Note that in this test TclNeedSpace actually gets it wrong, - # claiming we need a space when we really do not. Extra space - # between list elements is harmless though, and better to have - # extra space in really weird string reps of lists, than to - # invest the effort required to make TclNeedSpace foolproof. testdstring free testdstring append {\\ } -1 testdstring element foo list [llength [testdstring get]] [string length [testdstring get]] -} {2 7} +} {2 6} test util-8.6 {TclNeedSpace - correct UTF8 handling} testdstring { - # Another example of TclNeedSpace harmlessly getting it wrong. testdstring free testdstring append {\\ } -1 testdstring append \{ -1 testdstring element foo testdstring append \} -1 list [llength [testdstring get]] [string length [testdstring get]] -} {2 9} +} {2 8} +test util-8.7 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\ } -1 + testdstring start + testdstring end + + # Should make {\ {}} + list [llength [testdstring get]] [string index [testdstring get] 3] +} {2 \{} +test util-8.8 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\\ } -1 + testdstring start + testdstring end + + # Should make {\\ {}} + list [llength [testdstring get]] [string index [testdstring get] 3] +} {2 \{} +test util-8.9 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\\\ } -1 + testdstring start + testdstring end + + # Should make {\\\ {}} + list [llength [testdstring get]] [string index [testdstring get] 5] +} {2 \{} +test util-8.10 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\\\\\\\ } -1 + testdstring start + testdstring end + + # Should make {\\\\\\\ {}} + list [llength [testdstring get]] [string index [testdstring get] 9] +} {2 \{} +test util-8.11 {TclNeedSpace - watch out for escaped space} { + testdstring free + testdstring append {\\\\\\\\ } -1 + testdstring start + testdstring end + + # Should make {\\\\\\\\ {}} + list [llength [testdstring get]] [string index [testdstring get] 9] +} {2 \{} test util-9.0.0 {Tcl_GetIntForIndex} { string index abcd 0 diff --git a/unix/tclUnixFile.c b/unix/tclUnixFile.c index 602ca63..fdf7904 100644 --- a/unix/tclUnixFile.c +++ b/unix/tclUnixFile.c @@ -105,7 +105,7 @@ TclpFindExecutable( */ while (1) { - while (TclIsSpaceProc(*p)) { + while (TclIsSpaceProcM(*p)) { p++; } name = p; |