From 648162204d4c9bd80cde739b4cad361de6ebe6f1 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 12 May 2020 18:59:04 +0000 Subject: First, experimental implementation of TIP #575. Barely tested, will fail. WIP --- generic/tcl.decls | 17 ++++++++++++++--- generic/tclCmdMZ.c | 2 +- generic/tclCompExpr.c | 8 ++++---- generic/tclDecls.h | 51 +++++++++++++++++++++++++++++++++++--------------- generic/tclEncoding.c | 4 ++-- generic/tclInt.h | 14 -------------- generic/tclParse.c | 8 ++++---- generic/tclStringObj.c | 6 +++--- generic/tclStubInit.c | 34 ++++++++++++++++++++++++++++++--- generic/tclTest.c | 8 ++++---- generic/tclUtf.c | 14 ++++---------- generic/tclUtil.c | 4 ++-- 12 files changed, 105 insertions(+), 65 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index a550411..4ccedd1 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1163,7 +1163,7 @@ declare 325 { const char *Tcl_UtfAtIndex(const char *src, int index) } declare 326 { - int Tcl_UtfCharComplete(const char *src, int length) + int TclUtfCharComplete(const char *src, int length) } declare 327 { int Tcl_UtfBackslash(const char *src, int *readPtr, char *dst) @@ -1175,10 +1175,10 @@ declare 329 { const char *Tcl_UtfFindLast(const char *src, int ch) } declare 330 { - const char *Tcl_UtfNext(const char *src) + const char *TclUtfNext(const char *src) } declare 331 { - const char *Tcl_UtfPrev(const char *src, const char *start) + const char *TclUtfPrev(const char *src, const char *start) } declare 332 { int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, @@ -2402,6 +2402,17 @@ declare 648 { int length, Tcl_DString *dsPtr) } +# TIP #575 +declare 649 { + int Tcl_UtfCharComplete(const char *src, int length) +} +declare 650 { + const char *Tcl_UtfNext(const char *src) +} +declare 651 { + const char *Tcl_UtfPrev(const char *src, const char *start) +} + # ----- BASELINE -- FOR -- 8.7.0 ----- # ############################################################################## diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 56df0dd..8f0465d 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2529,7 +2529,7 @@ StringStartCmd( break; } - next = TclUtfPrev(p, string); + next = Tcl_UtfPrev(p, string); do { next += delta; delta = TclUtfToUCS4(next, &ch); diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 4fb41fc..4d448b9 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1893,7 +1893,7 @@ ParseLexeme( { const char *end; int scanned; - Tcl_UniChar ch = 0; + int ch; Tcl_Obj *literal = NULL; unsigned char byte; @@ -2101,13 +2101,13 @@ ParseLexeme( if (!TclIsBareword(*start) || *start == '_') { if (Tcl_UtfCharComplete(start, numBytes)) { - scanned = TclUtfToUniChar(start, &ch); + scanned = TclUtfToUCS4(start, &ch); } else { - char utfBytes[4]; + char utfBytes[8]; memcpy(utfBytes, start, numBytes); utfBytes[numBytes] = '\0'; - scanned = TclUtfToUniChar(utfBytes, &ch); + scanned = TclUtfToUCS4(utfBytes, &ch); } *lexemePtr = INVALID; Tcl_DecrRefCount(literal); diff --git a/generic/tclDecls.h b/generic/tclDecls.h index c713469..7c1b22b 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1000,7 +1000,7 @@ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ EXTERN const char * Tcl_UtfAtIndex(const char *src, int index); /* 326 */ -EXTERN int Tcl_UtfCharComplete(const char *src, int length); +EXTERN int TclUtfCharComplete(const char *src, int length); /* 327 */ EXTERN int Tcl_UtfBackslash(const char *src, int *readPtr, char *dst); @@ -1009,9 +1009,9 @@ EXTERN const char * Tcl_UtfFindFirst(const char *src, int ch); /* 329 */ EXTERN const char * Tcl_UtfFindLast(const char *src, int ch); /* 330 */ -EXTERN const char * Tcl_UtfNext(const char *src); +EXTERN const char * TclUtfNext(const char *src); /* 331 */ -EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); +EXTERN const char * TclUtfPrev(const char *src, const char *start); /* 332 */ EXTERN int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, @@ -1921,6 +1921,12 @@ EXTERN char * Tcl_UniCharToUtfDString(const int *uniStr, /* 648 */ EXTERN int * Tcl_UtfToUniCharDString(const char *src, int length, Tcl_DString *dsPtr); +/* 649 */ +EXTERN int Tcl_UtfCharComplete(const char *src, int length); +/* 650 */ +EXTERN const char * Tcl_UtfNext(const char *src); +/* 651 */ +EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2282,12 +2288,12 @@ typedef struct TclStubs { int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ const char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */ - int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */ + int (*tclUtfCharComplete) (const char *src, int length); /* 326 */ int (*tcl_UtfBackslash) (const char *src, int *readPtr, char *dst); /* 327 */ const char * (*tcl_UtfFindFirst) (const char *src, int ch); /* 328 */ const char * (*tcl_UtfFindLast) (const char *src, int ch); /* 329 */ - const char * (*tcl_UtfNext) (const char *src); /* 330 */ - const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 331 */ + const char * (*tclUtfNext) (const char *src); /* 330 */ + const char * (*tclUtfPrev) (const char *src, const char *start); /* 331 */ int (*tcl_UtfToExternal) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); /* 332 */ char * (*tcl_UtfToExternalDString) (Tcl_Encoding encoding, const char *src, int srcLen, Tcl_DString *dsPtr); /* 333 */ int (*tcl_UtfToLower) (char *src); /* 334 */ @@ -2605,6 +2611,9 @@ typedef struct TclStubs { int (*tcl_UtfToUniChar) (const char *src, int *chPtr); /* 646 */ char * (*tcl_UniCharToUtfDString) (const int *uniStr, int uniLength, Tcl_DString *dsPtr); /* 647 */ int * (*tcl_UtfToUniCharDString) (const char *src, int length, Tcl_DString *dsPtr); /* 648 */ + int (*tcl_UtfCharComplete) (const char *src, int length); /* 649 */ + const char * (*tcl_UtfNext) (const char *src); /* 650 */ + const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 651 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -3287,18 +3296,18 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UniCharToUtf) /* 324 */ #define Tcl_UtfAtIndex \ (tclStubsPtr->tcl_UtfAtIndex) /* 325 */ -#define Tcl_UtfCharComplete \ - (tclStubsPtr->tcl_UtfCharComplete) /* 326 */ +#define TclUtfCharComplete \ + (tclStubsPtr->tclUtfCharComplete) /* 326 */ #define Tcl_UtfBackslash \ (tclStubsPtr->tcl_UtfBackslash) /* 327 */ #define Tcl_UtfFindFirst \ (tclStubsPtr->tcl_UtfFindFirst) /* 328 */ #define Tcl_UtfFindLast \ (tclStubsPtr->tcl_UtfFindLast) /* 329 */ -#define Tcl_UtfNext \ - (tclStubsPtr->tcl_UtfNext) /* 330 */ -#define Tcl_UtfPrev \ - (tclStubsPtr->tcl_UtfPrev) /* 331 */ +#define TclUtfNext \ + (tclStubsPtr->tclUtfNext) /* 330 */ +#define TclUtfPrev \ + (tclStubsPtr->tclUtfPrev) /* 331 */ #define Tcl_UtfToExternal \ (tclStubsPtr->tcl_UtfToExternal) /* 332 */ #define Tcl_UtfToExternalDString \ @@ -3933,6 +3942,12 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UniCharToUtfDString) /* 647 */ #define Tcl_UtfToUniCharDString \ (tclStubsPtr->tcl_UtfToUniCharDString) /* 648 */ +#define Tcl_UtfCharComplete \ + (tclStubsPtr->tcl_UtfCharComplete) /* 649 */ +#define Tcl_UtfNext \ + (tclStubsPtr->tcl_UtfNext) /* 650 */ +#define Tcl_UtfPrev \ + (tclStubsPtr->tcl_UtfPrev) /* 651 */ #endif /* defined(USE_TCL_STUBS) */ @@ -4178,10 +4193,16 @@ extern const TclStubs *tclStubsPtr; #define Tcl_Close(interp, chan) Tcl_CloseEx(interp, chan, 0) #endif -#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX > 3) +#undef TclUtfCharComplete +#undef TclUtfNext +#undef TclUtfPrev +#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX < 4) && !defined(TCL_NO_DEPRECATED) # undef Tcl_UtfCharComplete -# define Tcl_UtfCharComplete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 4) : tclStubsPtr->tcl_UtfCharComplete((src), (length))) +# undef Tcl_UtfNext +# undef Tcl_UtfPrev +# define Tcl_UtfCharComplete (tclStubsPtr->tclUtfCharComplete) +# define Tcl_UtfNext (tclStubsPtr->tclUtfNext) +# define Tcl_UtfPrev (tclStubsPtr->tclUtfPrev) #endif #endif /* _TCLDECLS */ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index ae02821..784d8d6 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2321,7 +2321,7 @@ UtfToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the * last UTF-8 character in the source buffer hasn't been cut off. @@ -2351,7 +2351,7 @@ UtfToUtfProc( *dst++ = 0; *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!TclUCS4Complete(src, srcEnd - src)) { + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an diff --git a/generic/tclInt.h b/generic/tclInt.h index 78d9f93..1b95754 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3252,14 +3252,8 @@ MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCount(int ch); #if TCL_UTF_MAX > 3 # define TclUtfToUCS4 Tcl_UtfToUniChar -# define TclUCS4Complete Tcl_UtfCharComplete -# define TclChar16Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 3) : Tcl_UtfCharComplete((src), (length))) #else MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); -# define TclUCS4Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) -# define TclChar16Complete Tcl_UtfCharComplete #endif MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); @@ -4695,14 +4689,6 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; (numChars) = _count; \ } while (0); -#define TclUtfPrev(src, start) \ - (((src) < (start) + 2) ? (start) : \ - ((unsigned char) *((src) - 1)) < 0x80 ? (src) - 1 : \ - Tcl_UtfPrev(src, start)) - -#define TclUtfNext(src) \ - ((((unsigned char) *(src)) < 0x80) ? (src) + 1 : Tcl_UtfNext(src)) - /* *---------------------------------------------------------------- * Macro that encapsulates the logic that determines when it is safe to diff --git a/generic/tclParse.c b/generic/tclParse.c index 132e804..49ee348 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -789,7 +789,7 @@ TclParseBackslash( * written. At most 4 bytes will be written there. */ { const char *p = src+1; - Tcl_UniChar unichar = 0; + int unichar; int result; int count; char buf[4] = ""; @@ -936,13 +936,13 @@ TclParseBackslash( */ if (Tcl_UtfCharComplete(p, numBytes - 1)) { - count = TclUtfToUniChar(p, &unichar) + 1; /* +1 for '\' */ + count = TclUtfToUCS4(p, &unichar) + 1; /* +1 for '\' */ } else { - char utfBytes[4]; + char utfBytes[8]; memcpy(utfBytes, p, numBytes - 1); utfBytes[numBytes - 1] = '\0'; - count = TclUtfToUniChar(utfBytes, &unichar) + 1; + count = TclUtfToUCS4(utfBytes, &unichar) + 1; } result = unichar; break; diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 78e49f9..2025674 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1171,10 +1171,10 @@ Tcl_AppendLimitedToObj( } eLen = strlen(ellipsis); while (eLen > limit) { - eLen = TclUtfPrev(ellipsis+eLen, ellipsis) - ellipsis; + eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis; } - toCopy = TclUtfPrev(bytes+limit+1-eLen, bytes) - bytes; + toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes; } /* @@ -2614,7 +2614,7 @@ AppendPrintfToObjVA( * multi-byte characters. */ - q = TclUtfPrev(end, bytes); + q = Tcl_UtfPrev(end, bytes); if (!Tcl_UtfCharComplete(q, (int)(end - q))) { end = q; } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 2d2bc63..ae9a4e3 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -88,6 +88,31 @@ static void uniCodePanic(void) { # define Tcl_UniCharNcmp (int(*)(const Tcl_UniChar *, const Tcl_UniChar *, unsigned long))(void *)uniCodePanic #endif +#define TclUtfCharComplete UtfCharComplete +#define TclUtfNext UtfNext +#define TclUtfPrev UtfPrev + +static int TclUtfCharComplete(const char *src, int length) { + if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { + return length < 5; + } + return Tcl_UtfCharComplete(src, length); +} + +static const char *TclUtfNext(const char *src) { + if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { + return src + 1; + } + return Tcl_UtfNext(src); +} + +static const char *TclUtfPrev(const char *src, const char *start) { + if (((unsigned)((unsigned char)*(src) - 0xF0) < 5) && (src >= start)) { + return src - 1; + } + return Tcl_UtfPrev(src, start); +} + #define TclBN_mp_add mp_add #define TclBN_mp_and mp_and #define TclBN_mp_clamp mp_clamp @@ -1549,12 +1574,12 @@ const TclStubs tclStubs = { Tcl_UniCharToUpper, /* 323 */ Tcl_UniCharToUtf, /* 324 */ Tcl_UtfAtIndex, /* 325 */ - Tcl_UtfCharComplete, /* 326 */ + TclUtfCharComplete, /* 326 */ Tcl_UtfBackslash, /* 327 */ Tcl_UtfFindFirst, /* 328 */ Tcl_UtfFindLast, /* 329 */ - Tcl_UtfNext, /* 330 */ - Tcl_UtfPrev, /* 331 */ + TclUtfNext, /* 330 */ + TclUtfPrev, /* 331 */ Tcl_UtfToExternal, /* 332 */ Tcl_UtfToExternalDString, /* 333 */ Tcl_UtfToLower, /* 334 */ @@ -1872,6 +1897,9 @@ const TclStubs tclStubs = { Tcl_UtfToUniChar, /* 646 */ Tcl_UniCharToUtfDString, /* 647 */ Tcl_UtfToUniCharDString, /* 648 */ + Tcl_UtfCharComplete, /* 649 */ + Tcl_UtfNext, /* 650 */ + Tcl_UtfPrev, /* 651 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclTest.c b/generic/tclTest.c index 1f6882f..78645b6 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -6844,10 +6844,10 @@ TestUtfNextCmd( memcpy(buffer + 1, bytes, numBytes); buffer[0] = buffer[numBytes + 1] = buffer[numBytes + 2] = buffer[numBytes + 3] = '\xA0'; - first = result = TclUtfNext(buffer + 1); + first = result = Tcl_UtfNext(buffer + 1); while ((buffer[0] = *p++) != '\0') { /* Run Tcl_UtfNext with many more possible bytes at src[-1], all should give the same result */ - result = TclUtfNext(buffer + 1); + result = Tcl_UtfNext(buffer + 1); if (first != result) { Tcl_AppendResult(interp, "Tcl_UtfNext is not supposed to read src[-1]", NULL); return TCL_ERROR; @@ -6856,7 +6856,7 @@ TestUtfNextCmd( p = tobetested; while ((buffer[numBytes + 1] = *p++) != '\0') { /* Run Tcl_UtfNext with many more possible bytes at src[end], all should give the same result */ - result = TclUtfNext(buffer + 1); + result = Tcl_UtfNext(buffer + 1); if (first != result) { first = buffer; break; @@ -6904,7 +6904,7 @@ TestUtfPrevCmd( } else { offset = numBytes; } - result = TclUtfPrev(bytes + offset, bytes); + result = Tcl_UtfPrev(bytes + offset, bytes); Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); return TCL_OK; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 7c09283..6f03053 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -88,13 +88,7 @@ static const unsigned char complete[256] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* End of "continuation byte section" */ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 3,3,3,3,3, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; /* @@ -694,7 +688,7 @@ Tcl_UtfToUniCharDString( p += TclUtfToUCS4(p, &ch); *w++ = ch; } - while ((p < endPtr) && TclUCS4Complete(p, endPtr-p)) { + while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { p += TclUtfToUCS4(p, &ch); *w++ = ch; } @@ -752,7 +746,7 @@ Tcl_UtfToChar16DString( *w++ = ch; } while (p < endPtr) { - if (TclChar16Complete(p, endPtr-p)) { + if (Tcl_UtfCharComplete(p, endPtr-p)) { p += Tcl_UtfToChar16(p, &ch); *w++ = ch; } else { @@ -833,7 +827,7 @@ Tcl_NumUtfChars( /* Pointer to the end of string. Never read endPtr[0] */ const char *endPtr = src + length; /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - TCL_UTF_MAX; + const char *optPtr = endPtr - 4; /* * Optimize away the call in this loop. Justified because... diff --git a/generic/tclUtil.c b/generic/tclUtil.c index ebc8656..40b249d 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1707,7 +1707,7 @@ TclTrimRight( const char *q = trim; int pInc = 0, bytesLeft = numTrim; - pp = TclUtfPrev(p, bytes); + pp = Tcl_UtfPrev(p, bytes); do { pp += pInc; pInc = TclUtfToUCS4(pp, &ch1); @@ -1858,7 +1858,7 @@ TclTrim( * that we will not trim. Skip over it. */ if (numBytes > 0) { const char *first = bytes + trimLeft; - bytes = TclUtfNext(first); + bytes = Tcl_UtfNext(first); numBytes -= (bytes - first); if (numBytes > 0) { -- cgit v0.12 From d6deb4d6d99f3ea3b0f50de0fdf0f06903f41956 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 22 May 2020 15:27:43 +0000 Subject: New "string" subcommands: "nextchar", "nextword", "prevchar", "prevword". Not implemented yet (for now same as "wordstart"/"wordend"). Deprecate "string bytelength". --- generic/tclCmdMZ.c | 8 ++++++++ tests/info.test | 4 ++-- tests/regexp.test | 4 ++-- tests/regexpComp.test | 4 ++-- tests/string.test | 19 ++++++++++--------- 5 files changed, 24 insertions(+), 15 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 8efdb27..0da143e 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2831,6 +2831,7 @@ StringCatCmd( * *---------------------------------------------------------------------- */ +#if !defined(TCL_NO_DEPRECATED) && TCL_MAJOR_VERSION < 9 static int StringBytesCmd( TCL_UNUSED(ClientData), @@ -2849,6 +2850,7 @@ StringBytesCmd( Tcl_SetObjResult(interp, Tcl_NewWideIntObj(length)); return TCL_OK; } +#endif /* *---------------------------------------------------------------------- @@ -3305,7 +3307,9 @@ TclInitStringCmd( Tcl_Interp *interp) /* Current interpreter. */ { static const EnsembleImplMap stringImplMap[] = { +#if !defined(TCL_NO_DEPRECATED) && TCL_MAJOR_VERSION < 9 {"bytelength", StringBytesCmd, TclCompileBasic1ArgCmd, NULL, NULL, 0}, +#endif {"cat", StringCatCmd, TclCompileStringCatCmd, NULL, NULL, 0}, {"compare", StringCmpCmd, TclCompileStringCmpCmd, NULL, NULL, 0}, {"equal", StringEqualCmd, TclCompileStringEqualCmd, NULL, NULL, 0}, @@ -3317,6 +3321,10 @@ TclInitStringCmd( {"length", StringLenCmd, TclCompileStringLenCmd, NULL, NULL, 0}, {"map", StringMapCmd, TclCompileStringMapCmd, NULL, NULL, 0}, {"match", StringMatchCmd, TclCompileStringMatchCmd, NULL, NULL, 0}, + {"nextchar", StringEndCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, + {"nextword", StringEndCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, + {"prevchar", StringStartCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, + {"prevword", StringStartCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, {"range", StringRangeCmd, TclCompileStringRangeCmd, NULL, NULL, 0}, {"repeat", StringReptCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, {"replace", StringRplcCmd, TclCompileStringReplaceCmd, NULL, NULL, 0}, diff --git a/tests/info.test b/tests/info.test index ce51523..18f5c7a 100644 --- a/tests/info.test +++ b/tests/info.test @@ -103,8 +103,8 @@ test info-2.5 {info body option, returning bytecompiled bodies} -body { # causing an empty string to be returned [Bug #545644] test info-2.6 {info body option, returning list bodies} { proc foo args [list subst bar] - list [string bytelength [info body foo]] \ - [foo; string bytelength [info body foo]] + list [string length [info body foo]] \ + [foo; string length [info body foo]] } {9 9} proc testinfocmdcount {} { diff --git a/tests/regexp.test b/tests/regexp.test index bae1217..03c55b7 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -760,8 +760,8 @@ test regexp-20.1 {regsub shared object shimmering} -body { set b $a set c abcdefghijklmnopqurstuvwxyz0123456789 regsub $a $c $b d - list $d [string length $d] [string bytelength $d] -} -result [list abcdefghijklmnopqurstuvwxyz0123456789 37 37] + list $d [string length $d] +} -result [list abcdefghijklmnopqurstuvwxyz0123456789 37] test regexp-20.2 {regsub shared object shimmering with -about} -body { eval regexp -about abc } -result {0 {}} diff --git a/tests/regexpComp.test b/tests/regexpComp.test index 8819dd2..390b003 100644 --- a/tests/regexpComp.test +++ b/tests/regexpComp.test @@ -798,9 +798,9 @@ test regexpComp-20.1 {regsub shared object shimmering} { set b $a set c abcdefghijklmnopqurstuvwxyz0123456789 regsub $a $c $b d - list $d [string length $d] [string bytelength $d] + list $d [string length $d] } -} [list abcdefghijklmnopqurstuvwxyz0123456789 37 37] +} [list abcdefghijklmnopqurstuvwxyz0123456789 37] test regexpComp-20.2 {regsub shared object shimmering with -about} { evalInProc { eval regexp -about abc diff --git a/tests/string.test b/tests/string.test index 12821c0..e68bbe3 100644 --- a/tests/string.test +++ b/tests/string.test @@ -33,6 +33,7 @@ testConstraint testindexobj [expr {[info commands testindexobj] ne {}}] testConstraint testevalex [expr {[info commands testevalex] ne {}}] testConstraint utf16 [expr {[string length \U010000] == 2}] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint nodep [info exists tcl_precision] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] @@ -72,9 +73,9 @@ if {$noComp} { } -test string-1.1.$noComp {error conditions} { +test string-1.1.$noComp {error conditions} -body { list [catch {run {string gorp a b}} msg] $msg -} {1 {unknown or ambiguous subcommand "gorp": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -match regexp -result {1 {unknown or ambiguous subcommand "gorp": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-1.2.$noComp {error conditions} { list [catch {run {string}} msg] $msg } {1 {wrong # args: should be "string subcommand ?arg ...?"}} @@ -1024,16 +1025,16 @@ test string-7.16.$noComp {string last, start index} { run {string last \334a \334ad\334ad end-1} } 3 -test string-8.1.$noComp {string bytelength} { +test string-8.1.$noComp {string bytelength} nodep { list [catch {run {string bytelength}} msg] $msg } {1 {wrong # args: should be "string bytelength string"}} -test string-8.2.$noComp {string bytelength} { +test string-8.2.$noComp {string bytelength} nodep { list [catch {run {string bytelength a b}} msg] $msg } {1 {wrong # args: should be "string bytelength string"}} -test string-8.3.$noComp {string bytelength} { +test string-8.3.$noComp {string bytelength} nodep { run {string bytelength "\xC7"} } 2 -test string-8.4.$noComp {string bytelength} { +test string-8.4.$noComp {string bytelength} nodep { run {string b ""} } 0 @@ -1799,9 +1800,9 @@ test string-19.3.$noComp {string trimleft, unicode default} { test string-20.1.$noComp {string trimright errors} { list [catch {run {string trimright}} msg] $msg } {1 {wrong # args: should be "string trimright string ?chars?"}} -test string-20.2.$noComp {string trimright errors} { +test string-20.2.$noComp {string trimright errors} -body { list [catch {run {string trimg a}} msg] $msg -} {1 {unknown or ambiguous subcommand "trimg": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -match regexp -result {1 {unknown or ambiguous subcommand "trimg": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-20.3.$noComp {string trimright} { run {string trimright " XYZ "} } { XYZ} @@ -1894,7 +1895,7 @@ test string-21.16.$noComp {string wordend, unicode} -constraints utf16 -body { test string-22.1.$noComp {string wordstart} -body { list [catch {run {string word a}} msg] $msg -} -result {1 {unknown or ambiguous subcommand "word": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-22.2.$noComp {string wordstart} -body { list [catch {run {string wordstart a}} msg] $msg } -result {1 {wrong # args: should be "string wordstart string index"}} -- cgit v0.12 From 0a2c1e30b152c1fcbd3180aadaf6b27039f07421 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 22 May 2020 21:28:16 +0000 Subject: Split more "string" functions. New helper function TclUniCharToUCS4(), not used yet but that's the next step. --- generic/tclCmdMZ.c | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++++- generic/tclInt.h | 2 + generic/tclUtf.c | 14 +++ 3 files changed, 284 insertions(+), 4 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 0da143e..0e624c6 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2547,6 +2547,146 @@ StringStartCmd( /* *---------------------------------------------------------------------- * + * StringPrevCharCmd -- + * + * This procedure is invoked to process the "string prevchar" Tcl + * command. See the user documentation for details on what it does. + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + +static int +StringPrevCharCmd( + TCL_UNUSED(ClientData), + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* Argument objects. */ +{ + int ch; + const char *p, *string; + int cur, index, length, numChars; + + if (objc != 3) { + Tcl_WrongNumArgs(interp, 1, objv, "string index"); + return TCL_ERROR; + } + + string = TclGetStringFromObj(objv[1], &length); + numChars = Tcl_NumUtfChars(string, length); + if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + return TCL_ERROR; + } + string = TclGetStringFromObj(objv[1], &length); + if (index >= numChars) { + index = numChars - 1; + } + cur = 0; + if (index > 0) { + p = Tcl_UtfAtIndex(string, index); + + TclUtfToUCS4(p, &ch); + for (cur = index; cur >= 0; cur--) { + int delta = 0; + const char *next; + + if (!Tcl_UniCharIsWordChar(ch)) { + break; + } + + next = Tcl_UtfPrev(p, string); + do { + next += delta; + delta = TclUtfToUCS4(next, &ch); + } while (next + delta < p); + p = next; + } + if (cur != index) { + cur += 1; + } + } + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + return TCL_OK; +} + +/* + *---------------------------------------------------------------------- + * + * StringPrevWordCmd -- + * + * This procedure is invoked to process the "string prevword" Tcl + * command. See the user documentation for details on what it does. + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + +static int +StringPrevWordCmd( + TCL_UNUSED(ClientData), + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* Argument objects. */ +{ + int ch; + const char *p, *string; + int cur, index, length, numChars; + + if (objc != 3) { + Tcl_WrongNumArgs(interp, 1, objv, "string index"); + return TCL_ERROR; + } + + string = TclGetStringFromObj(objv[1], &length); + numChars = Tcl_NumUtfChars(string, length); + if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + return TCL_ERROR; + } + string = TclGetStringFromObj(objv[1], &length); + if (index >= numChars) { + index = numChars - 1; + } + cur = 0; + if (index > 0) { + p = Tcl_UtfAtIndex(string, index); + + TclUtfToUCS4(p, &ch); + for (cur = index; cur >= 0; cur--) { + int delta = 0; + const char *next; + + if (!Tcl_UniCharIsWordChar(ch)) { + break; + } + + next = Tcl_UtfPrev(p, string); + do { + next += delta; + delta = TclUtfToUCS4(next, &ch); + } while (next + delta < p); + p = next; + } + if (cur != index) { + cur += 1; + } + } + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + return TCL_OK; +} + +/* + *---------------------------------------------------------------------- + * * StringEndCmd -- * * This procedure is invoked to process the "string wordend" Tcl command. @@ -2605,6 +2745,130 @@ StringEndCmd( return TCL_OK; } + +/* + *---------------------------------------------------------------------- + * + * StringNextCharCmd -- + * + * This procedure is invoked to process the "string nextchar" Tcl command. + * See the user documentation for details on what it does. + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + +static int +StringNextCharCmd( + TCL_UNUSED(ClientData), + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* Argument objects. */ +{ + int ch; + const char *p, *end, *string; + int cur, index, length, numChars; + + if (objc != 3) { + Tcl_WrongNumArgs(interp, 1, objv, "string index"); + return TCL_ERROR; + } + + string = TclGetStringFromObj(objv[1], &length); + numChars = Tcl_NumUtfChars(string, length); + if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + return TCL_ERROR; + } + string = TclGetStringFromObj(objv[1], &length); + if (index < 0) { + index = 0; + } + if (index < numChars) { + p = Tcl_UtfAtIndex(string, index); + end = string+length; + for (cur = index; p < end; cur++) { + p += TclUtfToUCS4(p, &ch); + if (!Tcl_UniCharIsWordChar(ch)) { + break; + } + } + if (cur == index) { + cur++; + } + } else { + cur = numChars; + } + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + return TCL_OK; +} + + +/* + *---------------------------------------------------------------------- + * + * StringNextWordCmd -- + * + * This procedure is invoked to process the "string nextword" Tcl command. + * See the user documentation for details on what it does. + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + +static int +StringNextWordCmd( + TCL_UNUSED(ClientData), + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* Argument objects. */ +{ + int ch; + const char *p, *end, *string; + int cur, index, length, numChars; + + if (objc != 3) { + Tcl_WrongNumArgs(interp, 1, objv, "string index"); + return TCL_ERROR; + } + + string = TclGetStringFromObj(objv[1], &length); + numChars = Tcl_NumUtfChars(string, length); + if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + return TCL_ERROR; + } + string = TclGetStringFromObj(objv[1], &length); + if (index < 0) { + index = 0; + } + if (index < numChars) { + p = Tcl_UtfAtIndex(string, index); + end = string+length; + for (cur = index; p < end; cur++) { + p += TclUtfToUCS4(p, &ch); + if (!Tcl_UniCharIsWordChar(ch)) { + break; + } + } + if (cur == index) { + cur++; + } + } else { + cur = numChars; + } + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + return TCL_OK; +} + /* *---------------------------------------------------------------------- * @@ -3321,10 +3585,10 @@ TclInitStringCmd( {"length", StringLenCmd, TclCompileStringLenCmd, NULL, NULL, 0}, {"map", StringMapCmd, TclCompileStringMapCmd, NULL, NULL, 0}, {"match", StringMatchCmd, TclCompileStringMatchCmd, NULL, NULL, 0}, - {"nextchar", StringEndCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, - {"nextword", StringEndCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, - {"prevchar", StringStartCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, - {"prevword", StringStartCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, + {"nextchar", StringNextCharCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, + {"nextword", StringNextWordCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, + {"prevchar", StringPrevCharCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, + {"prevword", StringPrevWordCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, {"range", StringRangeCmd, TclCompileStringRangeCmd, NULL, NULL, 0}, {"repeat", StringReptCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, {"replace", StringRplcCmd, TclCompileStringReplaceCmd, NULL, NULL, 0}, diff --git a/generic/tclInt.h b/generic/tclInt.h index 1b95754..ef7411a 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3252,8 +3252,10 @@ MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCount(int ch); #if TCL_UTF_MAX > 3 # define TclUtfToUCS4 Tcl_UtfToUniChar +# define TclUniCharToUCS4(src, ptr) (*ptr = *(src),1) #else MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); + MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *src, int *ucs4Ptr); #endif MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index fd6ec1b..db2fc02 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2634,6 +2634,20 @@ TclUtfToUCS4( /* Make use of the #undef Tcl_UtfToUniChar above, which already handles UCS4. */ return Tcl_UtfToUniChar(src, ucs4Ptr); } + +int +TclUniCharToUCS4( + const Tcl_UniChar *src, /* The Tcl_UniChar string. */ + int *ucs4Ptr) /* Filled with the UCS4 codepoint represented + * by the Tcl_UniChar string. */ +{ + if (((src[0] & 0xFC00) == 0xD800) && ((src[1] & 0xFC00) == 0xDC00)) { + *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[01] & 0x3FF)) + 0x10000; + return 2; + } + *ucs4Ptr = src[0]; + return 1; +} #endif /* -- cgit v0.12 From 1075137cfc201f4c0aee86f118ed0b7e44febc24 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 23 May 2020 21:51:12 +0000 Subject: Fix testsuite when "string bytelength" doesn't exist. --- library/init.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/init.tcl b/library/init.tcl index 4ea22d8..8ebd29e6 100644 --- a/library/init.tcl +++ b/library/init.tcl @@ -209,9 +209,9 @@ proc unknown args { set errInfo [dict get $opts -errorinfo] set errCode [dict get $opts -errorcode] set cinfo $args - if {[string bytelength $cinfo] > 150} { + if {[string length $cinfo] > 150} { set cinfo [string range $cinfo 0 150] - while {[string bytelength $cinfo] > 150} { + while {[string length $cinfo] > 150} { set cinfo [string range $cinfo 0 end-1] } append cinfo ... -- cgit v0.12 From 017257e0ef3f20643166931986ea36eeee97f049 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 23 May 2020 22:07:53 +0000 Subject: Rewrite "string wordend" to use the Tcl_UniChar array. --- generic/tclCmdMZ.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 0e624c6..88bf2ec 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2709,28 +2709,26 @@ StringEndCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *end, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *end, *string; + int cur, index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); if (index < 0) { index = 0; } - if (index < numChars) { - p = Tcl_UtfAtIndex(string, index); + if (index < length) { + p = &string[index]; end = string+length; for (cur = index; p < end; cur++) { - p += TclUtfToUCS4(p, &ch); + p += TclUniCharToUCS4(p, &ch); if (!Tcl_UniCharIsWordChar(ch)) { break; } @@ -2739,7 +2737,7 @@ StringEndCmd( cur++; } } else { - cur = numChars; + cur = length; } Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); return TCL_OK; -- cgit v0.12 From 2b1daf9bb29fdba966f86c054d96d564b7539684 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 24 May 2020 22:29:07 +0000 Subject: Put back "string bytelength", not _that_ important for this TIP. Document that Tcl_UtfCharComplete() can be used now to protect Tcl_UtfNext() --- doc/Utf.3 | 11 ++++++----- generic/tclCmdMZ.c | 4 ---- library/init.tcl | 4 ++-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index 4b5b162..6ebf57d 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -233,10 +233,10 @@ characters. .PP \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR of \fIlength\fR bytes is long enough to be decoded by -\fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee -that the UTF-8 string is properly formed. This routine is used by -procedures that are operating on a byte at a time and need to know if a -full Unicode character has been seen. +\fBTcl_UtfToUniChar\fR/\fBTcl_UtfNext\fR, or 0 otherwise. This function +does not guarantee that the UTF-8 string is properly formed. This routine +is used by procedures that are operating on a byte at a time and need to +know if a full Unicode character has been seen. .PP \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It returns the number of Tcl_UniChars that are represented by the UTF-8 string @@ -257,7 +257,8 @@ Given \fIsrc\fR, a pointer to some location in a UTF-8 string, \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the string. The caller must not ask for the next character after the last character in the string if the string is not terminated by a null -character. +character. \fBTcl_UtfCharComplete\fR can be used in that case to +make sure enough bytes are available before calling \fBTcl_UtfNext\fR. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 88bf2ec..bbd03d8 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -3093,7 +3093,6 @@ StringCatCmd( * *---------------------------------------------------------------------- */ -#if !defined(TCL_NO_DEPRECATED) && TCL_MAJOR_VERSION < 9 static int StringBytesCmd( TCL_UNUSED(ClientData), @@ -3112,7 +3111,6 @@ StringBytesCmd( Tcl_SetObjResult(interp, Tcl_NewWideIntObj(length)); return TCL_OK; } -#endif /* *---------------------------------------------------------------------- @@ -3569,9 +3567,7 @@ TclInitStringCmd( Tcl_Interp *interp) /* Current interpreter. */ { static const EnsembleImplMap stringImplMap[] = { -#if !defined(TCL_NO_DEPRECATED) && TCL_MAJOR_VERSION < 9 {"bytelength", StringBytesCmd, TclCompileBasic1ArgCmd, NULL, NULL, 0}, -#endif {"cat", StringCatCmd, TclCompileStringCatCmd, NULL, NULL, 0}, {"compare", StringCmpCmd, TclCompileStringCmpCmd, NULL, NULL, 0}, {"equal", StringEqualCmd, TclCompileStringEqualCmd, NULL, NULL, 0}, diff --git a/library/init.tcl b/library/init.tcl index 8ebd29e6..4ea22d8 100644 --- a/library/init.tcl +++ b/library/init.tcl @@ -209,9 +209,9 @@ proc unknown args { set errInfo [dict get $opts -errorinfo] set errCode [dict get $opts -errorcode] set cinfo $args - if {[string length $cinfo] > 150} { + if {[string bytelength $cinfo] > 150} { set cinfo [string range $cinfo 0 150] - while {[string length $cinfo] > 150} { + while {[string bytelength $cinfo] > 150} { set cinfo [string range $cinfo 0 end-1] } append cinfo ... -- cgit v0.12 From 2d7e36f00618d7f309c3970366b10fb888b83eea Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 25 May 2020 07:48:15 +0000 Subject: Fix "string is wordchar" in compiled case handling characters > U+FFFF. Adapt testcase exposing the problem. --- generic/tclExecute.c | 6 ++++-- tests/string.test | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 5708772..cc366e7 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5543,9 +5543,11 @@ TEBCresume( ustring1 = Tcl_GetUnicodeFromObj(valuePtr, &length); match = 1; if (length > 0) { + int ch; end = ustring1 + length; - for (p=ustring1 ; p Date: Mon, 25 May 2020 11:53:11 +0000 Subject: Finish implementation of "string nextchar|nextword|prevchar|prevword". Not thourougly test yet, but seems OK at first sight. --- generic/tclCmdMZ.c | 163 ++++++++++++++++++----------------------- generic/tclExecute.c | 2 +- generic/tclInt.h | 6 +- generic/tclUtf.c | 15 +++- tests/string.test | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 290 insertions(+), 96 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index bbd03d8..36b2443 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2499,40 +2499,38 @@ StringStartCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *string; + int cur, index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - if (index >= numChars) { - index = numChars - 1; + if (index >= length) { + index = length - 1; } cur = 0; if (index > 0) { - p = Tcl_UtfAtIndex(string, index); + p = &string[index]; - TclUtfToUCS4(p, &ch); + TclUniCharToUCS4(p, &ch); for (cur = index; cur >= 0; cur--) { int delta = 0; - const char *next; + const Tcl_UniChar *next; if (!Tcl_UniCharIsWordChar(ch)) { break; } - next = Tcl_UtfPrev(p, string); + next = TclUCS4Prev(p, string); do { next += delta; - delta = TclUtfToUCS4(next, &ch); + delta = TclUniCharToUCS4(next, &ch); } while (next + delta < p); p = next; } @@ -2568,49 +2566,28 @@ StringPrevCharCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - int ch; - const char *p, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *string; + int index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - if (index >= numChars) { - index = numChars - 1; + if (index > length) { + index = length; } - cur = 0; if (index > 0) { - p = Tcl_UtfAtIndex(string, index); - - TclUtfToUCS4(p, &ch); - for (cur = index; cur >= 0; cur--) { - int delta = 0; - const char *next; - - if (!Tcl_UniCharIsWordChar(ch)) { - break; - } - - next = Tcl_UtfPrev(p, string); - do { - next += delta; - delta = TclUtfToUCS4(next, &ch); - } while (next + delta < p); - p = next; - } - if (cur != index) { - cur += 1; - } + p = &string[index]; + index = TclUCS4Prev(p, string) - string; + } else { + index = 0; } - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); return TCL_OK; } @@ -2639,40 +2616,53 @@ StringPrevWordCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *string; + int cur, index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - if (index >= numChars) { - index = numChars - 1; + if (index >= length) { + index = length - 1; } cur = 0; if (index > 0) { - p = Tcl_UtfAtIndex(string, index); + p = &string[index]; - TclUtfToUCS4(p, &ch); + TclUniCharToUCS4(p, &ch); for (cur = index; cur >= 0; cur--) { int delta = 0; - const char *next; + const Tcl_UniChar *next; if (!Tcl_UniCharIsWordChar(ch)) { break; } - next = Tcl_UtfPrev(p, string); + next = TclUCS4Prev(p, string); do { next += delta; - delta = TclUtfToUCS4(next, &ch); + delta = TclUniCharToUCS4(next, &ch); + } while (next + delta < p); + p = next; + } + for (; cur >= 0; cur--) { + int delta = 0; + const Tcl_UniChar *next; + + if (Tcl_UniCharIsWordChar(ch)) { + break; + } + + next = TclUCS4Prev(p, string); + do { + next += delta; + delta = TclUniCharToUCS4(next, &ch); } while (next + delta < p); p = next; } @@ -2769,39 +2759,27 @@ StringNextCharCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *end, *string; - int cur, index, length, numChars; + const Tcl_UniChar *string; + int index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); if (index < 0) { index = 0; } - if (index < numChars) { - p = Tcl_UtfAtIndex(string, index); - end = string+length; - for (cur = index; p < end; cur++) { - p += TclUtfToUCS4(p, &ch); - if (!Tcl_UniCharIsWordChar(ch)) { - break; - } - } - if (cur == index) { - cur++; - } + if (index < length) { + index += TclUniCharToUCS4(&string[index], &ch); } else { - cur = numChars; + index = length; } - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); return TCL_OK; } @@ -2831,39 +2809,40 @@ StringNextWordCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *end, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *end, *string; + int index, length; if (objc != 3) { Tcl_WrongNumArgs(interp, 1, objv, "string index"); return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); if (index < 0) { index = 0; } - if (index < numChars) { - p = Tcl_UtfAtIndex(string, index); + if (index < length) { + p = &string[index]; end = string+length; - for (cur = index; p < end; cur++) { - p += TclUtfToUCS4(p, &ch); + while (index++, p < end) { + p += TclUniCharToUCS4(p, &ch); if (!Tcl_UniCharIsWordChar(ch)) { break; } } - if (cur == index) { - cur++; + for (; p < end; index++) { + p += TclUniCharToUCS4(p, &ch); + if (Tcl_UniCharIsWordChar(ch)) { + break; + } } } else { - cur = numChars; + index = length; } - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); return TCL_OK; } diff --git a/generic/tclExecute.c b/generic/tclExecute.c index cc366e7..c80e12b 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5292,7 +5292,7 @@ TEBCresume( } else if (TclIsPureByteArray(valuePtr)) { objResultPtr = Tcl_NewByteArrayObj( Tcl_GetByteArrayFromObj(valuePtr, NULL)+index, 1); - } else if (valuePtr->bytes && length == valuePtr->length) { + } else if (valuePtr->bytes && length == valuePtr->length && !(valuePtr->bytes[index] & 0x80)) { objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { diff --git a/generic/tclInt.h b/generic/tclInt.h index ef7411a..e1dedda 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3253,9 +3253,11 @@ MODULE_SCOPE int TclUtfCount(int ch); #if TCL_UTF_MAX > 3 # define TclUtfToUCS4 Tcl_UtfToUniChar # define TclUniCharToUCS4(src, ptr) (*ptr = *(src),1) +# define TclUCS4Prev(src, ptr) (((src) > (ptr)) ? ((src) - 1) : (src)) #else - MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); - MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *src, int *ucs4Ptr); + MODULE_SCOPE int TclUtfToUCS4(const char *, int *); + MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *, int *); + MODULE_SCOPE const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *, const Tcl_UniChar *); #endif MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index db2fc02..807e087 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2642,12 +2642,25 @@ TclUniCharToUCS4( * by the Tcl_UniChar string. */ { if (((src[0] & 0xFC00) == 0xD800) && ((src[1] & 0xFC00) == 0xDC00)) { - *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[01] & 0x3FF)) + 0x10000; + *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000; return 2; } *ucs4Ptr = src[0]; return 1; } + +const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *src, const Tcl_UniChar *ptr) { + if (src <= ptr + 1) { + return ptr; + } + if (((src[-1] & 0xFC00) == 0xDC00) && ((src[-2] & 0xFC00) == 0xD800)) { + return src - 2; + } + return src - 1; +} + + + #endif /* diff --git a/tests/string.test b/tests/string.test index 184a555..17a6d3c 100644 --- a/tests/string.test +++ b/tests/string.test @@ -2534,6 +2534,206 @@ test string-32.17.$noComp {string is dict, valid dict packed in invalid dict} { string is dict {{a b c d e f g h}} } 0 +test string-33.1.$noComp {string nextchar} -body { + list [catch {run {string nextchar a}} msg] $msg +} -result {1 {wrong # args: should be "string nextchar string index"}} +test string-33.2.$noComp {string nextchar} -body { + list [catch {run {string nextchar a b c}} msg] $msg +} -result {1 {wrong # args: should be "string nextchar string index"}} +test string-33.3.$noComp {string nextchar} -body { + list [catch {run {string nextchar a gorp}} msg] $msg +} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} +test string-33.4.$noComp {string nextchar} -body { + run {string nextchar abc. -1} +} -result 1 +test string-33.5.$noComp {string nextchar} -body { + run {string nextchar abc. 100} +} -result 4 +test string-33.6.$noComp {string nextchar} -body { + run {string nextchar "word_one two three" 2} +} -result 3 +test string-33.7.$noComp {string nextchar} -body { + run {string nextchar "one .&# three" 5} +} -result 6 +test string-33.8.$noComp {string nextchar} -body { + run {string worde "x.y" 0} +} -result 1 +test string-33.9.$noComp {string nextchar} -body { + run {string worde "x.y" end-1} +} -result 2 +test string-33.10.$noComp {string nextchar, unicode} -body { + run {string nextchar "xyz\xC7de fg" 0} +} -result 1 +test string-33.11.$noComp {string nextchar, unicode} -body { + run {string nextchar "xyz\uC700de fg" 0} +} -result 1 +test string-33.12.$noComp {string nextchar, unicode} -body { + run {string nextchar "xyz\u203Fde fg" 0} +} -result 1 +test string-33.13.$noComp {string nextchar, unicode} -body { + run {string nextchar "xyz\u2045de fg" 0} +} -result 1 +test string-33.14.$noComp {string nextchar, unicode} -body { + run {string nextchar "\uC700\uC700 abc" 8} +} -result 6 +test string-33.15.$noComp {string nextchar, unicode} -constraints utf16 -body { + run {string nextchar "\U1D7CA\U1D7CA abc" 0} +} -result 2 +test string-33.16.$noComp {string nextchar, unicode} -constraints utf16 -body { + run {string nextchar "\U1D7CA\U1D7CA abc" 10} +} -result 8 + +test string-34.1.$noComp {string nextword} -body { + list [catch {run {string nextword a}} msg] $msg +} -result {1 {wrong # args: should be "string nextword string index"}} +test string-34.2.$noComp {string nextword} -body { + list [catch {run {string nextword a b c}} msg] $msg +} -result {1 {wrong # args: should be "string nextword string index"}} +test string-34.3.$noComp {string nextword} -body { + list [catch {run {string nextword a gorp}} msg] $msg +} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} +test string-34.4.$noComp {string nextword} -body { + run {string nextword abc. -1} +} -result 4 +test string-34.5.$noComp {string nextword} -body { + run {string nextword abc. 100} +} -result 4 +test string-34.6.$noComp {string nextword} -body { + run {string nextword "word_one two three" 2} +} -result 9 +test string-34.7.$noComp {string nextword} -body { + run {string nextword "one .&# three" 5} +} -result 8 +test string-34.8.$noComp {string nextword} -body { + run {string worde "x.y" 0} +} -result 1 +test string-34.9.$noComp {string nextword} -body { + run {string worde "x.y" end-1} +} -result 2 +test string-34.10.$noComp {string nextword, unicode} -body { + run {string nextword "xyz\xC7de fg" 0} +} -result 7 +test string-34.11.$noComp {string nextword, unicode} -body { + run {string nextword "xyz\uC700de fg" 0} +} -result 7 +test string-34.12.$noComp {string nextword, unicode} -body { + run {string nextword "xyz\u203Fde fg" 0} +} -result 7 +test string-34.13.$noComp {string nextword, unicode} -body { + run {string nextword "xyz\u2045\u2045de fg" 0} +} -result 5 +test string-34.14.$noComp {string nextword, unicode} -body { + run {string nextword "\uC700\uC700 abc" 8} +} -result 6 +test string-34.15.$noComp {string nextword, unicode} -body { + run {string nextword "\U1D7CA\U1D7CA abc" 0} +} -result 3 +test string-34.16.$noComp {string nextword, unicode} -constraints utf16 -body { + run {string nextword "\U1D7CA\U1D7CA abc" 10} +} -result 8 + +test string-35.1.$noComp {string prevchar} -body { + list [catch {run {string word a}} msg] $msg +} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +test string-35.2.$noComp {string prevchar} -body { + list [catch {run {string prevchar a}} msg] $msg +} -result {1 {wrong # args: should be "string prevchar string index"}} +test string-35.3.$noComp {string prevchar} -body { + list [catch {run {string prevchar a b c}} msg] $msg +} -result {1 {wrong # args: should be "string prevchar string index"}} +test string-35.4.$noComp {string prevchar} -body { + list [catch {run {string prevchar a gorp}} msg] $msg +} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} +test string-35.5.$noComp {string prevchar} -body { + run {string prevchar "one two three_words" 400} +} -result 18 +test string-35.6.$noComp {string prevchar} -body { + run {string prevchar "one two three_words" 2} +} -result 1 +test string-35.7.$noComp {string prevchar} -body { + run {string prevchar "one two three_words" -2} +} -result 0 +test string-35.8.$noComp {string prevchar} -body { + run {string prevchar "one .*&^ three" 6} +} -result 5 +test string-35.9.$noComp {string prevchar} -body { + run {string prevchar "one two three" 4} +} -result 3 +test string-35.10.$noComp {string prevchar} -body { + run {string prevchar "one two three" end-5} +} -result 6 +test string-35.11.$noComp {string prevchar, unicode} -body { + run {string prevchar "one tw\xC7o three" 7} +} -result 6 +test string-35.12.$noComp {string prevchar, unicode} -body { + run {string prevchar "ab\uC700\uC700 cdef ghi" 12} +} -result 11 +test string-35.13.$noComp {string prevchar, unicode} -body { + run {string prevchar "\uC700\uC700 abc" 8} +} -result 5 +test string-35.14.$noComp {string prevchar, invalid UTF-8} -constraints testbytestring -body { + # See Bug c61818e4c9 + set demo [testbytestring "abc def\xE0\xA9ghi"] + run {string index $demo [string prevchar $demo 10]} +} -result g +test string-35.15.$noComp {string prevchar, unicode} -body { + run {string prevchar "\U1D7CA\U1D7CA abc" 0} +} -result 0 +test string-35.16.$noComp {string prevchar, unicode} -constraints utf16 -body { + run {string prevchar "\U1D7CA\U1D7CA abc" 10} +} -result 7 + +test string-36.1.$noComp {string prevword} -body { + list [catch {run {string word a}} msg] $msg +} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, compare, equal, first, index, insert, is, last, length, map, match, nextchar, nextword, prevchar, prevword, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +test string-36.2.$noComp {string prevword} -body { + list [catch {run {string prevword a}} msg] $msg +} -result {1 {wrong # args: should be "string prevword string index"}} +test string-36.3.$noComp {string prevword} -body { + list [catch {run {string prevword a b c}} msg] $msg +} -result {1 {wrong # args: should be "string prevword string index"}} +test string-36.4.$noComp {string prevword} -body { + list [catch {run {string prevword a gorp}} msg] $msg +} -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} +test string-36.5.$noComp {string prevword} -body { + run {string prevword "one two three_words" 400} +} -result 7 +test string-36.6.$noComp {string prevword} -body { + run {string prevword "one two three_words" 2} +} -result 0 +test string-36.7.$noComp {string prevword} -body { + run {string prevword "one two three_words" -2} +} -result 0 +test string-36.8.$noComp {string prevword} -body { + run {string prevword "one .*&^ three" 6} +} -result 3 +test string-36.9.$noComp {string prevword} -body { + run {string prevword "one two three" 4} +} -result 3 +test string-36.10.$noComp {string prevword} -body { + run {string prevword "one two three" end-5} +} -result 7 +test string-36.11.$noComp {string prevword, unicode} -body { + run {string prevword "one tw\xC7o three" 7} +} -result 3 +test string-36.12.$noComp {string prevword, unicode} -body { + run {string prevword "ab\uC700\uC700 cdef ghi" 12} +} -result 9 +test string-36.13.$noComp {string prevword, unicode} -body { + run {string prevword "\uC700\uC700 abc" 8} +} -result 2 +test string-36.14.$noComp {string prevword, invalid UTF-8} -constraints testbytestring -body { + # See Bug c61818e4c9 + set demo [testbytestring "abc def\xE0\xA9ghi"] + run {string index $demo [string prevword $demo 10]} +} -result \xA9 +test string-36.15.$noComp {string prevword, unicode} -body { + run {string prevword "\U1D7CA\U1D7CA abc" 0} +} -result 0 +test string-36.16.$noComp {string prevword, unicode} -constraints utf16 -body { + run {string prevword "\U1D7CA\U1D7CA abc" 10} +} -result 4 + }; # foreach noComp {0 1} # cleanup -- cgit v0.12 From a09671a0a00f2d3e4abf4747a072da94b0320459 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 29 May 2020 13:36:58 +0000 Subject: Change implementation "charstart", not behaving as "prevchar" any more. Also optimize charend/charstart for TCL_UTF_MAX>3 (not need to do actual conversion then). --- generic/tclCmdMZ.c | 59 +++++++++++++++++++++++++++++++++++------------------- tests/string.test | 59 +++++++++++++++++++++++++++--------------------------- 2 files changed, 67 insertions(+), 51 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index e15e5c8..63268a4 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2518,7 +2518,7 @@ StringStartCmd( if (index > 0) { p = &string[index]; - TclUniCharToUCS4(p, &ch); + (void)TclUniCharToUCS4(p, &ch); for (cur = index; cur >= 0; cur--) { int delta = 0; const Tcl_UniChar *next; @@ -2537,8 +2537,6 @@ StringStartCmd( if (cur != index) { cur += 1; } - } else { - cur = -1; } Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); return TCL_OK; @@ -2568,7 +2566,11 @@ StringCharStartCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - const Tcl_UniChar *p, *string; +#if TCL_UTF_MAX <= 3 + const Tcl_UniChar *src; +#else + const char *src; +#endif int index, length; if (objc != 3) { @@ -2576,18 +2578,23 @@ StringCharStartCmd( return TCL_ERROR; } - string = Tcl_GetUnicodeFromObj(objv[1], &length); +#if TCL_UTF_MAX <= 3 + src = Tcl_GetUnicodeFromObj(objv[1], &length); +#else + src = Tcl_GetStringFromObj(objv[1], &length); + length = Tcl_NumUtfChars(src, length); +#endif if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - if (index > length) { + if (index >= length) { index = length; - } - if (index > 0) { - p = &string[index]; - index = TclUCS4Prev(p, string) - string; - } else { - index = 0; + } else if (index < 0) { + index = -1; +#if TCL_UTF_MAX <= 3 + } else if ((index > 0) && ((src[index-1] & 0xFC00) == 0xD800) && ((src[index] & 0xFC00) == 0xDC00)) { + index--; +#endif } Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); return TCL_OK; @@ -2646,7 +2653,7 @@ StringEndCmd( cur++; } } else { - cur = -1; + cur = length; } Tcl_SetObjResult(interp, Tcl_NewWideIntObj(cur)); return TCL_OK; @@ -2676,8 +2683,11 @@ StringCharEndCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - int ch; - const Tcl_UniChar *string; +#if TCL_UTF_MAX <= 3 + const Tcl_UniChar *src; +#else + const char *src; +#endif int index, length; if (objc != 3) { @@ -2685,17 +2695,24 @@ StringCharEndCmd( return TCL_ERROR; } - string = Tcl_GetUnicodeFromObj(objv[1], &length); +#if TCL_UTF_MAX <= 3 + src = Tcl_GetUnicodeFromObj(objv[1], &length); +#else + src = Tcl_GetStringFromObj(objv[1], &length); + length = Tcl_NumUtfChars(src, length); +#endif if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - if (index < 0) { + if (++index < 0) { index = 0; } - if (index < length) { - index += TclUniCharToUCS4(&string[index], &ch); - } else { - index = -1; + if (index >= length) { + index = length; +#if TCL_UTF_MAX <= 3 + } else if ((index > 0) && ((src[index-1] & 0xFC00) == 0xD800) && ((src[index] & 0xFC00) == 0xDC00)) { + index++; +#endif } Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); return TCL_OK; diff --git a/tests/string.test b/tests/string.test index d868610..cddd506 100644 --- a/tests/string.test +++ b/tests/string.test @@ -33,7 +33,6 @@ testConstraint testindexobj [expr {[info commands testindexobj] ne {}}] testConstraint testevalex [expr {[info commands testevalex] ne {}}] testConstraint utf16 [expr {[string length \U010000] == 2}] testConstraint testbytestring [llength [info commands testbytestring]] -testConstraint nodep [info exists tcl_precision] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] @@ -73,9 +72,9 @@ if {$noComp} { } -test string-1.1.$noComp {error conditions} -body { +test string-1.1.$noComp {error conditions} { list [catch {run {string gorp a b}} msg] $msg -} -match regexp -result {1 {unknown or ambiguous subcommand "gorp": must be (bytelength, |)cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} {1 {unknown or ambiguous subcommand "gorp": must be bytelength, cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-1.2.$noComp {error conditions} { list [catch {run {string}} msg] $msg } {1 {wrong # args: should be "string subcommand ?arg ...?"}} @@ -1025,16 +1024,16 @@ test string-7.16.$noComp {string last, start index} { run {string last \334a \334ad\334ad end-1} } 3 -test string-8.1.$noComp {string bytelength} nodep { +test string-8.1.$noComp {string bytelength} { list [catch {run {string bytelength}} msg] $msg } {1 {wrong # args: should be "string bytelength string"}} -test string-8.2.$noComp {string bytelength} nodep { +test string-8.2.$noComp {string bytelength} { list [catch {run {string bytelength a b}} msg] $msg } {1 {wrong # args: should be "string bytelength string"}} -test string-8.3.$noComp {string bytelength} nodep { +test string-8.3.$noComp {string bytelength} { run {string bytelength "\xC7"} } 2 -test string-8.4.$noComp {string bytelength} nodep { +test string-8.4.$noComp {string bytelength} { run {string b ""} } 0 @@ -1800,9 +1799,9 @@ test string-19.3.$noComp {string trimleft, unicode default} { test string-20.1.$noComp {string trimright errors} { list [catch {run {string trimright}} msg] $msg } {1 {wrong # args: should be "string trimright string ?chars?"}} -test string-20.2.$noComp {string trimright errors} -body { +test string-20.2.$noComp {string trimright errors} { list [catch {run {string trimg a}} msg] $msg -} -match regexp -result {1 {unknown or ambiguous subcommand "trimg": must be (bytelength, |)cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} {1 {unknown or ambiguous subcommand "trimg": must be bytelength, cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-20.3.$noComp {string trimright} { run {string trimright " XYZ "} } { XYZ} @@ -1858,7 +1857,7 @@ test string-21.4.$noComp {string wordend} -body { } -result 3 test string-21.5.$noComp {string wordend} -body { run {string wordend abc. 100} -} -result -1 +} -result 4 test string-21.6.$noComp {string wordend} -body { run {string wordend "word_one two three" 2} } -result 8 @@ -1885,17 +1884,17 @@ test string-21.13.$noComp {string wordend, unicode} -body { } -result 3 test string-21.14.$noComp {string wordend, unicode} -body { run {string wordend "\uC700\uC700 abc" 8} -} -result -1 +} -result 6 test string-21.15.$noComp {string wordend, unicode} -body { run {string wordend "\U1D7CA\U1D7CA abc" 0} } -result 2 test string-21.16.$noComp {string wordend, unicode} -constraints utf16 -body { run {string wordend "\U1D7CA\U1D7CA abc" 10} -} -result -1 +} -result 8 test string-22.1.$noComp {string wordstart} -body { list [catch {run {string word a}} msg] $msg -} -match regexp -result {1 {unknown or ambiguous subcommand "word": must be (bytelength, |)cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -result {1 {unknown or ambiguous subcommand "word": must be bytelength, cat, charend, charstart, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-22.2.$noComp {string wordstart} -body { list [catch {run {string wordstart a}} msg] $msg } -result {1 {wrong # args: should be "string wordstart string index"}} @@ -1913,7 +1912,7 @@ test string-22.6.$noComp {string wordstart} -body { } -result 0 test string-22.7.$noComp {string wordstart} -body { run {string wordstart "one two three_words" -2} -} -result -1 +} -result 0 test string-22.8.$noComp {string wordstart} -body { run {string wordstart "one .*&^ three" 6} } -result 6 @@ -1939,7 +1938,7 @@ test string-22.14.$noComp {string wordstart, invalid UTF-8} -constraints testbyt } -result g test string-22.15.$noComp {string wordstart, unicode} -body { run {string wordstart "\U1D7CA\U1D7CA abc" 0} -} -result -1 +} -result 0 test string-22.16.$noComp {string wordstart, unicode} -constraints utf16 -body { run {string wordstart "\U1D7CA\U1D7CA abc" 10} } -result 5 @@ -2545,10 +2544,10 @@ test string-33.3.$noComp {string charend} -body { } -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} test string-33.4.$noComp {string charend} -body { run {string charend abc. -1} -} -result 1 +} -result 0 test string-33.5.$noComp {string charend} -body { run {string charend abc. 100} -} -result -1 +} -result 4 test string-33.6.$noComp {string charend} -body { run {string charend "word_one two three" 2} } -result 3 @@ -2575,13 +2574,13 @@ test string-33.13.$noComp {string charend, unicode} -body { } -result 1 test string-33.14.$noComp {string charend, unicode} -body { run {string charend "\uC700\uC700 abc" 8} -} -result -1 +} -result 6 test string-33.15.$noComp {string charend, unicode} -constraints utf16 -body { run {string charend "\U1D7CA\U1D7CA abc" 0} } -result 2 test string-33.16.$noComp {string charend, unicode} -constraints utf16 -body { run {string charend "\U1D7CA\U1D7CA abc" 10} -} -result -1 +} -result 8 test string-34.1.$noComp {string charstart} -body { list [catch {run {string word a}} msg] $msg @@ -2597,42 +2596,42 @@ test string-34.4.$noComp {string charstart} -body { } -result {1 {bad index "gorp": must be integer?[+-]integer? or end?[+-]integer?}} test string-34.5.$noComp {string charstart} -body { run {string charstart "one two three_words" 400} -} -result 18 +} -result 19 test string-34.6.$noComp {string charstart} -body { run {string charstart "one two three_words" 2} -} -result 1 +} -result 2 test string-34.7.$noComp {string charstart} -body { run {string charstart "one two three_words" -2} -} -result 0 +} -result -1 test string-34.8.$noComp {string charstart} -body { run {string charstart "one .*&^ three" 6} -} -result 5 +} -result 6 test string-34.9.$noComp {string charstart} -body { run {string charstart "one two three" 4} -} -result 3 +} -result 4 test string-34.10.$noComp {string charstart} -body { run {string charstart "one two three" end-5} -} -result 6 +} -result 7 test string-34.11.$noComp {string charstart, unicode} -body { run {string charstart "one tw\xC7o three" 7} -} -result 6 +} -result 7 test string-34.12.$noComp {string charstart, unicode} -body { run {string charstart "ab\uC700\uC700 cdef ghi" 12} -} -result 11 +} -result 12 test string-34.13.$noComp {string charstart, unicode} -body { run {string charstart "\uC700\uC700 abc" 8} -} -result 5 +} -result 6 test string-34.14.$noComp {string charstart, invalid UTF-8} -constraints testbytestring -body { # See Bug c61818e4c9 set demo [testbytestring "abc def\xE0\xA9ghi"] run {string index $demo [string charstart $demo 10]} -} -result g +} -result h test string-34.15.$noComp {string charstart, unicode} -body { run {string charstart "\U1D7CA\U1D7CA abc" 0} } -result 0 test string-34.16.$noComp {string charstart, unicode} -constraints utf16 -body { run {string charstart "\U1D7CA\U1D7CA abc" 10} -} -result 7 +} -result 8 }; # foreach noComp {0 1} -- cgit v0.12