diff options
Diffstat (limited to 'generic')
| -rw-r--r-- | generic/regc_locale.c | 2 | ||||
| -rw-r--r-- | generic/tcl.decls | 20 | ||||
| -rw-r--r-- | generic/tclCmdIL.c | 4 | ||||
| -rw-r--r-- | generic/tclCmdMZ.c | 173 | ||||
| -rw-r--r-- | generic/tclCompExpr.c | 8 | ||||
| -rw-r--r-- | generic/tclDecls.h | 56 | ||||
| -rw-r--r-- | generic/tclEncoding.c | 4 | ||||
| -rw-r--r-- | generic/tclExecute.c | 2 | ||||
| -rw-r--r-- | generic/tclIndexObj.c | 2 | ||||
| -rw-r--r-- | generic/tclInt.h | 17 | ||||
| -rw-r--r-- | generic/tclParse.c | 8 | ||||
| -rw-r--r-- | generic/tclStringObj.c | 6 | ||||
| -rw-r--r-- | generic/tclStubInit.c | 36 | ||||
| -rw-r--r-- | generic/tclTest.c | 5 | ||||
| -rw-r--r-- | generic/tclUtf.c | 128 | ||||
| -rw-r--r-- | generic/tclUtil.c | 14 |
16 files changed, 345 insertions, 140 deletions
diff --git a/generic/regc_locale.c b/generic/regc_locale.c index c90dd64..cc4681b 100644 --- a/generic/regc_locale.c +++ b/generic/regc_locale.c @@ -1269,7 +1269,7 @@ casecmp( size_t len) /* exact length of comparison */ { for (; len > 0; len--, x++, y++) { - if ((*x!=*y) && (Tcl_UniCharToLower(*x) != Tcl_UniCharToLower(*y))) { + if ((*x!=*y) && (Tcl_UniCharFold(*x) != Tcl_UniCharFold(*y))) { return 1; } } diff --git a/generic/tcl.decls b/generic/tcl.decls index e49ed66..3785558 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1163,7 +1163,7 @@ declare 325 { const char *Tcl_UtfAtIndex(const char *src, int index) } declare 326 { - int Tcl_UtfCharComplete(const char *src, int length) + int TclUtfCharComplete(const char *src, int length) } declare 327 { int Tcl_UtfBackslash(const char *src, int *readPtr, char *dst) @@ -1175,10 +1175,10 @@ declare 329 { const char *Tcl_UtfFindLast(const char *src, int ch) } declare 330 { - const char *Tcl_UtfNext(const char *src) + const char *TclUtfNext(const char *src) } declare 331 { - const char *Tcl_UtfPrev(const char *src, const char *start) + const char *TclUtfPrev(const char *src, const char *start) } declare 332 { int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, @@ -2402,6 +2402,20 @@ declare 648 { int length, Tcl_DString *dsPtr) } +# TIP #575 +declare 649 { + int Tcl_UtfCharComplete(const char *src, int length) +} +declare 650 { + const char *Tcl_UtfNext(const char *src) +} +declare 651 { + const char *Tcl_UtfPrev(const char *src, const char *start) +} +declare 652 { + int Tcl_UniCharFold(int ch) +} + # ----- BASELINE -- FOR -- 8.7.0 ----- # ############################################################################## diff --git a/generic/tclCmdIL.c b/generic/tclCmdIL.c index da8dc65..cf900a3 100644 --- a/generic/tclCmdIL.c +++ b/generic/tclCmdIL.c @@ -4782,8 +4782,8 @@ DictionaryCompare( * other interesting punctuations occur). */ - uniLeftLower = Tcl_UniCharToLower(uniLeft); - uniRightLower = Tcl_UniCharToLower(uniRight); + uniLeftLower = Tcl_UniCharFold(uniLeft); + uniRightLower = Tcl_UniCharFold(uniRight); } else { diff = UCHAR(*left) - UCHAR(*right); break; diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index c47490a..cf4240a 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -629,10 +629,10 @@ Tcl_RegsubObjCmd( wlen = 0; } } else { - wsrclc = Tcl_UniCharToLower(*wsrc); + wsrclc = Tcl_UniCharFold(*wsrc); for (p = wfirstChar = wstring; wstring < wend; wstring++) { if ((*wstring == *wsrc || - (nocase && Tcl_UniCharToLower(*wstring)==wsrclc)) && + (nocase && Tcl_UniCharFold(*wstring)==wsrclc)) && (slen==1 || (strCmpFn(wstring, wsrc, (unsigned long) slen) == 0))) { if (numMatches == 0) { @@ -2096,10 +2096,10 @@ StringMapCmd( ustring1 = end; } else { mapString = Tcl_GetUnicodeFromObj(mapElemv[1], &mapLen); - u2lc = (nocase ? Tcl_UniCharToLower(*ustring2) : 0); + u2lc = (nocase ? Tcl_UniCharFold(*ustring2) : 0); for (; ustring1 < end; ustring1++) { if (((*ustring1 == *ustring2) || - (nocase&&Tcl_UniCharToLower(*ustring1)==u2lc)) && + (nocase&&Tcl_UniCharFold(*ustring1)==u2lc)) && (length2==1 || strCmpFn(ustring1, ustring2, (unsigned long) length2) == 0)) { if (p != ustring1) { @@ -2134,7 +2134,7 @@ StringMapCmd( mapStrings[index] = Tcl_GetUnicodeFromObj(mapElemv[index], mapLens+index); if (nocase && ((index % 2) == 0)) { - u2lc[index/2] = Tcl_UniCharToLower(*mapStrings[index]); + u2lc[index/2] = Tcl_UniCharFold(*mapStrings[index]); } } for (p = ustring1; ustring1 < end; ustring1++) { @@ -2146,7 +2146,7 @@ StringMapCmd( ustring2 = mapStrings[index]; length2 = mapLens[index]; if ((length2 > 0) && ((*ustring1 == *ustring2) || (nocase && - (Tcl_UniCharToLower(*ustring1) == u2lc[index/2]))) && + (Tcl_UniCharFold(*ustring1) == u2lc[index/2]))) && /* Restrict max compare length. */ (end-ustring1 >= length2) && ((length2 == 1) || !strCmpFn(ustring2, ustring1, length2))) { @@ -2500,8 +2500,8 @@ StringStartCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *string; + int cur, index, length; Tcl_Obj *obj; if (objc != 3) { @@ -2509,32 +2509,30 @@ StringStartCmd( return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - if (index >= numChars) { - index = numChars - 1; + if (index >= length) { + index = length - 1; } cur = 0; if (index > 0) { - p = Tcl_UtfAtIndex(string, index); + p = &string[index]; - TclUtfToUCS4(p, &ch); + (void)TclUniCharToUCS4(p, &ch); for (cur = index; cur >= 0; cur--) { int delta = 0; - const char *next; + const Tcl_UniChar *next; if (!Tcl_UniCharIsWordChar(ch)) { break; } - next = TclUtfPrev(p, string); + next = TclUCS4Prev(p, string); do { next += delta; - delta = TclUtfToUCS4(next, &ch); + delta = TclUniCharToUCS4(next, &ch); } while (next + delta < p); p = next; } @@ -2550,6 +2548,64 @@ StringStartCmd( /* *---------------------------------------------------------------------- * + * StringCharStartCmd -- + * + * This procedure is invoked to process the "string charstart" Tcl + * command. See the user documentation for details on what it does. + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + +static int +StringCharStartCmd( + TCL_UNUSED(ClientData), + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* Argument objects. */ +{ +#if TCL_UTF_MAX <= 3 + const Tcl_UniChar *src; +#else + const char *src; +#endif + int index, length; + + if (objc != 3) { + Tcl_WrongNumArgs(interp, 1, objv, "string index"); + return TCL_ERROR; + } + +#if TCL_UTF_MAX <= 3 + src = Tcl_GetUnicodeFromObj(objv[1], &length); +#else + src = Tcl_GetStringFromObj(objv[1], &length); + length = Tcl_NumUtfChars(src, length); +#endif + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { + return TCL_ERROR; + } + if (index >= length) { + index = length; + } else if (index < 0) { + index = -1; +#if TCL_UTF_MAX <= 3 + } else if ((index > 0) && ((src[index-1] & 0xFC00) == 0xD800) && ((src[index] & 0xFC00) == 0xDC00)) { + index--; +#endif + } + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); + return TCL_OK; +} + +/* + *---------------------------------------------------------------------- + * * StringEndCmd -- * * This procedure is invoked to process the "string wordend" Tcl command. @@ -2572,8 +2628,8 @@ StringEndCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int ch; - const char *p, *end, *string; - int cur, index, length, numChars; + const Tcl_UniChar *p, *end, *string; + int cur, index, length; Tcl_Obj *obj; if (objc != 3) { @@ -2581,20 +2637,18 @@ StringEndCmd( return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); - numChars = Tcl_NumUtfChars(string, length); - if (TclGetIntForIndexM(interp, objv[2], numChars-1, &index) != TCL_OK) { + string = Tcl_GetUnicodeFromObj(objv[1], &length); + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } - string = TclGetStringFromObj(objv[1], &length); if (index < 0) { index = 0; } - if (index < numChars) { - p = Tcl_UtfAtIndex(string, index); + if (index < length) { + p = &string[index]; end = string+length; for (cur = index; p < end; cur++) { - p += TclUtfToUCS4(p, &ch); + p += TclUniCharToUCS4(p, &ch); if (!Tcl_UniCharIsWordChar(ch)) { break; } @@ -2603,7 +2657,7 @@ StringEndCmd( cur++; } } else { - cur = numChars; + cur = length; } TclNewIntObj(obj, cur); Tcl_SetObjResult(interp, obj); @@ -2613,6 +2667,65 @@ StringEndCmd( /* *---------------------------------------------------------------------- * + * StringCharEndCmd -- + * + * This procedure is invoked to process the "string charend" Tcl command. + * See the user documentation for details on what it does. + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + +static int +StringCharEndCmd( + TCL_UNUSED(ClientData), + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* Argument objects. */ +{ +#if TCL_UTF_MAX <= 3 + const Tcl_UniChar *src; +#else + const char *src; +#endif + int index, length; + + if (objc != 3) { + Tcl_WrongNumArgs(interp, 1, objv, "string index"); + return TCL_ERROR; + } + +#if TCL_UTF_MAX <= 3 + src = Tcl_GetUnicodeFromObj(objv[1], &length); +#else + src = Tcl_GetStringFromObj(objv[1], &length); + length = Tcl_NumUtfChars(src, length); +#endif + if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { + return TCL_ERROR; + } + if (++index < 0) { + index = 0; + } + if (index >= length) { + index = length; +#if TCL_UTF_MAX <= 3 + } else if ((index > 0) && ((src[index-1] & 0xFC00) == 0xD800) && ((src[index] & 0xFC00) == 0xDC00)) { + index++; +#endif + } + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(index)); + return TCL_OK; +} + +/* + *---------------------------------------------------------------------- + * * StringEqualCmd -- * * This procedure is invoked to process the "string equal" Tcl command. @@ -3312,6 +3425,8 @@ TclInitStringCmd( static const EnsembleImplMap stringImplMap[] = { {"bytelength", StringBytesCmd, TclCompileBasic1ArgCmd, NULL, NULL, 0}, {"cat", StringCatCmd, TclCompileStringCatCmd, NULL, NULL, 0}, + {"charend", StringCharEndCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, + {"charstart", StringCharStartCmd, TclCompileBasic2ArgCmd, NULL, NULL, 0}, {"compare", StringCmpCmd, TclCompileStringCmpCmd, NULL, NULL, 0}, {"equal", StringEqualCmd, TclCompileStringEqualCmd, NULL, NULL, 0}, {"first", StringFirstCmd, TclCompileStringFirstCmd, NULL, NULL, 0}, diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index aabd764..f35038f 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1895,7 +1895,7 @@ ParseLexeme( { const char *end; int scanned; - Tcl_UniChar ch = 0; + int ch; Tcl_Obj *literal = NULL; unsigned char byte; @@ -2103,13 +2103,13 @@ ParseLexeme( if (!TclIsBareword(*start) || *start == '_') { if (Tcl_UtfCharComplete(start, numBytes)) { - scanned = TclUtfToUniChar(start, &ch); + scanned = TclUtfToUCS4(start, &ch); } else { - char utfBytes[4]; + char utfBytes[8]; memcpy(utfBytes, start, numBytes); utfBytes[numBytes] = '\0'; - scanned = TclUtfToUniChar(utfBytes, &ch); + scanned = TclUtfToUCS4(utfBytes, &ch); } *lexemePtr = INVALID; Tcl_DecrRefCount(literal); diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 9ea6838..31bb2d4 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -999,7 +999,7 @@ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ EXTERN const char * Tcl_UtfAtIndex(const char *src, int index); /* 326 */ -EXTERN int Tcl_UtfCharComplete(const char *src, int length); +EXTERN int TclUtfCharComplete(const char *src, int length); /* 327 */ EXTERN int Tcl_UtfBackslash(const char *src, int *readPtr, char *dst); @@ -1008,9 +1008,9 @@ EXTERN const char * Tcl_UtfFindFirst(const char *src, int ch); /* 329 */ EXTERN const char * Tcl_UtfFindLast(const char *src, int ch); /* 330 */ -EXTERN const char * Tcl_UtfNext(const char *src); +EXTERN const char * TclUtfNext(const char *src); /* 331 */ -EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); +EXTERN const char * TclUtfPrev(const char *src, const char *start); /* 332 */ EXTERN int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, @@ -1920,6 +1920,14 @@ EXTERN char * Tcl_UniCharToUtfDString(const int *uniStr, /* 648 */ EXTERN int * Tcl_UtfToUniCharDString(const char *src, int length, Tcl_DString *dsPtr); +/* 649 */ +EXTERN int Tcl_UtfCharComplete(const char *src, int length); +/* 650 */ +EXTERN const char * Tcl_UtfNext(const char *src); +/* 651 */ +EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); +/* 652 */ +EXTERN int Tcl_UniCharFold(int ch); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2281,12 +2289,12 @@ typedef struct TclStubs { int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ const char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */ - int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */ + int (*tclUtfCharComplete) (const char *src, int length); /* 326 */ int (*tcl_UtfBackslash) (const char *src, int *readPtr, char *dst); /* 327 */ const char * (*tcl_UtfFindFirst) (const char *src, int ch); /* 328 */ const char * (*tcl_UtfFindLast) (const char *src, int ch); /* 329 */ - const char * (*tcl_UtfNext) (const char *src); /* 330 */ - const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 331 */ + const char * (*tclUtfNext) (const char *src); /* 330 */ + const char * (*tclUtfPrev) (const char *src, const char *start); /* 331 */ int (*tcl_UtfToExternal) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); /* 332 */ char * (*tcl_UtfToExternalDString) (Tcl_Encoding encoding, const char *src, int srcLen, Tcl_DString *dsPtr); /* 333 */ int (*tcl_UtfToLower) (char *src); /* 334 */ @@ -2604,6 +2612,10 @@ typedef struct TclStubs { int (*tcl_UtfToUniChar) (const char *src, int *chPtr); /* 646 */ char * (*tcl_UniCharToUtfDString) (const int *uniStr, int uniLength, Tcl_DString *dsPtr); /* 647 */ int * (*tcl_UtfToUniCharDString) (const char *src, int length, Tcl_DString *dsPtr); /* 648 */ + int (*tcl_UtfCharComplete) (const char *src, int length); /* 649 */ + const char * (*tcl_UtfNext) (const char *src); /* 650 */ + const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 651 */ + int (*tcl_UniCharFold) (int ch); /* 652 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -3286,18 +3298,18 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UniCharToUtf) /* 324 */ #define Tcl_UtfAtIndex \ (tclStubsPtr->tcl_UtfAtIndex) /* 325 */ -#define Tcl_UtfCharComplete \ - (tclStubsPtr->tcl_UtfCharComplete) /* 326 */ +#define TclUtfCharComplete \ + (tclStubsPtr->tclUtfCharComplete) /* 326 */ #define Tcl_UtfBackslash \ (tclStubsPtr->tcl_UtfBackslash) /* 327 */ #define Tcl_UtfFindFirst \ (tclStubsPtr->tcl_UtfFindFirst) /* 328 */ #define Tcl_UtfFindLast \ (tclStubsPtr->tcl_UtfFindLast) /* 329 */ -#define Tcl_UtfNext \ - (tclStubsPtr->tcl_UtfNext) /* 330 */ -#define Tcl_UtfPrev \ - (tclStubsPtr->tcl_UtfPrev) /* 331 */ +#define TclUtfNext \ + (tclStubsPtr->tclUtfNext) /* 330 */ +#define TclUtfPrev \ + (tclStubsPtr->tclUtfPrev) /* 331 */ #define Tcl_UtfToExternal \ (tclStubsPtr->tcl_UtfToExternal) /* 332 */ #define Tcl_UtfToExternalDString \ @@ -3932,6 +3944,14 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UniCharToUtfDString) /* 647 */ #define Tcl_UtfToUniCharDString \ (tclStubsPtr->tcl_UtfToUniCharDString) /* 648 */ +#define Tcl_UtfCharComplete \ + (tclStubsPtr->tcl_UtfCharComplete) /* 649 */ +#define Tcl_UtfNext \ + (tclStubsPtr->tcl_UtfNext) /* 650 */ +#define Tcl_UtfPrev \ + (tclStubsPtr->tcl_UtfPrev) /* 651 */ +#define Tcl_UniCharFold \ + (tclStubsPtr->tcl_UniCharFold) /* 652 */ #endif /* defined(USE_TCL_STUBS) */ @@ -4177,10 +4197,16 @@ extern const TclStubs *tclStubsPtr; #define Tcl_Close(interp, chan) Tcl_CloseEx(interp, chan, 0) #endif -#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX > 3) +#undef TclUtfCharComplete +#undef TclUtfNext +#undef TclUtfPrev +#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX < 4) && !defined(TCL_NO_DEPRECATED) # undef Tcl_UtfCharComplete -# define Tcl_UtfCharComplete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 4) : tclStubsPtr->tcl_UtfCharComplete((src), (length))) +# undef Tcl_UtfNext +# undef Tcl_UtfPrev +# define Tcl_UtfCharComplete (tclStubsPtr->tclUtfCharComplete) +# define Tcl_UtfNext (tclStubsPtr->tclUtfNext) +# define Tcl_UtfPrev (tclStubsPtr->tclUtfPrev) #endif #define Tcl_CreateSlave Tcl_CreateChild #define Tcl_GetSlave Tcl_GetChild diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 48ab3cf..9718f37 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2323,7 +2323,7 @@ UtfToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the * last UTF-8 character in the source buffer hasn't been cut off. @@ -2353,7 +2353,7 @@ UtfToUtfProc( *dst++ = 0; *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!TclUCS4Complete(src, srcEnd - src)) { + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 54c147d..d1d7037 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5323,7 +5323,7 @@ TEBCresume( } else if (TclIsPureByteArray(valuePtr)) { objResultPtr = Tcl_NewByteArrayObj( Tcl_GetByteArrayFromObj(valuePtr, NULL)+index, 1); - } else if (valuePtr->bytes && length == valuePtr->length) { + } else if (valuePtr->bytes && length == valuePtr->length && !(valuePtr->bytes[index] & 0x80)) { objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { diff --git a/generic/tclIndexObj.c b/generic/tclIndexObj.c index 6ae2075..dc1fe24 100644 --- a/generic/tclIndexObj.c +++ b/generic/tclIndexObj.c @@ -785,7 +785,7 @@ PrefixLongestObjCmd( * Adjust in case we stopped in the middle of a UTF char. */ - resultLength = TclUtfPrev(&resultString[i+1], + resultLength = Tcl_UtfPrev(&resultString[i+1], resultString) - resultString; break; } diff --git a/generic/tclInt.h b/generic/tclInt.h index 9dde88b..71459d1 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3252,15 +3252,11 @@ MODULE_SCOPE int TclUtfCount(int ch); #if TCL_UTF_MAX > 3 # define TclUtfToUCS4 Tcl_UtfToUniChar # define TclUniCharToUCS4(src, ptr) (*ptr = *(src),1) -# define TclUCS4Complete Tcl_UtfCharComplete -# define TclChar16Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 3) : Tcl_UtfCharComplete((src), (length))) +# define TclUCS4Prev(src, ptr) (((src) > (ptr)) ? ((src) - 1) : (src)) #else - MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); - MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *src, int *ucs4Ptr); -# define TclUCS4Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) -# define TclChar16Complete Tcl_UtfCharComplete + MODULE_SCOPE int TclUtfToUCS4(const char *, int *); + MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *, int *); + MODULE_SCOPE const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *, const Tcl_UniChar *); #endif MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); @@ -4700,11 +4696,6 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; (numChars) = _count; \ } while (0); -#define TclUtfPrev(src, start) \ - (((src) < (start) + 2) ? (start) : \ - ((unsigned char) *((src) - 1)) < 0x80 ? (src) - 1 : \ - Tcl_UtfPrev(src, start)) - /* *---------------------------------------------------------------- * Macro that encapsulates the logic that determines when it is safe to diff --git a/generic/tclParse.c b/generic/tclParse.c index daad31d..d56a41d 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -789,7 +789,7 @@ TclParseBackslash( * written. At most 4 bytes will be written there. */ { const char *p = src+1; - Tcl_UniChar unichar = 0; + int unichar; int result; int count; char buf[4] = ""; @@ -936,13 +936,13 @@ TclParseBackslash( */ if (Tcl_UtfCharComplete(p, numBytes - 1)) { - count = TclUtfToUniChar(p, &unichar) + 1; /* +1 for '\' */ + count = TclUtfToUCS4(p, &unichar) + 1; /* +1 for '\' */ } else { - char utfBytes[4]; + char utfBytes[8]; memcpy(utfBytes, p, numBytes - 1); utfBytes[numBytes - 1] = '\0'; - count = TclUtfToUniChar(utfBytes, &unichar) + 1; + count = TclUtfToUCS4(utfBytes, &unichar) + 1; } result = unichar; break; diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 03aceaf..522a740 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1171,10 +1171,10 @@ Tcl_AppendLimitedToObj( } eLen = strlen(ellipsis); while (eLen > limit) { - eLen = TclUtfPrev(ellipsis+eLen, ellipsis) - ellipsis; + eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis; } - toCopy = TclUtfPrev(bytes+limit+1-eLen, bytes) - bytes; + toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes; } /* @@ -2616,7 +2616,7 @@ AppendPrintfToObjVA( * multi-byte characters. */ - q = TclUtfPrev(end, bytes); + q = Tcl_UtfPrev(end, bytes); if (!Tcl_UtfCharComplete(q, (int)(end - q))) { end = q; } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 36cb9b5..903ad10 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -88,6 +88,32 @@ static void uniCodePanic(void) { # define Tcl_UniCharNcmp (int(*)(const Tcl_UniChar *, const Tcl_UniChar *, unsigned long))(void *)uniCodePanic #endif +#define TclUtfCharComplete UtfCharComplete +#define TclUtfNext UtfNext +#define TclUtfPrev UtfPrev + +static int TclUtfCharComplete(const char *src, int length) { + if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { + return length < 3; + } + return Tcl_UtfCharComplete(src, length); +} + +static const char *TclUtfNext(const char *src) { + if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { + return src + 1; + } + return Tcl_UtfNext(src); +} + +static const char *TclUtfPrev(const char *src, const char *start) { + if ((src >= start + 3) && ((src[-1] & 0xC0) == 0x80) + && ((src[-2] & 0xC0) == 0x80) && ((src[-3] & 0xC0) == 0x80)) { + return src - 3; + } + return Tcl_UtfPrev(src, start); +} + #define TclBN_mp_add mp_add #define TclBN_mp_and mp_and #define TclBN_mp_clamp mp_clamp @@ -1549,12 +1575,12 @@ const TclStubs tclStubs = { Tcl_UniCharToUpper, /* 323 */ Tcl_UniCharToUtf, /* 324 */ Tcl_UtfAtIndex, /* 325 */ - Tcl_UtfCharComplete, /* 326 */ + TclUtfCharComplete, /* 326 */ Tcl_UtfBackslash, /* 327 */ Tcl_UtfFindFirst, /* 328 */ Tcl_UtfFindLast, /* 329 */ - Tcl_UtfNext, /* 330 */ - Tcl_UtfPrev, /* 331 */ + TclUtfNext, /* 330 */ + TclUtfPrev, /* 331 */ Tcl_UtfToExternal, /* 332 */ Tcl_UtfToExternalDString, /* 333 */ Tcl_UtfToLower, /* 334 */ @@ -1872,6 +1898,10 @@ const TclStubs tclStubs = { Tcl_UtfToUniChar, /* 646 */ Tcl_UniCharToUtfDString, /* 647 */ Tcl_UtfToUniCharDString, /* 648 */ + Tcl_UtfCharComplete, /* 649 */ + Tcl_UtfNext, /* 650 */ + Tcl_UtfPrev, /* 651 */ + Tcl_UniCharFold, /* 652 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclTest.c b/generic/tclTest.c index a8ca463..0c14e8f 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -19,6 +19,9 @@ #ifndef USE_TCL_STUBS # define USE_TCL_STUBS #endif +#ifndef TCL_NO_DEPRECATED +# define TCL_NO_DEPRECATED +#endif #include "tclInt.h" #ifdef TCL_WITH_EXTERNAL_TOMMATH # include "tommath.h" @@ -6960,7 +6963,7 @@ TestUtfPrevCmd( } else { offset = numBytes; } - result = TclUtfPrev(bytes + offset, bytes); + result = Tcl_UtfPrev(bytes + offset, bytes); Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); return TCL_OK; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 11bde5c..807e087 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,20 +64,12 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -/* End of "continuation byte section" */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 1,1,1,1,1, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; - + static const unsigned char complete[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -88,15 +80,9 @@ static const unsigned char complete[256] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* End of "continuation byte section" */ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 3,3,3,3,3, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; - + /* * Functions used only in this module. */ @@ -694,7 +680,7 @@ Tcl_UtfToUniCharDString( p += TclUtfToUCS4(p, &ch); *w++ = ch; } - while ((p < endPtr) && TclUCS4Complete(p, endPtr-p)) { + while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { p += TclUtfToUCS4(p, &ch); *w++ = ch; } @@ -752,7 +738,7 @@ Tcl_UtfToChar16DString( *w++ = ch; } while (p < endPtr) { - if (TclChar16Complete(p, endPtr-p)) { + if (Tcl_UtfCharComplete(p, endPtr-p)) { p += Tcl_UtfToChar16(p, &ch); *w++ = ch; } else { @@ -833,7 +819,7 @@ Tcl_NumUtfChars( /* Pointer to the end of string. Never read endPtr[0] */ const char *endPtr = src + length; /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - TCL_UTF_MAX; + const char *optPtr = endPtr - 4; /* * Optimize away the call in this loop. Justified because... @@ -1064,7 +1050,7 @@ Tcl_UtfPrev( * it (the fallback) is correct. */ - || (trailBytesSeen >= complete[byte])) { + || (trailBytesSeen >= totalBytes[byte])) { /* * That is, (1 + trailBytesSeen > needed). * We've examined more bytes than needed to complete @@ -1105,19 +1091,14 @@ Tcl_UtfPrev( /* Continue the search backwards... */ look--; - } while (trailBytesSeen < TCL_UTF_MAX); + } while (trailBytesSeen < 4); /* - * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a + * We've seen 4 trail bytes, so we know there will not be a * properly formed byte sequence to find, and we can stop looking, - * accepting the fallback (for TCL_UTF_MAX > 3) or just go back as - * far as we can. + * accepting the fallback. */ -#if TCL_UTF_MAX > 3 return fallback; -#else - return src - TCL_UTF_MAX; -#endif } /* @@ -1576,8 +1557,8 @@ Tcl_UtfNcasecmp( return -ch2; } #endif - ch1 = Tcl_UniCharToLower(ch1); - ch2 = Tcl_UniCharToLower(ch2); + ch1 = Tcl_UniCharFold(ch1); + ch2 = Tcl_UniCharFold(ch2); if (ch1 != ch2) { return (ch1 - ch2); } @@ -1671,8 +1652,8 @@ TclUtfCasecmp( return -ch2; } #endif - ch1 = Tcl_UniCharToLower(ch1); - ch2 = Tcl_UniCharToLower(ch2); + ch1 = Tcl_UniCharFold(ch1); + ch2 = Tcl_UniCharFold(ch2); if (ch1 != ch2) { return ch1 - ch2; } @@ -1744,6 +1725,38 @@ Tcl_UniCharToLower( /* Clear away extension bits, if any */ return ch & 0x1FFFFF; } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharFold -- + * + * Compute the lowercase equivalent of the given Unicode character. + * + * Results: + * Returns the lowercase Unicode character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharFold( + int ch) /* Unicode character to convert. */ +{ + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); + + if ((mode & 0x02) && (mode != 0x7)) { + ch += GetDelta(info); + } + } + /* Clear away extension bits, if any */ + return ch & 0x1FFFFF; +} /* *---------------------------------------------------------------------- @@ -1885,8 +1898,8 @@ Tcl_UniCharNcasecmp( { for ( ; numChars != 0; numChars--, ucs++, uct++) { if (*ucs != *uct) { - Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); - Tcl_UniChar lct = Tcl_UniCharToLower(*uct); + Tcl_UniChar lcs = Tcl_UniCharFold(*ucs); + Tcl_UniChar lct = Tcl_UniCharFold(*uct); if (lcs != lct) { return (lcs - lct); @@ -2274,7 +2287,7 @@ Tcl_UniCharCaseMatch( return 1; } if (nocase) { - p = Tcl_UniCharToLower(p); + p = Tcl_UniCharFold(p); } while (1) { /* @@ -2326,13 +2339,13 @@ Tcl_UniCharCaseMatch( Tcl_UniChar startChar, endChar; uniPattern++; - ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); + ch1 = (nocase ? Tcl_UniCharFold(*uniStr) : *uniStr); uniStr++; while (1) { if ((*uniPattern == ']') || (*uniPattern == 0)) { return 0; } - startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + startChar = (nocase ? Tcl_UniCharFold(*uniPattern) : *uniPattern); uniPattern++; if (*uniPattern == '-') { @@ -2340,7 +2353,7 @@ Tcl_UniCharCaseMatch( if (*uniPattern == 0) { return 0; } - endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + endChar = (nocase ? Tcl_UniCharFold(*uniPattern) : *uniPattern); uniPattern++; if (((startChar <= ch1) && (ch1 <= endChar)) @@ -2382,8 +2395,8 @@ Tcl_UniCharCaseMatch( */ if (nocase) { - if (Tcl_UniCharToLower(*uniStr) != - Tcl_UniCharToLower(*uniPattern)) { + if (Tcl_UniCharFold(*uniStr) != + Tcl_UniCharFold(*uniPattern)) { return 0; } } else if (*uniStr != *uniPattern) { @@ -2466,7 +2479,7 @@ TclUniCharMatch( } p = *pattern; if (nocase) { - p = Tcl_UniCharToLower(p); + p = Tcl_UniCharFold(p); } while (1) { /* @@ -2478,7 +2491,7 @@ TclUniCharMatch( if ((p != '[') && (p != '?') && (p != '\\')) { if (nocase) { while ((string < stringEnd) && (p != *string) - && (p != Tcl_UniCharToLower(*string))) { + && (p != Tcl_UniCharFold(*string))) { string++; } } else { @@ -2519,20 +2532,20 @@ TclUniCharMatch( Tcl_UniChar ch1, startChar, endChar; pattern++; - ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); + ch1 = (nocase ? Tcl_UniCharFold(*string) : *string); string++; while (1) { if ((*pattern == ']') || (pattern == patternEnd)) { return 0; } - startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); + startChar = (nocase ? Tcl_UniCharFold(*pattern) : *pattern); pattern++; if (*pattern == '-') { pattern++; if (pattern == patternEnd) { return 0; } - endChar = (nocase ? Tcl_UniCharToLower(*pattern) + endChar = (nocase ? Tcl_UniCharFold(*pattern) : *pattern); pattern++; if (((startChar <= ch1) && (ch1 <= endChar)) @@ -2574,7 +2587,7 @@ TclUniCharMatch( */ if (nocase) { - if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { + if (Tcl_UniCharFold(*string) != Tcl_UniCharFold(*pattern)) { return 0; } } else if (*string != *pattern) { @@ -2629,12 +2642,25 @@ TclUniCharToUCS4( * by the Tcl_UniChar string. */ { if (((src[0] & 0xFC00) == 0xD800) && ((src[1] & 0xFC00) == 0xDC00)) { - *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[01] & 0x3FF)) + 0x10000; + *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000; return 2; } *ucs4Ptr = src[0]; return 1; } + +const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *src, const Tcl_UniChar *ptr) { + if (src <= ptr + 1) { + return ptr; + } + if (((src[-1] & 0xFC00) == 0xDC00) && ((src[-2] & 0xFC00) == 0xD800)) { + return src - 2; + } + return src - 1; +} + + + #endif /* diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 170a85e..0cf5d98 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1707,7 +1707,7 @@ TclTrimRight( const char *q = trim; int pInc = 0, bytesLeft = numTrim; - pp = TclUtfPrev(p, bytes); + pp = Tcl_UtfPrev(p, bytes); do { pp += pInc; pInc = TclUtfToUCS4(pp, &ch1); @@ -2208,7 +2208,7 @@ Tcl_StringCaseMatch( } else { TclUtfToUCS4(pattern, &ch2); if (nocase) { - ch2 = Tcl_UniCharToLower(ch2); + ch2 = Tcl_UniCharFold(ch2); } } @@ -2223,7 +2223,7 @@ Tcl_StringCaseMatch( if (nocase) { while (*str) { charLen = TclUtfToUCS4(str, &ch1); - if (ch2==ch1 || ch2==Tcl_UniCharToLower(ch1)) { + if (ch2==ch1 || ch2==Tcl_UniCharFold(ch1)) { break; } str += charLen; @@ -2282,7 +2282,7 @@ Tcl_StringCaseMatch( } else { str += TclUtfToUCS4(str, &ch1); if (nocase) { - ch1 = Tcl_UniCharToLower(ch1); + ch1 = Tcl_UniCharFold(ch1); } } while (1) { @@ -2296,7 +2296,7 @@ Tcl_StringCaseMatch( } else { pattern += TclUtfToUCS4(pattern, &startChar); if (nocase) { - startChar = Tcl_UniCharToLower(startChar); + startChar = Tcl_UniCharFold(startChar); } } if (*pattern == '-') { @@ -2311,7 +2311,7 @@ Tcl_StringCaseMatch( } else { pattern += TclUtfToUCS4(pattern, &endChar); if (nocase) { - endChar = Tcl_UniCharToLower(endChar); + endChar = Tcl_UniCharFold(endChar); } } if (((startChar <= ch1) && (ch1 <= endChar)) @@ -2360,7 +2360,7 @@ Tcl_StringCaseMatch( str += TclUtfToUCS4(str, &ch1); pattern += TclUtfToUCS4(pattern, &ch2); if (nocase) { - if (Tcl_UniCharToLower(ch1) != Tcl_UniCharToLower(ch2)) { + if (Tcl_UniCharFold(ch1) != Tcl_UniCharFold(ch2)) { return 0; } } else if (ch1 != ch2) { |
