diff options
-rw-r--r-- | doc/Utf.3 | 11 | ||||
-rw-r--r-- | generic/tcl.decls | 17 | ||||
-rw-r--r-- | generic/tclCompExpr.c | 2 | ||||
-rw-r--r-- | generic/tclDecls.h | 51 | ||||
-rw-r--r-- | generic/tclEncoding.c | 4 | ||||
-rw-r--r-- | generic/tclIndexObj.c | 2 | ||||
-rw-r--r-- | generic/tclInt.h | 11 | ||||
-rw-r--r-- | generic/tclParse.c | 2 | ||||
-rw-r--r-- | generic/tclStringObj.c | 6 | ||||
-rw-r--r-- | generic/tclStubInit.c | 35 | ||||
-rw-r--r-- | generic/tclTest.c | 5 | ||||
-rw-r--r-- | generic/tclUtf.c | 47 | ||||
-rw-r--r-- | generic/tclUtil.c | 6 | ||||
-rw-r--r-- | tests/utf.test | 150 |
14 files changed, 205 insertions, 144 deletions
@@ -233,10 +233,10 @@ characters. .PP \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR of \fIlength\fR bytes is long enough to be decoded by -\fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee -that the UTF-8 string is properly formed. This routine is used by -procedures that are operating on a byte at a time and need to know if a -full Unicode character has been seen. +\fBTcl_UtfToUniChar\fR/\fBTcl_UtfNext\fR, or 0 otherwise. This function +does not guarantee that the UTF-8 string is properly formed. This routine +is used by procedures that are operating on a byte at a time and need to +know if a full Unicode character has been seen. .PP \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It returns the number of Tcl_UniChars that are represented by the UTF-8 string @@ -257,7 +257,8 @@ Given \fIsrc\fR, a pointer to some location in a UTF-8 string, \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the string. The caller must not ask for the next character after the last character in the string if the string is not terminated by a null -character. +character. \fBTcl_UtfCharComplete\fR can be used in that case to +make sure enough bytes are available before calling \fBTcl_UtfNext\fR. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made diff --git a/generic/tcl.decls b/generic/tcl.decls index 5895946..b9744ba 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1163,7 +1163,7 @@ declare 325 { const char *Tcl_UtfAtIndex(const char *src, int index) } declare 326 { - int Tcl_UtfCharComplete(const char *src, int length) + int TclUtfCharComplete(const char *src, int length) } declare 327 { int Tcl_UtfBackslash(const char *src, int *readPtr, char *dst) @@ -1175,10 +1175,10 @@ declare 329 { const char *Tcl_UtfFindLast(const char *src, int ch) } declare 330 { - const char *Tcl_UtfNext(const char *src) + const char *TclUtfNext(const char *src) } declare 331 { - const char *Tcl_UtfPrev(const char *src, const char *start) + const char *TclUtfPrev(const char *src, const char *start) } declare 332 { int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, @@ -2413,6 +2413,17 @@ declare 653 { unsigned char *TclGetByteArrayFromObj(Tcl_Obj *objPtr, size_t *lengthPtr) } +# TIP #575 +declare 654 { + int Tcl_UtfCharComplete(const char *src, int length) +} +declare 655 { + const char *Tcl_UtfNext(const char *src) +} +declare 656 { + const char *Tcl_UtfPrev(const char *src, const char *start) +} + # ----- BASELINE -- FOR -- 8.7.0 ----- # ############################################################################## diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index fa15fba..03aebe3 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -2145,7 +2145,7 @@ ParseLexeme( */ if (!TclIsBareword(*start) || *start == '_') { - if (TclUCS4Complete(start, numBytes)) { + if (Tcl_UtfCharComplete(start, numBytes)) { scanned = TclUtfToUCS4(start, &ch); } else { char utfBytes[8]; diff --git a/generic/tclDecls.h b/generic/tclDecls.h index dc39657..68a7c66 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -999,7 +999,7 @@ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ EXTERN const char * Tcl_UtfAtIndex(const char *src, int index); /* 326 */ -EXTERN int Tcl_UtfCharComplete(const char *src, int length); +EXTERN int TclUtfCharComplete(const char *src, int length); /* 327 */ EXTERN int Tcl_UtfBackslash(const char *src, int *readPtr, char *dst); @@ -1008,9 +1008,9 @@ EXTERN const char * Tcl_UtfFindFirst(const char *src, int ch); /* 329 */ EXTERN const char * Tcl_UtfFindLast(const char *src, int ch); /* 330 */ -EXTERN const char * Tcl_UtfNext(const char *src); +EXTERN const char * TclUtfNext(const char *src); /* 331 */ -EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); +EXTERN const char * TclUtfPrev(const char *src, const char *start); /* 332 */ EXTERN int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, @@ -1931,6 +1931,12 @@ EXTERN Tcl_UniChar * TclGetUnicodeFromObj(Tcl_Obj *objPtr, /* 653 */ EXTERN unsigned char * TclGetByteArrayFromObj(Tcl_Obj *objPtr, size_t *lengthPtr); +/* 654 */ +EXTERN int Tcl_UtfCharComplete(const char *src, int length); +/* 655 */ +EXTERN const char * Tcl_UtfNext(const char *src); +/* 656 */ +EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2292,12 +2298,12 @@ typedef struct TclStubs { int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ const char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */ - int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */ + int (*tclUtfCharComplete) (const char *src, int length); /* 326 */ int (*tcl_UtfBackslash) (const char *src, int *readPtr, char *dst); /* 327 */ const char * (*tcl_UtfFindFirst) (const char *src, int ch); /* 328 */ const char * (*tcl_UtfFindLast) (const char *src, int ch); /* 329 */ - const char * (*tcl_UtfNext) (const char *src); /* 330 */ - const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 331 */ + const char * (*tclUtfNext) (const char *src); /* 330 */ + const char * (*tclUtfPrev) (const char *src, const char *start); /* 331 */ int (*tcl_UtfToExternal) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); /* 332 */ char * (*tcl_UtfToExternalDString) (Tcl_Encoding encoding, const char *src, int srcLen, Tcl_DString *dsPtr); /* 333 */ int (*tcl_UtfToLower) (char *src); /* 334 */ @@ -2620,6 +2626,9 @@ typedef struct TclStubs { char * (*tclGetStringFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 651 */ Tcl_UniChar * (*tclGetUnicodeFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 652 */ unsigned char * (*tclGetByteArrayFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 653 */ + int (*tcl_UtfCharComplete) (const char *src, int length); /* 654 */ + const char * (*tcl_UtfNext) (const char *src); /* 655 */ + const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 656 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -3302,18 +3311,18 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UniCharToUtf) /* 324 */ #define Tcl_UtfAtIndex \ (tclStubsPtr->tcl_UtfAtIndex) /* 325 */ -#define Tcl_UtfCharComplete \ - (tclStubsPtr->tcl_UtfCharComplete) /* 326 */ +#define TclUtfCharComplete \ + (tclStubsPtr->tclUtfCharComplete) /* 326 */ #define Tcl_UtfBackslash \ (tclStubsPtr->tcl_UtfBackslash) /* 327 */ #define Tcl_UtfFindFirst \ (tclStubsPtr->tcl_UtfFindFirst) /* 328 */ #define Tcl_UtfFindLast \ (tclStubsPtr->tcl_UtfFindLast) /* 329 */ -#define Tcl_UtfNext \ - (tclStubsPtr->tcl_UtfNext) /* 330 */ -#define Tcl_UtfPrev \ - (tclStubsPtr->tcl_UtfPrev) /* 331 */ +#define TclUtfNext \ + (tclStubsPtr->tclUtfNext) /* 330 */ +#define TclUtfPrev \ + (tclStubsPtr->tclUtfPrev) /* 331 */ #define Tcl_UtfToExternal \ (tclStubsPtr->tcl_UtfToExternal) /* 332 */ #define Tcl_UtfToExternalDString \ @@ -3956,6 +3965,12 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tclGetUnicodeFromObj) /* 652 */ #define TclGetByteArrayFromObj \ (tclStubsPtr->tclGetByteArrayFromObj) /* 653 */ +#define Tcl_UtfCharComplete \ + (tclStubsPtr->tcl_UtfCharComplete) /* 654 */ +#define Tcl_UtfNext \ + (tclStubsPtr->tcl_UtfNext) /* 655 */ +#define Tcl_UtfPrev \ + (tclStubsPtr->tcl_UtfPrev) /* 656 */ #endif /* defined(USE_TCL_STUBS) */ @@ -4230,10 +4245,16 @@ extern const TclStubs *tclStubsPtr; #define Tcl_Close(interp, chan) Tcl_CloseEx(interp, chan, 0) #endif -#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX > 3) +#undef TclUtfCharComplete +#undef TclUtfNext +#undef TclUtfPrev +#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX < 4) && !defined(TCL_NO_DEPRECATED) # undef Tcl_UtfCharComplete -# define Tcl_UtfCharComplete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 4) : tclStubsPtr->tcl_UtfCharComplete((src), (length))) +# undef Tcl_UtfNext +# undef Tcl_UtfPrev +# define Tcl_UtfCharComplete (tclStubsPtr->tclUtfCharComplete) +# define Tcl_UtfNext (tclStubsPtr->tclUtfNext) +# define Tcl_UtfPrev (tclStubsPtr->tclUtfPrev) #endif #define Tcl_CreateSlave Tcl_CreateChild #define Tcl_GetSlave Tcl_GetChild diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index c4ef159..ea2d6fa 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2323,7 +2323,7 @@ UtfToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the * last UTF-8 character in the source buffer hasn't been cut off. @@ -2353,7 +2353,7 @@ UtfToUtfProc( *dst++ = 0; *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!TclUCS4Complete(src, srcEnd - src)) { + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an diff --git a/generic/tclIndexObj.c b/generic/tclIndexObj.c index f08278b..30d6cbd 100644 --- a/generic/tclIndexObj.c +++ b/generic/tclIndexObj.c @@ -785,7 +785,7 @@ PrefixLongestObjCmd( * Adjust in case we stopped in the middle of a UTF char. */ - resultLength = TclUtfPrev(&resultString[i+1], + resultLength = Tcl_UtfPrev(&resultString[i+1], resultString) - resultString; break; } diff --git a/generic/tclInt.h b/generic/tclInt.h index 66bbc17..96315fb 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3253,16 +3253,10 @@ MODULE_SCOPE int TclUtfCount(int ch); # define TclUtfToUCS4 Tcl_UtfToUniChar # define TclUniCharToUCS4(src, ptr) (*ptr = *(src),1) # define TclUCS4Prev(src, ptr) (((src) > (ptr)) ? ((src) - 1) : (src)) -# define TclUCS4Complete Tcl_UtfCharComplete -# define TclChar16Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 3) : Tcl_UtfCharComplete((src), (length))) #else MODULE_SCOPE int TclUtfToUCS4(const char *, int *); MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *, int *); MODULE_SCOPE const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *, const Tcl_UniChar *); -# define TclUCS4Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) -# define TclChar16Complete Tcl_UtfCharComplete #endif MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); @@ -4696,11 +4690,6 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; (numChars) = _count; \ } while (0); -#define TclUtfPrev(src, start) \ - (((src) < (start) + 2) ? (start) : \ - ((unsigned char) *((src) - 1)) < 0x80 ? (src) - 1 : \ - Tcl_UtfPrev(src, start)) - /* *---------------------------------------------------------------- * Macro that encapsulates the logic that determines when it is safe to diff --git a/generic/tclParse.c b/generic/tclParse.c index df6c9bf..4de0356 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -935,7 +935,7 @@ TclParseBackslash( * #217987] test subst-3.2 */ - if (TclUCS4Complete(p, numBytes - 1)) { + if (Tcl_UtfCharComplete(p, numBytes - 1)) { count = TclUtfToUCS4(p, &unichar) + 1; /* +1 for '\' */ } else { char utfBytes[8]; diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 50b473a..bb718ce 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1199,10 +1199,10 @@ Tcl_AppendLimitedToObj( } eLen = strlen(ellipsis); while (eLen > limit) { - eLen = TclUtfPrev(ellipsis+eLen, ellipsis) - ellipsis; + eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis; } - toCopy = TclUtfPrev(bytes+limit+1-eLen, bytes) - bytes; + toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes; } /* @@ -2644,7 +2644,7 @@ AppendPrintfToObjVA( * multi-byte characters. */ - q = TclUtfPrev(end, bytes); + q = Tcl_UtfPrev(end, bytes); if (!Tcl_UtfCharComplete(q, (int)(end - q))) { end = q; } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index dd7dc26..7321805 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -90,6 +90,32 @@ static void uniCodePanic(void) { # define Tcl_UniCharNcmp (int(*)(const Tcl_UniChar *, const Tcl_UniChar *, unsigned long))(void *)uniCodePanic #endif +#define TclUtfCharComplete UtfCharComplete +#define TclUtfNext UtfNext +#define TclUtfPrev UtfPrev + +static int TclUtfCharComplete(const char *src, int length) { + if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { + return length < 3; + } + return Tcl_UtfCharComplete(src, length); +} + +static const char *TclUtfNext(const char *src) { + if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { + return src + 1; + } + return Tcl_UtfNext(src); +} + +static const char *TclUtfPrev(const char *src, const char *start) { + if ((src >= start + 3) && ((src[-1] & 0xC0) == 0x80) + && ((src[-2] & 0xC0) == 0x80) && ((src[-3] & 0xC0) == 0x80)) { + return src - 3; + } + return Tcl_UtfPrev(src, start); +} + #define TclBN_mp_add mp_add #define TclBN_mp_and mp_and #define TclBN_mp_clamp mp_clamp @@ -1575,12 +1601,12 @@ const TclStubs tclStubs = { Tcl_UniCharToUpper, /* 323 */ Tcl_UniCharToUtf, /* 324 */ Tcl_UtfAtIndex, /* 325 */ - Tcl_UtfCharComplete, /* 326 */ + TclUtfCharComplete, /* 326 */ Tcl_UtfBackslash, /* 327 */ Tcl_UtfFindFirst, /* 328 */ Tcl_UtfFindLast, /* 329 */ - Tcl_UtfNext, /* 330 */ - Tcl_UtfPrev, /* 331 */ + TclUtfNext, /* 330 */ + TclUtfPrev, /* 331 */ Tcl_UtfToExternal, /* 332 */ Tcl_UtfToExternalDString, /* 333 */ Tcl_UtfToLower, /* 334 */ @@ -1903,6 +1929,9 @@ const TclStubs tclStubs = { TclGetStringFromObj, /* 651 */ TclGetUnicodeFromObj, /* 652 */ TclGetByteArrayFromObj, /* 653 */ + Tcl_UtfCharComplete, /* 654 */ + Tcl_UtfNext, /* 655 */ + Tcl_UtfPrev, /* 656 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclTest.c b/generic/tclTest.c index 45b5ca3..1c35ab2 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -19,6 +19,9 @@ #ifndef USE_TCL_STUBS # define USE_TCL_STUBS #endif +#ifndef TCL_NO_DEPRECATED +# define TCL_NO_DEPRECATED +#endif #include "tclInt.h" #ifdef TCL_WITH_EXTERNAL_TOMMATH # include "tommath.h" @@ -6962,7 +6965,7 @@ TestUtfPrevCmd( } else { offset = numBytes; } - result = TclUtfPrev(bytes + offset, bytes); + result = Tcl_UtfPrev(bytes + offset, bytes); Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); return TCL_OK; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 5db7343..017688b 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,20 +64,12 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -/* End of "continuation byte section" */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 1,1,1,1,1, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; - + static const unsigned char complete[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -88,15 +80,9 @@ static const unsigned char complete[256] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* End of "continuation byte section" */ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 3,3,3,3,3, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; - + /* * Functions used only in this module. */ @@ -694,7 +680,7 @@ Tcl_UtfToUniCharDString( p += TclUtfToUCS4(p, &ch); *w++ = ch; } - while ((p < endPtr) && TclUCS4Complete(p, endPtr-p)) { + while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { p += TclUtfToUCS4(p, &ch); *w++ = ch; } @@ -752,7 +738,7 @@ Tcl_UtfToChar16DString( *w++ = ch; } while (p < endPtr) { - if (TclChar16Complete(p, endPtr-p)) { + if (Tcl_UtfCharComplete(p, endPtr-p)) { p += Tcl_UtfToChar16(p, &ch); *w++ = ch; } else { @@ -833,7 +819,7 @@ Tcl_NumUtfChars( /* Pointer to the end of string. Never read endPtr[0] */ const char *endPtr = src + length; /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - TCL_UTF_MAX; + const char *optPtr = endPtr - 4; /* * Optimize away the call in this loop. Justified because... @@ -1064,7 +1050,7 @@ Tcl_UtfPrev( * it (the fallback) is correct. */ - || (trailBytesSeen >= complete[byte])) { + || (trailBytesSeen >= totalBytes[byte])) { /* * That is, (1 + trailBytesSeen > needed). * We've examined more bytes than needed to complete @@ -1105,19 +1091,14 @@ Tcl_UtfPrev( /* Continue the search backwards... */ look--; - } while (trailBytesSeen < TCL_UTF_MAX); + } while (trailBytesSeen < 4); /* - * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a + * We've seen 4 trail bytes, so we know there will not be a * properly formed byte sequence to find, and we can stop looking, - * accepting the fallback (for TCL_UTF_MAX > 3) or just go back as - * far as we can. + * accepting the fallback. */ -#if TCL_UTF_MAX > 3 return fallback; -#else - return src - TCL_UTF_MAX; -#endif } /* @@ -1744,7 +1725,7 @@ Tcl_UniCharToLower( /* Clear away extension bits, if any */ return ch & 0x1FFFFF; } - + /* *---------------------------------------------------------------------- * diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 1904e2f..d5ec040 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1707,11 +1707,7 @@ TclTrimRight( const char *q = trim; int pInc = 0, bytesLeft = numTrim; - pp = TclUtfPrev(p, bytes); -#if TCL_UTF_MAX < 4 /* Needed because TclUtfPrev() cannot always jump back */ - /* sufficiently. See [d43f96c1a8] */ - pp = TclUtfPrev(pp, bytes); -#endif + pp = Tcl_UtfPrev(p, bytes); do { pp += pInc; pInc = TclUtfToUCS4(pp, &ch1); diff --git a/tests/utf.test b/tests/utf.test index b6c23ba..d4db773 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -216,7 +216,10 @@ test utf-6.9 {Tcl_UtfNext} {testutfnext testbytestring} { test utf-6.10 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xA0]G } 1 -test utf-6.11 {Tcl_UtfNext} {testutfnext testbytestring} { +test utf-6.11.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { + testutfnext [testbytestring \xA0\xA0\x00] +} 1 +test utf-6.11.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xA0\xA0\x00] } 2 test utf-6.12 {Tcl_UtfNext} {testutfnext testbytestring} { @@ -273,19 +276,19 @@ test utf-6.28 {Tcl_UtfNext} {testutfnext testbytestring} { test utf-6.29 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xE8\xF8] } 1 -test utf-6.30.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.30.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2] } 1 -test utf-6.30.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.30.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\x00] } 1 test utf-6.31 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xF2]G } 1 -test utf-6.32.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.32.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0] } 1 -test utf-6.32.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.32.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\x00] } 1 test utf-6.33 {Tcl_UtfNext} {testutfnext testbytestring} { @@ -396,10 +399,10 @@ test utf-6.67 {Tcl_UtfNext} {testutfnext testbytestring} { test utf-6.68 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xF2\xA0\xA0]G } 1 -test utf-6.69.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.69.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0] } 1 -test utf-6.69.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.69.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0] } 4 test utf-6.70 {Tcl_UtfNext} {testutfnext testbytestring} { @@ -414,40 +417,40 @@ test utf-6.72 {Tcl_UtfNext} {testutfnext testbytestring} { test utf-6.73 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xF2\xA0\xA0\xF8] } 1 -test utf-6.74.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.74.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0]G } 1 -test utf-6.74.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.74.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0]G } 4 -test utf-6.75.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.75.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] } 1 -test utf-6.75.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.75.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] } 4 -test utf-6.76.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.76.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xD0] } 1 -test utf-6.76.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.76.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xD0] } 4 -test utf-6.77.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.77.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xE8] } 1 -test utf-6.77.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.77.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xE8] } 4 -test utf-6.78.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.78.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xF2] } 1 -test utf-6.78.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.78.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xF2] } 4 -test utf-6.79.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.79.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0G\xF8] } 1 -test utf-6.79.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.79.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0G\xF8] } 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { @@ -471,37 +474,55 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} { test utf-6.86 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} { testutfnext [testbytestring \xF0\x80\x80\x80] } 1 -test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring ucs2_utf16} { +test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF0\x90\x80\x80] } 1 -test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring ucs4} { +test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF0\x90\x80\x80] } 4 -test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring} { +test utf-6.88.0 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring ucs2} { + testutfnext [testbytestring \xA0\xA0\x00] +} 1 +test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xA0\xA0\x00] } 2 -test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring} { +test utf-6.89.0 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring ucs2} { + testutfnext [testbytestring \x80\x80\x00] +} 2 +test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \x80\x80\x00] } 2 -test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring ucs2_utf16} { +test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF4\x8F\xBF\xBF] } 1 -test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring ucs4} { +test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF4\x8F\xBF\xBF] } 4 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring} { testutfnext [testbytestring \xF4\x90\x80\x80] } 1 -test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring} { +test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs2} { + testutfnext [testbytestring \xA0\xA0\xA0] +} 1 +test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xA0\xA0\xA0] } 3 -test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring} { +test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs2} { + testutfnext [testbytestring \x80\x80\x80] +} 1 +test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \x80\x80\x80] } 3 -test utf-6.94 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring} { +test utf-6.94.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} { + testutfnext [testbytestring \xA0\xA0\xA0\xA0] +} 1 +test utf-6.94.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xA0\xA0\xA0\xA0] } 3 -test utf-6.95 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring} { +test utf-6.95.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} { + testutfnext [testbytestring \x80\x80\x80\x80] +} 1 +test utf-6.95.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \x80\x80\x80\x80] } 3 @@ -661,30 +682,33 @@ test utf-7.17.1 {Tcl_UtfPrev} {testutfprev testbytestring} { test utf-7.17.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xD0\xA0\xA0\xF8] 4 } 3 -test utf-7.18.0 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +test utf-7.18.0 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xA0\xA0\xA0] -} 1 -test utf-7.18.1 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 3 +test utf-7.18.1 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xA0\xA0\xA0\xA0] 4 -} 1 -test utf-7.18.2 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 3 +test utf-7.18.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xA0\xA0\xA0\xF8] 4 -} 1 -test utf-7.19 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 3 +test utf-7.19 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xF8\xA0\xA0\xA0] -} 2 -test utf-7.20 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 4 +test utf-7.20.0 {Tcl_UtfPrev} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF2\xA0\xA0\xA0] -} 2 -test utf-7.21 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 4 +test utf-7.20.1 {Tcl_UtfPrev} {testutfprev testbytestring fullutf} { + testutfprev A[testbytestring \xF2\xA0\xA0\xA0] +} 1 +test utf-7.21 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A\u8820[testbytestring \xA0] -} 2 -test utf-7.22 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 4 +test utf-7.22 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xD0\xA0\xA0\xA0] -} 2 -test utf-7.23 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 4 +test utf-7.23 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xA0\xA0\xA0\xA0] -} 2 +} 4 test utf-7.24 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xC0\x81] } 2 @@ -706,9 +730,9 @@ test utf-7.28 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { test utf-7.28.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xE0\x80\x80] 2 } 1 -test utf-7.29 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring utf16} { +test utf-7.29 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xF0\x80\x80\x80] -} 2 +} 4 test utf-7.30 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xF0\x80\x80\x80] 4 } 3 @@ -736,9 +760,12 @@ test utf-7.37 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { test utf-7.38 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xE0\xA0\x80] 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring utf16} { +test utf-7.39.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF0\x90\x80\x80] -} 2 +} 4 +test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring fullutf} { + testutfprev A[testbytestring \xF0\x90\x80\x80] +} 1 test utf-7.40.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF0\x90\x80\x80] 4 } 3 @@ -763,9 +790,9 @@ test utf-7.44 {Tcl_UtfPrev -- no lead byte at start} {testutfprev testbytestrin test utf-7.45 {Tcl_UtfPrev -- no lead byte at start} {testutfprev testbytestring} { testutfprev [testbytestring \xA0\xA0\xA0] } 2 -test utf-7.46 {Tcl_UtfPrev -- no lead byte at start} {testutfprev testbytestring utf16} { +test utf-7.46 {Tcl_UtfPrev -- no lead byte at start} {testutfprev testbytestring} { testutfprev [testbytestring \xA0\xA0\xA0\xA0] -} 1 +} 3 test utf-7.47 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {testutfprev testbytestring} { testutfprev [testbytestring \xE8\xA0] } 0 @@ -775,27 +802,30 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} tes test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev testbytestring} { testutfprev [testbytestring \xE8\xA0\x00] 2 } 0 -test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring utf16} { +test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] -} 2 -test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring ucs2} { +} 4 +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring fullutf} { + testutfprev A[testbytestring \xF4\x8F\xBF\xBF] +} 1 +test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] 4 } 3 -test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring fullutf} { +test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring fullutf} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] 4 } 1 -test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring ucs2} { +test utf-7.48.4 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] 3 } 2 -test utf-7.48.4 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring fullutf} { +test utf-7.48.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring fullutf} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] 3 } 1 -test utf-7.48.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring} { +test utf-7.48.6 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] 2 } 1 -test utf-7.49.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring utf16} { +test utf-7.49.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring} { testutfprev A[testbytestring \xF4\x90\x80\x80] -} 2 +} 4 test utf-7.49.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring} { testutfprev A[testbytestring \xF4\x90\x80\x80] 4 } 3 |