diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-03-15 12:31:41 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-03-15 12:31:41 (GMT) |
commit | c646a67fd295620422863a3ab3ac74eff316aec3 (patch) | |
tree | 4bbdf6fc46ff8f43aae43f83aee84ba9441588b8 | |
parent | 3345ff1956ae6d7e9411917d2619c0f88ccefd2a (diff) | |
parent | 6e657cfb83595bc0481d833b708554a44e2142fb (diff) | |
download | tcl-c646a67fd295620422863a3ab3ac74eff316aec3.zip tcl-c646a67fd295620422863a3ab3ac74eff316aec3.tar.gz tcl-c646a67fd295620422863a3ab3ac74eff316aec3.tar.bz2 |
Merge 8.7 (this is the TIP #575 implementation for Tcl 9.0)
-rw-r--r-- | doc/Utf.3 | 15 | ||||
-rw-r--r-- | generic/tcl.decls | 16 | ||||
-rw-r--r-- | generic/tclCompExpr.c | 2 | ||||
-rw-r--r-- | generic/tclDecls.h | 56 | ||||
-rw-r--r-- | generic/tclEncoding.c | 4 | ||||
-rw-r--r-- | generic/tclIndexObj.c | 2 | ||||
-rw-r--r-- | generic/tclInt.h | 11 | ||||
-rw-r--r-- | generic/tclParse.c | 2 | ||||
-rw-r--r-- | generic/tclStringObj.c | 6 | ||||
-rw-r--r-- | generic/tclStubInit.c | 17 | ||||
-rw-r--r-- | generic/tclTest.c | 5 | ||||
-rw-r--r-- | generic/tclUtf.c | 49 | ||||
-rw-r--r-- | generic/tclUtil.c | 6 | ||||
-rw-r--r-- | tests/utf.test | 109 |
14 files changed, 141 insertions, 159 deletions
@@ -231,10 +231,10 @@ characters. .PP \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR of \fIlength\fR bytes is long enough to be decoded by -\fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee -that the UTF-8 string is properly formed. This routine is used by -procedures that are operating on a byte at a time and need to know if a -full Unicode character has been seen. +\fBTcl_UtfToUniChar\fR/\fBTcl_UtfNext\fR, or 0 otherwise. This function +does not guarantee that the UTF-8 string is properly formed. This routine +is used by procedures that are operating on a byte at a time and need to +know if a full Unicode character has been seen. .PP \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It returns the number of Tcl_UniChars that are represented by the UTF-8 string @@ -255,7 +255,8 @@ Given \fIsrc\fR, a pointer to some location in a UTF-8 string, \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the string. The caller must not ask for the next character after the last character in the string if the string is not terminated by a null -character. +character. \fBTcl_UtfCharComplete\fR can be used in that case to +make sure enough bytes are available before calling \fBTcl_UtfNext\fR. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made @@ -272,12 +273,12 @@ always a pointer to a location in the string. It always returns a pointer to a byte that begins a character when scanning for characters beginning from \fIstart\fR. When \fIsrc\fR is greater than \fIstart\fR, it always returns a pointer less than \fIsrc\fR and greater than or -equal to (\fIsrc\fR - \fBTCL_UTF_MAX\fR). The character that begins +equal to (\fIsrc\fR - 4). The character that begins at the returned pointer is the first one that either includes the byte \fIsrc[-1]\fR, or might include it if the right trail bytes are present at \fIsrc\fR and greater. \fBTcl_UtfPrev\fR never reads the byte \fIsrc[0]\fR nor the byte \fIstart[-1]\fR nor the byte -\fIsrc[-\fBTCL_UTF_MAX\fI-1]\fR. +\fIsrc[-5]\fR. .PP \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the Pascal Ord() function. It returns the Unicode character represented at the diff --git a/generic/tcl.decls b/generic/tcl.decls index 5b1c3d2..790d9fb 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1209,7 +1209,7 @@ declare 325 { const char *Tcl_UtfAtIndex(const char *src, size_t index) } declare 326 { - int Tcl_UtfCharComplete(const char *src, size_t length) + int TclUtfCharComplete(const char *src, size_t length) } declare 327 { size_t Tcl_UtfBackslash(const char *src, int *readPtr, char *dst) @@ -1221,10 +1221,10 @@ declare 329 { const char *Tcl_UtfFindLast(const char *src, int ch) } declare 330 { - const char *Tcl_UtfNext(const char *src) + const char *TclUtfNext(const char *src) } declare 331 { - const char *Tcl_UtfPrev(const char *src, const char *start) + const char *TclUtfPrev(const char *src, const char *start) } declare 332 { int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, @@ -2475,8 +2475,16 @@ declare 652 { declare 653 { unsigned char *Tcl_GetByteArrayFromObj(Tcl_Obj *objPtr, size_t *lengthPtr) } + +# TIP #575 +declare 654 { + int Tcl_UtfCharComplete(const char *src, size_t length) +} +declare 655 { + const char *Tcl_UtfNext(const char *src) +} declare 656 { - void TclUnusedStubEntry(void) + const char *Tcl_UtfPrev(const char *src, const char *start) } # ----- BASELINE -- FOR -- 8.7.0 ----- # diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 847a240..8de3e7d 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -2148,7 +2148,7 @@ ParseLexeme( if (!TclIsBareword(*start) || *start == '_') { size_t scanned; - if (TclUCS4Complete(start, numBytes)) { + if (Tcl_UtfCharComplete(start, numBytes)) { scanned = TclUtfToUCS4(start, &ch); } else { char utfBytes[8]; diff --git a/generic/tclDecls.h b/generic/tclDecls.h index a1454a5..d29d344 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -859,7 +859,7 @@ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ EXTERN const char * Tcl_UtfAtIndex(const char *src, size_t index); /* 326 */ -EXTERN int Tcl_UtfCharComplete(const char *src, size_t length); +EXTERN int TclUtfCharComplete(const char *src, size_t length); /* 327 */ EXTERN size_t Tcl_UtfBackslash(const char *src, int *readPtr, char *dst); @@ -868,9 +868,9 @@ EXTERN const char * Tcl_UtfFindFirst(const char *src, int ch); /* 329 */ EXTERN const char * Tcl_UtfFindLast(const char *src, int ch); /* 330 */ -EXTERN const char * Tcl_UtfNext(const char *src); +EXTERN const char * TclUtfNext(const char *src); /* 331 */ -EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); +EXTERN const char * TclUtfPrev(const char *src, const char *start); /* 332 */ EXTERN int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, @@ -1740,10 +1740,12 @@ EXTERN Tcl_UniChar * Tcl_GetUnicodeFromObj(Tcl_Obj *objPtr, /* 653 */ EXTERN unsigned char * Tcl_GetByteArrayFromObj(Tcl_Obj *objPtr, size_t *lengthPtr); -/* Slot 654 is reserved */ -/* Slot 655 is reserved */ +/* 654 */ +EXTERN int Tcl_UtfCharComplete(const char *src, size_t length); +/* 655 */ +EXTERN const char * Tcl_UtfNext(const char *src); /* 656 */ -EXTERN void TclUnusedStubEntry(void); +EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2081,12 +2083,12 @@ typedef struct TclStubs { int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ const char * (*tcl_UtfAtIndex) (const char *src, size_t index); /* 325 */ - int (*tcl_UtfCharComplete) (const char *src, size_t length); /* 326 */ + int (*tclUtfCharComplete) (const char *src, size_t length); /* 326 */ size_t (*tcl_UtfBackslash) (const char *src, int *readPtr, char *dst); /* 327 */ const char * (*tcl_UtfFindFirst) (const char *src, int ch); /* 328 */ const char * (*tcl_UtfFindLast) (const char *src, int ch); /* 329 */ - const char * (*tcl_UtfNext) (const char *src); /* 330 */ - const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 331 */ + const char * (*tclUtfNext) (const char *src); /* 330 */ + const char * (*tclUtfPrev) (const char *src, const char *start); /* 331 */ int (*tcl_UtfToExternal) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, size_t srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, size_t dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); /* 332 */ char * (*tcl_UtfToExternalDString) (Tcl_Encoding encoding, const char *src, size_t srcLen, Tcl_DString *dsPtr); /* 333 */ int (*tcl_UtfToLower) (char *src); /* 334 */ @@ -2409,9 +2411,9 @@ typedef struct TclStubs { char * (*tcl_GetStringFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 651 */ Tcl_UniChar * (*tcl_GetUnicodeFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 652 */ unsigned char * (*tcl_GetByteArrayFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 653 */ - void (*reserved654)(void); - void (*reserved655)(void); - void (*tclUnusedStubEntry) (void); /* 656 */ + int (*tcl_UtfCharComplete) (const char *src, size_t length); /* 654 */ + const char * (*tcl_UtfNext) (const char *src); /* 655 */ + const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 656 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -3030,18 +3032,18 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UniCharToUtf) /* 324 */ #define Tcl_UtfAtIndex \ (tclStubsPtr->tcl_UtfAtIndex) /* 325 */ -#define Tcl_UtfCharComplete \ - (tclStubsPtr->tcl_UtfCharComplete) /* 326 */ +#define TclUtfCharComplete \ + (tclStubsPtr->tclUtfCharComplete) /* 326 */ #define Tcl_UtfBackslash \ (tclStubsPtr->tcl_UtfBackslash) /* 327 */ #define Tcl_UtfFindFirst \ (tclStubsPtr->tcl_UtfFindFirst) /* 328 */ #define Tcl_UtfFindLast \ (tclStubsPtr->tcl_UtfFindLast) /* 329 */ -#define Tcl_UtfNext \ - (tclStubsPtr->tcl_UtfNext) /* 330 */ -#define Tcl_UtfPrev \ - (tclStubsPtr->tcl_UtfPrev) /* 331 */ +#define TclUtfNext \ + (tclStubsPtr->tclUtfNext) /* 330 */ +#define TclUtfPrev \ + (tclStubsPtr->tclUtfPrev) /* 331 */ #define Tcl_UtfToExternal \ (tclStubsPtr->tcl_UtfToExternal) /* 332 */ #define Tcl_UtfToExternalDString \ @@ -3668,10 +3670,12 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_GetUnicodeFromObj) /* 652 */ #define Tcl_GetByteArrayFromObj \ (tclStubsPtr->tcl_GetByteArrayFromObj) /* 653 */ -/* Slot 654 is reserved */ -/* Slot 655 is reserved */ -#define TclUnusedStubEntry \ - (tclStubsPtr->tclUnusedStubEntry) /* 656 */ +#define Tcl_UtfCharComplete \ + (tclStubsPtr->tcl_UtfCharComplete) /* 654 */ +#define Tcl_UtfNext \ + (tclStubsPtr->tcl_UtfNext) /* 655 */ +#define Tcl_UtfPrev \ + (tclStubsPtr->tcl_UtfPrev) /* 656 */ #endif /* defined(USE_TCL_STUBS) */ @@ -3922,11 +3926,9 @@ extern const TclStubs *tclStubsPtr; #define Tcl_Close(interp, chan) Tcl_CloseEx(interp, chan, 0) -#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX <= 3) -# undef Tcl_UtfCharComplete -# define Tcl_UtfCharComplete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= TCL_UTF_MAX) : tclStubsPtr->tcl_UtfCharComplete((src), (length))) -#endif +#undef TclUtfCharComplete +#undef TclUtfNext +#undef TclUtfPrev #ifndef TCL_NO_DEPRECATED # define Tcl_CreateSlave Tcl_CreateChild # define Tcl_GetSlave Tcl_GetChild diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 1a19652..2601614 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2261,7 +2261,7 @@ UtfToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the * last UTF-8 character in the source buffer hasn't been cut off. @@ -2289,7 +2289,7 @@ UtfToUtfProc( *dst++ = 0; src += 2; - } else if (!TclUCS4Complete(src, srcEnd - src)) { + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an diff --git a/generic/tclIndexObj.c b/generic/tclIndexObj.c index 02d0e31..3a9e07f 100644 --- a/generic/tclIndexObj.c +++ b/generic/tclIndexObj.c @@ -718,7 +718,7 @@ PrefixLongestObjCmd( * Adjust in case we stopped in the middle of a UTF char. */ - resultLength = TclUtfPrev(&resultString[i+1], + resultLength = Tcl_UtfPrev(&resultString[i+1], resultString) - resultString; break; } diff --git a/generic/tclInt.h b/generic/tclInt.h index 0d545bc..30e3ab7 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3184,16 +3184,10 @@ MODULE_SCOPE size_t TclUtfCount(int ch); # define TclUtfToUCS4 Tcl_UtfToUniChar # define TclUniCharToUCS4(src, ptr) (*ptr = *(src),1) # define TclUCS4Prev(src, ptr) (((src) > (ptr)) ? ((src) - 1) : (src)) -# define TclUCS4Complete Tcl_UtfCharComplete -# define TclChar16Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 3) : Tcl_UtfCharComplete((src), (length))) #else MODULE_SCOPE int TclUtfToUCS4(const char *, int *); MODULE_SCOPE int TclUniCharToUCS4(const Tcl_UniChar *, int *); MODULE_SCOPE const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *, const Tcl_UniChar *); -# define TclUCS4Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) -# define TclChar16Complete Tcl_UtfCharComplete #endif MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(void *clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); @@ -4629,11 +4623,6 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; (numChars) = _count; \ } while (0); -#define TclUtfPrev(src, start) \ - (((src) < (start) + 2) ? (start) : \ - ((unsigned char) *((src) - 1)) < 0x80 ? (src) - 1 : \ - Tcl_UtfPrev(src, start)) - /* *---------------------------------------------------------------- * Macro that encapsulates the logic that determines when it is safe to diff --git a/generic/tclParse.c b/generic/tclParse.c index f894906..614401f 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -936,7 +936,7 @@ TclParseBackslash( * #217987] test subst-3.2 */ - if (TclUCS4Complete(p, numBytes - 1)) { + if (Tcl_UtfCharComplete(p, numBytes - 1)) { count = TclUtfToUCS4(p, &unichar) + 1; /* +1 for '\' */ } else { char utfBytes[8]; diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 78e4c1d..78eafa7 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1127,10 +1127,10 @@ Tcl_AppendLimitedToObj( } eLen = strlen(ellipsis); while (eLen > limit) { - eLen = TclUtfPrev(ellipsis+eLen, ellipsis) - ellipsis; + eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis; } - toCopy = TclUtfPrev(bytes+limit+1-eLen, bytes) - bytes; + toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes; } /* @@ -2537,7 +2537,7 @@ AppendPrintfToObjVA( * multi-byte characters. */ - q = TclUtfPrev(end, bytes); + q = Tcl_UtfPrev(end, bytes); if (!Tcl_UtfCharComplete(q, (end - q))) { end = q; } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index e1ba73e..c9169bb 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -78,6 +78,11 @@ static void uniCodePanic() { # define Tcl_SetUnicodeObj (void(*)(Tcl_Obj *, const Tcl_UniChar *, size_t))(void *)uniCodePanic #endif +#define TclUtfCharComplete Tcl_UtfCharComplete +#define TclUtfNext Tcl_UtfNext +#define TclUtfPrev Tcl_UtfPrev + + #define TclBN_mp_add mp_add #define TclBN_mp_add_d mp_add_d #define TclBN_mp_and mp_and @@ -1086,12 +1091,12 @@ const TclStubs tclStubs = { Tcl_UniCharToUpper, /* 323 */ Tcl_UniCharToUtf, /* 324 */ Tcl_UtfAtIndex, /* 325 */ - Tcl_UtfCharComplete, /* 326 */ + TclUtfCharComplete, /* 326 */ Tcl_UtfBackslash, /* 327 */ Tcl_UtfFindFirst, /* 328 */ Tcl_UtfFindLast, /* 329 */ - Tcl_UtfNext, /* 330 */ - Tcl_UtfPrev, /* 331 */ + TclUtfNext, /* 330 */ + TclUtfPrev, /* 331 */ Tcl_UtfToExternal, /* 332 */ Tcl_UtfToExternalDString, /* 333 */ Tcl_UtfToLower, /* 334 */ @@ -1414,9 +1419,9 @@ const TclStubs tclStubs = { Tcl_GetStringFromObj, /* 651 */ Tcl_GetUnicodeFromObj, /* 652 */ Tcl_GetByteArrayFromObj, /* 653 */ - 0, /* 654 */ - 0, /* 655 */ - TclUnusedStubEntry, /* 656 */ + Tcl_UtfCharComplete, /* 654 */ + Tcl_UtfNext, /* 655 */ + Tcl_UtfPrev, /* 656 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclTest.c b/generic/tclTest.c index df61dbb..5c719c1 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -19,6 +19,9 @@ #ifndef USE_TCL_STUBS # define USE_TCL_STUBS #endif +#ifndef TCL_NO_DEPRECATED +# define TCL_NO_DEPRECATED +#endif #include "tclInt.h" #ifdef TCL_WITH_EXTERNAL_TOMMATH # include "tommath.h" @@ -6963,7 +6966,7 @@ TestUtfPrevCmd( } else { offset = numBytes; } - result = TclUtfPrev(bytes + offset, bytes); + result = Tcl_UtfPrev(bytes + offset, bytes); Tcl_SetObjResult(interp, Tcl_NewWideIntObj(result - bytes)); return TCL_OK; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 447a5c9..9e49e87 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,20 +64,12 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -/* End of "continuation byte section" */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 1,1,1,1,1, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; - + static const unsigned char complete[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -88,17 +80,9 @@ static const unsigned char complete[256] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* End of "continuation byte section" */ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - /* Tcl_UtfToUniChar() accesses src[1] and src[2] to check whether - * the UTF-8 sequence is valid, so we cannot use 1 here. */ - 3,3,3,3,3, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; - + /* * Functions used only in this module. */ @@ -697,7 +681,7 @@ Tcl_UtfToUniCharDString( p += TclUtfToUCS4(p, &ch); *w++ = ch; } - while ((p < endPtr) && TclUCS4Complete(p, endPtr-p)) { + while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { p += TclUtfToUCS4(p, &ch); *w++ = ch; } @@ -755,7 +739,7 @@ Tcl_UtfToChar16DString( *w++ = ch; } while (p < endPtr) { - if (TclChar16Complete(p, endPtr-p)) { + if (Tcl_UtfCharComplete(p, endPtr-p)) { p += Tcl_UtfToChar16(p, &ch); *w++ = ch; } else { @@ -836,7 +820,7 @@ Tcl_NumUtfChars( /* Pointer to the end of string. Never read endPtr[0] */ const char *endPtr = src + length; /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - TCL_UTF_MAX; + const char *optPtr = endPtr - 4; /* * Optimize away the call in this loop. Justified because... @@ -1071,7 +1055,7 @@ Tcl_UtfPrev( * it (the fallback) is correct. */ - || (trailBytesSeen >= complete[byte])) { + || (trailBytesSeen >= totalBytes[byte])) { /* * That is, (1 + trailBytesSeen > needed). * We've examined more bytes than needed to complete @@ -1112,19 +1096,14 @@ Tcl_UtfPrev( /* Continue the search backwards... */ look--; - } while (trailBytesSeen < TCL_UTF_MAX); + } while (trailBytesSeen < 4); /* - * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a + * We've seen 4 trail bytes, so we know there will not be a * properly formed byte sequence to find, and we can stop looking, - * accepting the fallback (for TCL_UTF_MAX > 3) or just go back as - * far as we can. + * accepting the fallback. */ -#if TCL_UTF_MAX > 3 return fallback; -#else - return src - TCL_UTF_MAX; -#endif } /* @@ -1757,7 +1736,7 @@ Tcl_UniCharToLower( /* Clear away extension bits, if any */ return ch & 0x1FFFFF; } - + /* *---------------------------------------------------------------------- * diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 55b12ac..c9c3b8f 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1655,11 +1655,7 @@ TclTrimRight( const char *q = trim; size_t pInc = 0, bytesLeft = numTrim; - pp = TclUtfPrev(p, bytes); -#if TCL_UTF_MAX < 4 /* Needed because TclUtfPrev() cannot always jump back */ - /* sufficiently. See [d43f96c1a8] */ - pp = TclUtfPrev(pp, bytes); -#endif + pp = Tcl_UtfPrev(p, bytes); do { pp += pInc; pInc = TclUtfToUCS4(pp, &ch1); diff --git a/tests/utf.test b/tests/utf.test index b6c4597..05e30c7 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -21,7 +21,6 @@ testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] testConstraint utf16 [expr {[string length [format %c 0x10000]] == 2}] testConstraint ucs4 [expr {[testConstraint fullutf] && [string length [format %c 0x10000]] == 1}] -testConstraint ucs2_utf16 [expr {![testConstraint ucs4]}] testConstraint Uesc [expr {"\U0041" eq "A"}] testConstraint pre388 [expr {"\x741" eq "A"}] @@ -285,19 +284,19 @@ test utf-6.28 {Tcl_UtfNext} {testutfnext testbytestring} { test utf-6.29 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xE8\xF8] } 1 -test utf-6.30.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.30.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2] } 1 -test utf-6.30.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.30.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\x00] } 1 test utf-6.31 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xF2]G } 1 -test utf-6.32.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.32.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0] } 1 -test utf-6.32.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.32.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\x00] } 1 test utf-6.33 {Tcl_UtfNext} {testutfnext testbytestring} { @@ -408,10 +407,10 @@ test utf-6.67 {Tcl_UtfNext} {testutfnext testbytestring} { test utf-6.68 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xF2\xA0\xA0]G } 1 -test utf-6.69.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.69.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0] } 1 -test utf-6.69.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.69.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0] } 4 test utf-6.70 {Tcl_UtfNext} {testutfnext testbytestring} { @@ -426,40 +425,40 @@ test utf-6.72 {Tcl_UtfNext} {testutfnext testbytestring} { test utf-6.73 {Tcl_UtfNext} {testutfnext testbytestring} { testutfnext [testbytestring \xF2\xA0\xA0\xF8] } 1 -test utf-6.74.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.74.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0]G } 1 -test utf-6.74.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.74.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0]G } 4 -test utf-6.75.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.75.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] } 1 -test utf-6.75.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.75.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xA0] } 4 -test utf-6.76.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.76.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xD0] } 1 -test utf-6.76.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.76.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xD0] } 4 -test utf-6.77.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.77.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xE8] } 1 -test utf-6.77.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.77.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xE8] } 4 -test utf-6.78.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.78.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xF2] } 1 -test utf-6.78.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.78.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0\xF2] } 4 -test utf-6.79.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2_utf16} { +test utf-6.79.0 {Tcl_UtfNext} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF2\xA0\xA0\xA0G\xF8] } 1 -test utf-6.79.1 {Tcl_UtfNext} {testutfnext testbytestring ucs4} { +test utf-6.79.1 {Tcl_UtfNext} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF2\xA0\xA0\xA0G\xF8] } 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { @@ -483,28 +482,28 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} { test utf-6.86 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring} { testutfnext [testbytestring \xF0\x80\x80\x80] } 1 -test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring ucs2_utf16} { +test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF0\x90\x80\x80] } 1 -test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring ucs4} { +test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF0\x90\x80\x80] } 4 test utf-6.88.0 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xA0\xA0\x00] } 1 -test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring} { +test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xA0\xA0\x00] } 2 test utf-6.89.0 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \x80\x80\x00] } 1 -test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring} { +test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \x80\x80\x00] } 2 -test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring ucs2_utf16} { +test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xF4\x8F\xBF\xBF] } 1 -test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring ucs4} { +test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xF4\x8F\xBF\xBF] } 4 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytestring} { @@ -513,25 +512,25 @@ test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext testbytest test utf-6.92.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xA0\xA0\xA0] } 1 -test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring} { +test utf-6.92.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xA0\xA0\xA0] } 3 test utf-6.93.0 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \x80\x80\x80] } 1 -test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring} { +test utf-6.93.1 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \x80\x80\x80] } 3 test utf-6.94.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \xA0\xA0\xA0\xA0] } 1 -test utf-6.94.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring} { +test utf-6.94.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \xA0\xA0\xA0\xA0] } 3 test utf-6.95.0 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring ucs2} { testutfnext [testbytestring \x80\x80\x80\x80] } 1 -test utf-6.95.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring} { +test utf-6.95.1 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} {testutfnext testbytestring fullutf} { testutfnext [testbytestring \x80\x80\x80\x80] } 3 @@ -691,33 +690,33 @@ test utf-7.17.1 {Tcl_UtfPrev} {testutfprev testbytestring} { test utf-7.17.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xD0\xA0\xA0\xF8] 4 } 3 -test utf-7.18.0 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +test utf-7.18.0 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xA0\xA0\xA0] -} 1 -test utf-7.18.1 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 3 +test utf-7.18.1 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xA0\xA0\xA0\xA0] 4 -} 1 -test utf-7.18.2 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 3 +test utf-7.18.2 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xA0\xA0\xA0\xF8] 4 -} 1 -test utf-7.19 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 3 +test utf-7.19 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xF8\xA0\xA0\xA0] -} 2 +} 4 test utf-7.20.0 {Tcl_UtfPrev} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF2\xA0\xA0\xA0] } 4 -test utf-7.20.1 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +test utf-7.20.1 {Tcl_UtfPrev} {testutfprev testbytestring fullutf} { testutfprev A[testbytestring \xF2\xA0\xA0\xA0] -} 2 -test utf-7.21 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 1 +test utf-7.21 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A\u8820[testbytestring \xA0] -} 2 -test utf-7.22 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 4 +test utf-7.22 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xD0\xA0\xA0\xA0] -} 2 -test utf-7.23 {Tcl_UtfPrev} {testutfprev testbytestring utf16} { +} 4 +test utf-7.23 {Tcl_UtfPrev} {testutfprev testbytestring} { testutfprev A[testbytestring \xA0\xA0\xA0\xA0] -} 2 +} 4 test utf-7.24 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xC0\x81] } 2 @@ -739,9 +738,9 @@ test utf-7.28 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { test utf-7.28.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xE0\x80\x80] 2 } 1 -test utf-7.29 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring utf16} { +test utf-7.29 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xF0\x80\x80\x80] -} 2 +} 4 test utf-7.30 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { testutfprev A[testbytestring \xF0\x80\x80\x80] 4 } 3 @@ -772,9 +771,9 @@ test utf-7.38 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring} { test utf-7.39.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF0\x90\x80\x80] } 4 -test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring utf16} { +test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring fullutf} { testutfprev A[testbytestring \xF0\x90\x80\x80] -} 2 +} 1 test utf-7.40.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF0\x90\x80\x80] 4 } 3 @@ -799,9 +798,9 @@ test utf-7.44 {Tcl_UtfPrev -- no lead byte at start} {testutfprev testbytestrin test utf-7.45 {Tcl_UtfPrev -- no lead byte at start} {testutfprev testbytestring} { testutfprev [testbytestring \xA0\xA0\xA0] } 2 -test utf-7.46 {Tcl_UtfPrev -- no lead byte at start} {testutfprev testbytestring utf16} { +test utf-7.46 {Tcl_UtfPrev -- no lead byte at start} {testutfprev testbytestring} { testutfprev [testbytestring \xA0\xA0\xA0\xA0] -} 1 +} 3 test utf-7.47 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {testutfprev testbytestring} { testutfprev [testbytestring \xE8\xA0] } 0 @@ -814,9 +813,9 @@ test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} { test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] } 4 -test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring utf16} { +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring fullutf} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] -} 2 +} 1 test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring ucs2} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] 4 } 3 @@ -832,9 +831,9 @@ test utf-7.48.5 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbyte test utf-7.48.6 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring} { testutfprev A[testbytestring \xF4\x8F\xBF\xBF] 2 } 1 -test utf-7.49.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring utf16} { +test utf-7.49.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring} { testutfprev A[testbytestring \xF4\x90\x80\x80] -} 2 +} 4 test utf-7.49.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev testbytestring} { testutfprev A[testbytestring \xF4\x90\x80\x80] 4 } 3 |