From 648162204d4c9bd80cde739b4cad361de6ebe6f1 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 12 May 2020 18:59:04 +0000 Subject: First, experimental implementation of TIP #575. Barely tested, will fail. WIP --- generic/tcl.decls | 17 ++++++++++++++--- generic/tclCmdMZ.c | 2 +- generic/tclCompExpr.c | 8 ++++---- generic/tclDecls.h | 51 +++++++++++++++++++++++++++++++++++--------------- generic/tclEncoding.c | 4 ++-- generic/tclInt.h | 14 -------------- generic/tclParse.c | 8 ++++---- generic/tclStringObj.c | 6 +++--- generic/tclStubInit.c | 34 ++++++++++++++++++++++++++++++--- generic/tclTest.c | 8 ++++---- generic/tclUtf.c | 14 ++++---------- generic/tclUtil.c | 4 ++-- 12 files changed, 105 insertions(+), 65 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index a550411..4ccedd1 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1163,7 +1163,7 @@ declare 325 { const char *Tcl_UtfAtIndex(const char *src, int index) } declare 326 { - int Tcl_UtfCharComplete(const char *src, int length) + int TclUtfCharComplete(const char *src, int length) } declare 327 { int Tcl_UtfBackslash(const char *src, int *readPtr, char *dst) @@ -1175,10 +1175,10 @@ declare 329 { const char *Tcl_UtfFindLast(const char *src, int ch) } declare 330 { - const char *Tcl_UtfNext(const char *src) + const char *TclUtfNext(const char *src) } declare 331 { - const char *Tcl_UtfPrev(const char *src, const char *start) + const char *TclUtfPrev(const char *src, const char *start) } declare 332 { int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, @@ -2402,6 +2402,17 @@ declare 648 { int length, Tcl_DString *dsPtr) } +# TIP #575 +declare 649 { + int Tcl_UtfCharComplete(const char *src, int length) +} +declare 650 { + const char *Tcl_UtfNext(const char *src) +} +declare 651 { + const char *Tcl_UtfPrev(const char *src, const char *start) +} + # ----- BASELINE -- FOR -- 8.7.0 ----- # ############################################################################## diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 56df0dd..8f0465d 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2529,7 +2529,7 @@ StringStartCmd( break; } - next = TclUtfPrev(p, string); + next = Tcl_UtfPrev(p, string); do { next += delta; delta = TclUtfToUCS4(next, &ch); diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c index 4fb41fc..4d448b9 100644 --- a/generic/tclCompExpr.c +++ b/generic/tclCompExpr.c @@ -1893,7 +1893,7 @@ ParseLexeme( { const char *end; int scanned; - Tcl_UniChar ch = 0; + int ch; Tcl_Obj *literal = NULL; unsigned char byte; @@ -2101,13 +2101,13 @@ ParseLexeme( if (!TclIsBareword(*start) || *start == '_') { if (Tcl_UtfCharComplete(start, numBytes)) { - scanned = TclUtfToUniChar(start, &ch); + scanned = TclUtfToUCS4(start, &ch); } else { - char utfBytes[4]; + char utfBytes[8]; memcpy(utfBytes, start, numBytes); utfBytes[numBytes] = '\0'; - scanned = TclUtfToUniChar(utfBytes, &ch); + scanned = TclUtfToUCS4(utfBytes, &ch); } *lexemePtr = INVALID; Tcl_DecrRefCount(literal); diff --git a/generic/tclDecls.h b/generic/tclDecls.h index c713469..7c1b22b 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1000,7 +1000,7 @@ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ EXTERN const char * Tcl_UtfAtIndex(const char *src, int index); /* 326 */ -EXTERN int Tcl_UtfCharComplete(const char *src, int length); +EXTERN int TclUtfCharComplete(const char *src, int length); /* 327 */ EXTERN int Tcl_UtfBackslash(const char *src, int *readPtr, char *dst); @@ -1009,9 +1009,9 @@ EXTERN const char * Tcl_UtfFindFirst(const char *src, int ch); /* 329 */ EXTERN const char * Tcl_UtfFindLast(const char *src, int ch); /* 330 */ -EXTERN const char * Tcl_UtfNext(const char *src); +EXTERN const char * TclUtfNext(const char *src); /* 331 */ -EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); +EXTERN const char * TclUtfPrev(const char *src, const char *start); /* 332 */ EXTERN int Tcl_UtfToExternal(Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, @@ -1921,6 +1921,12 @@ EXTERN char * Tcl_UniCharToUtfDString(const int *uniStr, /* 648 */ EXTERN int * Tcl_UtfToUniCharDString(const char *src, int length, Tcl_DString *dsPtr); +/* 649 */ +EXTERN int Tcl_UtfCharComplete(const char *src, int length); +/* 650 */ +EXTERN const char * Tcl_UtfNext(const char *src); +/* 651 */ +EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2282,12 +2288,12 @@ typedef struct TclStubs { int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ const char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */ - int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */ + int (*tclUtfCharComplete) (const char *src, int length); /* 326 */ int (*tcl_UtfBackslash) (const char *src, int *readPtr, char *dst); /* 327 */ const char * (*tcl_UtfFindFirst) (const char *src, int ch); /* 328 */ const char * (*tcl_UtfFindLast) (const char *src, int ch); /* 329 */ - const char * (*tcl_UtfNext) (const char *src); /* 330 */ - const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 331 */ + const char * (*tclUtfNext) (const char *src); /* 330 */ + const char * (*tclUtfPrev) (const char *src, const char *start); /* 331 */ int (*tcl_UtfToExternal) (Tcl_Interp *interp, Tcl_Encoding encoding, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); /* 332 */ char * (*tcl_UtfToExternalDString) (Tcl_Encoding encoding, const char *src, int srcLen, Tcl_DString *dsPtr); /* 333 */ int (*tcl_UtfToLower) (char *src); /* 334 */ @@ -2605,6 +2611,9 @@ typedef struct TclStubs { int (*tcl_UtfToUniChar) (const char *src, int *chPtr); /* 646 */ char * (*tcl_UniCharToUtfDString) (const int *uniStr, int uniLength, Tcl_DString *dsPtr); /* 647 */ int * (*tcl_UtfToUniCharDString) (const char *src, int length, Tcl_DString *dsPtr); /* 648 */ + int (*tcl_UtfCharComplete) (const char *src, int length); /* 649 */ + const char * (*tcl_UtfNext) (const char *src); /* 650 */ + const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 651 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -3287,18 +3296,18 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UniCharToUtf) /* 324 */ #define Tcl_UtfAtIndex \ (tclStubsPtr->tcl_UtfAtIndex) /* 325 */ -#define Tcl_UtfCharComplete \ - (tclStubsPtr->tcl_UtfCharComplete) /* 326 */ +#define TclUtfCharComplete \ + (tclStubsPtr->tclUtfCharComplete) /* 326 */ #define Tcl_UtfBackslash \ (tclStubsPtr->tcl_UtfBackslash) /* 327 */ #define Tcl_UtfFindFirst \ (tclStubsPtr->tcl_UtfFindFirst) /* 328 */ #define Tcl_UtfFindLast \ (tclStubsPtr->tcl_UtfFindLast) /* 329 */ -#define Tcl_UtfNext \ - (tclStubsPtr->tcl_UtfNext) /* 330 */ -#define Tcl_UtfPrev \ - (tclStubsPtr->tcl_UtfPrev) /* 331 */ +#define TclUtfNext \ + (tclStubsPtr->tclUtfNext) /* 330 */ +#define TclUtfPrev \ + (tclStubsPtr->tclUtfPrev) /* 331 */ #define Tcl_UtfToExternal \ (tclStubsPtr->tcl_UtfToExternal) /* 332 */ #define Tcl_UtfToExternalDString \ @@ -3933,6 +3942,12 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UniCharToUtfDString) /* 647 */ #define Tcl_UtfToUniCharDString \ (tclStubsPtr->tcl_UtfToUniCharDString) /* 648 */ +#define Tcl_UtfCharComplete \ + (tclStubsPtr->tcl_UtfCharComplete) /* 649 */ +#define Tcl_UtfNext \ + (tclStubsPtr->tcl_UtfNext) /* 650 */ +#define Tcl_UtfPrev \ + (tclStubsPtr->tcl_UtfPrev) /* 651 */ #endif /* defined(USE_TCL_STUBS) */ @@ -4178,10 +4193,16 @@ extern const TclStubs *tclStubsPtr; #define Tcl_Close(interp, chan) Tcl_CloseEx(interp, chan, 0) #endif -#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX > 3) +#undef TclUtfCharComplete +#undef TclUtfNext +#undef TclUtfPrev +#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX < 4) && !defined(TCL_NO_DEPRECATED) # undef Tcl_UtfCharComplete -# define Tcl_UtfCharComplete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 4) : tclStubsPtr->tcl_UtfCharComplete((src), (length))) +# undef Tcl_UtfNext +# undef Tcl_UtfPrev +# define Tcl_UtfCharComplete (tclStubsPtr->tclUtfCharComplete) +# define Tcl_UtfNext (tclStubsPtr->tclUtfNext) +# define Tcl_UtfPrev (tclStubsPtr->tclUtfPrev) #endif #endif /* _TCLDECLS */ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index ae02821..784d8d6 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2321,7 +2321,7 @@ UtfToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the * last UTF-8 character in the source buffer hasn't been cut off. @@ -2351,7 +2351,7 @@ UtfToUtfProc( *dst++ = 0; *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!TclUCS4Complete(src, srcEnd - src)) { + } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an diff --git a/generic/tclInt.h b/generic/tclInt.h index 78d9f93..1b95754 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3252,14 +3252,8 @@ MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCount(int ch); #if TCL_UTF_MAX > 3 # define TclUtfToUCS4 Tcl_UtfToUniChar -# define TclUCS4Complete Tcl_UtfCharComplete -# define TclChar16Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 3) : Tcl_UtfCharComplete((src), (length))) #else MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); -# define TclUCS4Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) -# define TclChar16Complete Tcl_UtfCharComplete #endif MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); @@ -4695,14 +4689,6 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; (numChars) = _count; \ } while (0); -#define TclUtfPrev(src, start) \ - (((src) < (start) + 2) ? (start) : \ - ((unsigned char) *((src) - 1)) < 0x80 ? (src) - 1 : \ - Tcl_UtfPrev(src, start)) - -#define TclUtfNext(src) \ - ((((unsigned char) *(src)) < 0x80) ? (src) + 1 : Tcl_UtfNext(src)) - /* *---------------------------------------------------------------- * Macro that encapsulates the logic that determines when it is safe to diff --git a/generic/tclParse.c b/generic/tclParse.c index 132e804..49ee348 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -789,7 +789,7 @@ TclParseBackslash( * written. At most 4 bytes will be written there. */ { const char *p = src+1; - Tcl_UniChar unichar = 0; + int unichar; int result; int count; char buf[4] = ""; @@ -936,13 +936,13 @@ TclParseBackslash( */ if (Tcl_UtfCharComplete(p, numBytes - 1)) { - count = TclUtfToUniChar(p, &unichar) + 1; /* +1 for '\' */ + count = TclUtfToUCS4(p, &unichar) + 1; /* +1 for '\' */ } else { - char utfBytes[4]; + char utfBytes[8]; memcpy(utfBytes, p, numBytes - 1); utfBytes[numBytes - 1] = '\0'; - count = TclUtfToUniChar(utfBytes, &unichar) + 1; + count = TclUtfToUCS4(utfBytes, &unichar) + 1; } result = unichar; break; diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 78e49f9..2025674 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1171,10 +1171,10 @@ Tcl_AppendLimitedToObj( } eLen = strlen(ellipsis); while (eLen > limit) { - eLen = TclUtfPrev(ellipsis+eLen, ellipsis) - ellipsis; + eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis; } - toCopy = TclUtfPrev(bytes+limit+1-eLen, bytes) - bytes; + toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes; } /* @@ -2614,7 +2614,7 @@ AppendPrintfToObjVA( * multi-byte characters. */ - q = TclUtfPrev(end, bytes); + q = Tcl_UtfPrev(end, bytes); if (!Tcl_UtfCharComplete(q, (int)(end - q))) { end = q; } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 2d2bc63..ae9a4e3 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -88,6 +88,31 @@ static void uniCodePanic(void) { # define Tcl_UniCharNcmp (int(*)(const Tcl_UniChar *, const Tcl_UniChar *, unsigned long))(void *)uniCodePanic #endif +#define TclUtfCharComplete UtfCharComplete +#define TclUtfNext UtfNext +#define TclUtfPrev UtfPrev + +static int TclUtfCharComplete(const char *src, int length) { + if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { + return length < 5; + } + return Tcl_UtfCharComplete(src, length); +} + +static const char *TclUtfNext(const char *src) { + if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { + return src + 1; + } + return Tcl_UtfNext(src); +} + +static const char *TclUtfPrev(const char *src, const char *start) { + if (((unsigned)((unsigned char)*(src) - 0xF0) < 5) && (src >= start)) { + return src - 1; + } + return Tcl_UtfPrev(src, start); +} + #define TclBN_mp_add mp_add #define TclBN_mp_and mp_and #define TclBN_mp_clamp mp_clamp @@ -1549,12 +1574,12 @@ const TclStubs tclStubs = { Tcl_UniCharToUpper, /* 323 */ Tcl_UniCharToUtf, /* 324 */ Tcl_UtfAtIndex, /* 325 */ - Tcl_UtfCharComplete, /* 326 */ + TclUtfCharComplete, /* 326 */ Tcl_UtfBackslash, /* 327 */ Tcl_UtfFindFirst, /* 328 */ Tcl_UtfFindLast, /* 329 */ - Tcl_UtfNext, /* 330 */ - Tcl_UtfPrev, /* 331 */ + TclUtfNext, /* 330 */ + TclUtfPrev, /* 331 */ Tcl_UtfToExternal, /* 332 */ Tcl_UtfToExternalDString, /* 333 */ Tcl_UtfToLower, /* 334 */ @@ -1872,6 +1897,9 @@ const TclStubs tclStubs = { Tcl_UtfToUniChar, /* 646 */ Tcl_UniCharToUtfDString, /* 647 */ Tcl_UtfToUniCharDString, /* 648 */ + Tcl_UtfCharComplete, /* 649 */ + Tcl_UtfNext, /* 650 */ + Tcl_UtfPrev, /* 651 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclTest.c b/generic/tclTest.c index 1f6882f..78645b6 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -6844,10 +6844,10 @@ TestUtfNextCmd( memcpy(buffer + 1, bytes, numBytes); buffer[0] = buffer[numBytes + 1] = buffer[numBytes + 2] = buffer[numBytes + 3] = '\xA0'; - first = result = TclUtfNext(buffer + 1); + first = result = Tcl_UtfNext(buffer + 1); while ((buffer[0] = *p++) != '\0') { /* Run Tcl_UtfNext with many more possible bytes at src[-1], all should give the same result */ - result = TclUtfNext(buffer + 1); + result = Tcl_UtfNext(buffer + 1); if (first != result) { Tcl_AppendResult(interp, "Tcl_UtfNext is not supposed to read src[-1]", NULL); return TCL_ERROR; @@ -6856,7 +6856,7 @@ TestUtfNextCmd( p = tobetested; while ((buffer[numBytes + 1] = *p++) != '\0') { /* Run Tcl_UtfNext with many more possible bytes at src[end], all should give the same result */ - result = TclUtfNext(buffer + 1); + result = Tcl_UtfNext(buffer + 1); if (first != result) { first = buffer; break; @@ -6904,7 +6904,7 @@ TestUtfPrevCmd( } else { offset = numBytes; } - result = TclUtfPrev(bytes + offset, bytes); + result = Tcl_UtfPrev(bytes + offset, bytes); Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); return TCL_OK; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 7c09283..6f03053 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -88,13 +88,7 @@ static const unsigned char complete[256] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* End of "continuation byte section" */ 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4, -#else - 3,3,3,3,3, -#endif - 1,1,1,1,1,1,1,1,1,1,1 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 }; /* @@ -694,7 +688,7 @@ Tcl_UtfToUniCharDString( p += TclUtfToUCS4(p, &ch); *w++ = ch; } - while ((p < endPtr) && TclUCS4Complete(p, endPtr-p)) { + while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { p += TclUtfToUCS4(p, &ch); *w++ = ch; } @@ -752,7 +746,7 @@ Tcl_UtfToChar16DString( *w++ = ch; } while (p < endPtr) { - if (TclChar16Complete(p, endPtr-p)) { + if (Tcl_UtfCharComplete(p, endPtr-p)) { p += Tcl_UtfToChar16(p, &ch); *w++ = ch; } else { @@ -833,7 +827,7 @@ Tcl_NumUtfChars( /* Pointer to the end of string. Never read endPtr[0] */ const char *endPtr = src + length; /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - TCL_UTF_MAX; + const char *optPtr = endPtr - 4; /* * Optimize away the call in this loop. Justified because... diff --git a/generic/tclUtil.c b/generic/tclUtil.c index ebc8656..40b249d 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1707,7 +1707,7 @@ TclTrimRight( const char *q = trim; int pInc = 0, bytesLeft = numTrim; - pp = TclUtfPrev(p, bytes); + pp = Tcl_UtfPrev(p, bytes); do { pp += pInc; pInc = TclUtfToUCS4(pp, &ch1); @@ -1858,7 +1858,7 @@ TclTrim( * that we will not trim. Skip over it. */ if (numBytes > 0) { const char *first = bytes + trimLeft; - bytes = TclUtfNext(first); + bytes = Tcl_UtfNext(first); numBytes -= (bytes - first); if (numBytes > 0) { -- cgit v0.12