From 60e9b38dfbeb8b382fd60528363fe726331ac4db Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 29 Mar 2022 19:40:33 +0000 Subject: Add UTF-16 versions of Tcl_NumUtfChars/Tcl_UtfAtIndex to the stub table. Should have been part of TIP #542. Needed for Tk's "glyph_indexing_2" branch --- generic/tcl.decls | 10 ++++-- generic/tclDecls.h | 35 ++++++++++++++----- generic/tclInt.h | 4 +-- generic/tclStringObj.c | 8 ++--- generic/tclStubInit.c | 7 ++-- generic/tclUtf.c | 91 +++++++++++++++++++++++++++++++++++++++++--------- 6 files changed, 122 insertions(+), 33 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index 5a03bd2..b943edd 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1162,7 +1162,7 @@ declare 311 { const Tcl_Time *timePtr) } declare 312 { - size_t Tcl_NumUtfChars(const char *src, size_t length) + size_t TclNumUtfChars(const char *src, size_t length) } declare 313 { size_t Tcl_ReadChars(Tcl_Channel channel, Tcl_Obj *objPtr, @@ -1206,7 +1206,7 @@ declare 324 { int Tcl_UniCharToUtf(int ch, char *buf) } declare 325 { - const char *Tcl_UtfAtIndex(const char *src, size_t index) + const char *TclUtfAtIndex(const char *src, size_t index) } declare 326 { int TclUtfCharComplete(const char *src, size_t length) @@ -2516,6 +2516,12 @@ declare 660 { declare 668 { size_t Tcl_UniCharLen(const int *uniStr) } +declare 669 { + size_t Tcl_NumUtfChars(const char *src, size_t length) +} +declare 671 { + const char *Tcl_UtfAtIndex(const char *src, size_t index) +} # ----- BASELINE -- FOR -- 8.7.0 ----- # diff --git a/generic/tclDecls.h b/generic/tclDecls.h index cc33cf8..3e2b8cb 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -828,7 +828,7 @@ EXTERN void Tcl_ConditionNotify(Tcl_Condition *condPtr); EXTERN void Tcl_ConditionWait(Tcl_Condition *condPtr, Tcl_Mutex *mutexPtr, const Tcl_Time *timePtr); /* 312 */ -EXTERN size_t Tcl_NumUtfChars(const char *src, size_t length); +EXTERN size_t TclNumUtfChars(const char *src, size_t length); /* 313 */ EXTERN size_t Tcl_ReadChars(Tcl_Channel channel, Tcl_Obj *objPtr, size_t charsToRead, int appendFlag); @@ -857,7 +857,7 @@ EXTERN int Tcl_UniCharToUpper(int ch); /* 324 */ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ -EXTERN const char * Tcl_UtfAtIndex(const char *src, size_t index); +EXTERN const char * TclUtfAtIndex(const char *src, size_t index); /* 326 */ EXTERN int TclUtfCharComplete(const char *src, size_t length); /* 327 */ @@ -1774,6 +1774,11 @@ EXTERN int Tcl_AsyncMarkFromSignal(Tcl_AsyncHandler async, /* Slot 667 is reserved */ /* 668 */ EXTERN size_t Tcl_UniCharLen(const int *uniStr); +/* 669 */ +EXTERN size_t Tcl_NumUtfChars(const char *src, size_t length); +/* Slot 670 is reserved */ +/* 671 */ +EXTERN const char * Tcl_UtfAtIndex(const char *src, size_t index); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2097,7 +2102,7 @@ typedef struct TclStubs { void (*tcl_MutexUnlock) (Tcl_Mutex *mutexPtr); /* 309 */ void (*tcl_ConditionNotify) (Tcl_Condition *condPtr); /* 310 */ void (*tcl_ConditionWait) (Tcl_Condition *condPtr, Tcl_Mutex *mutexPtr, const Tcl_Time *timePtr); /* 311 */ - size_t (*tcl_NumUtfChars) (const char *src, size_t length); /* 312 */ + size_t (*tclNumUtfChars) (const char *src, size_t length); /* 312 */ size_t (*tcl_ReadChars) (Tcl_Channel channel, Tcl_Obj *objPtr, size_t charsToRead, int appendFlag); /* 313 */ void (*reserved314)(void); void (*reserved315)(void); @@ -2110,7 +2115,7 @@ typedef struct TclStubs { int (*tcl_UniCharToTitle) (int ch); /* 322 */ int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ - const char * (*tcl_UtfAtIndex) (const char *src, size_t index); /* 325 */ + const char * (*tclUtfAtIndex) (const char *src, size_t index); /* 325 */ int (*tclUtfCharComplete) (const char *src, size_t length); /* 326 */ size_t (*tcl_UtfBackslash) (const char *src, int *readPtr, char *dst); /* 327 */ const char * (*tcl_UtfFindFirst) (const char *src, int ch); /* 328 */ @@ -2454,6 +2459,9 @@ typedef struct TclStubs { void (*reserved666)(void); void (*reserved667)(void); size_t (*tcl_UniCharLen) (const int *uniStr); /* 668 */ + size_t (*tcl_NumUtfChars) (const char *src, size_t length); /* 669 */ + void (*reserved670)(void); + const char * (*tcl_UtfAtIndex) (const char *src, size_t index); /* 671 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -3046,8 +3054,8 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_ConditionNotify) /* 310 */ #define Tcl_ConditionWait \ (tclStubsPtr->tcl_ConditionWait) /* 311 */ -#define Tcl_NumUtfChars \ - (tclStubsPtr->tcl_NumUtfChars) /* 312 */ +#define TclNumUtfChars \ + (tclStubsPtr->tclNumUtfChars) /* 312 */ #define Tcl_ReadChars \ (tclStubsPtr->tcl_ReadChars) /* 313 */ /* Slot 314 is reserved */ @@ -3070,8 +3078,8 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tcl_UniCharToUpper) /* 323 */ #define Tcl_UniCharToUtf \ (tclStubsPtr->tcl_UniCharToUtf) /* 324 */ -#define Tcl_UtfAtIndex \ - (tclStubsPtr->tcl_UtfAtIndex) /* 325 */ +#define TclUtfAtIndex \ + (tclStubsPtr->tclUtfAtIndex) /* 325 */ #define TclUtfCharComplete \ (tclStubsPtr->tclUtfCharComplete) /* 326 */ #define Tcl_UtfBackslash \ @@ -3736,6 +3744,11 @@ extern const TclStubs *tclStubsPtr; /* Slot 667 is reserved */ #define Tcl_UniCharLen \ (tclStubsPtr->tcl_UniCharLen) /* 668 */ +#define Tcl_NumUtfChars \ + (tclStubsPtr->tcl_NumUtfChars) /* 669 */ +/* Slot 670 is reserved */ +#define Tcl_UtfAtIndex \ + (tclStubsPtr->tcl_UtfAtIndex) /* 671 */ #endif /* defined(USE_TCL_STUBS) */ @@ -3937,6 +3950,12 @@ extern const TclStubs *tclStubsPtr; # define Tcl_UtfToUniChar Tcl_UtfToChar16 # undef Tcl_UniCharLen # define Tcl_UniCharLen Tcl_Char16Len +#if !defined(BUILD_tcl) +# undef Tcl_NumUtfChars +# define Tcl_NumUtfChars TclNumUtfChars +# undef Tcl_UtfAtIndex +# define Tcl_UtfAtIndex TclUtfAtIndex +#endif #endif #if defined(USE_TCL_STUBS) # define Tcl_WCharToUtfDString (sizeof(wchar_t) != sizeof(short) \ diff --git a/generic/tclInt.h b/generic/tclInt.h index 1eb486e..edd0172 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -4666,12 +4666,12 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; * of counting along a string of all one-byte characters. The ANSI C * "prototype" for this macro is: * - * MODULE_SCOPE void TclNumUtfChars(int numChars, const char *bytes, + * MODULE_SCOPE void TclNumUtfCharsM(int numChars, const char *bytes, * size_t numBytes); *---------------------------------------------------------------- */ -#define TclNumUtfChars(numChars, bytes, numBytes) \ +#define TclNumUtfCharsM(numChars, bytes, numBytes) \ do { \ size_t _count, _i = (numBytes); \ unsigned char *_str = (unsigned char *) (bytes); \ diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index c8d9df7..2755cf6 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -440,7 +440,7 @@ Tcl_GetCharLength( */ if (numChars == TCL_INDEX_NONE) { - TclNumUtfChars(numChars, objPtr->bytes, objPtr->length); + TclNumUtfCharsM(numChars, objPtr->bytes, objPtr->length); stringPtr->numChars = numChars; } return numChars; @@ -543,7 +543,7 @@ Tcl_GetUniChar( */ if (stringPtr->numChars == TCL_INDEX_NONE) { - TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + TclNumUtfCharsM(stringPtr->numChars, objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { return (unsigned char) objPtr->bytes[index]; @@ -709,7 +709,7 @@ Tcl_GetRange( */ if (stringPtr->numChars == TCL_INDEX_NONE) { - TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + TclNumUtfCharsM(stringPtr->numChars, objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { if (last >= stringPtr->numChars) { @@ -4045,7 +4045,7 @@ ExtendUnicodeRepWithString( numOrigChars = stringPtr->numChars; } if (numAppendChars == TCL_INDEX_NONE) { - TclNumUtfChars(numAppendChars, bytes, numBytes); + TclNumUtfCharsM(numAppendChars, bytes, numBytes); } needed = numOrigChars + numAppendChars; diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index ea7083f..59036ec 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -1005,7 +1005,7 @@ const TclStubs tclStubs = { Tcl_MutexUnlock, /* 309 */ Tcl_ConditionNotify, /* 310 */ Tcl_ConditionWait, /* 311 */ - Tcl_NumUtfChars, /* 312 */ + TclNumUtfChars, /* 312 */ Tcl_ReadChars, /* 313 */ 0, /* 314 */ 0, /* 315 */ @@ -1018,7 +1018,7 @@ const TclStubs tclStubs = { Tcl_UniCharToTitle, /* 322 */ Tcl_UniCharToUpper, /* 323 */ Tcl_UniCharToUtf, /* 324 */ - Tcl_UtfAtIndex, /* 325 */ + TclUtfAtIndex, /* 325 */ TclUtfCharComplete, /* 326 */ Tcl_UtfBackslash, /* 327 */ Tcl_UtfFindFirst, /* 328 */ @@ -1362,6 +1362,9 @@ const TclStubs tclStubs = { 0, /* 666 */ 0, /* 667 */ Tcl_UniCharLen, /* 668 */ + Tcl_NumUtfChars, /* 669 */ + 0, /* 670 */ + Tcl_UtfAtIndex, /* 671 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e353b7f..09e464f 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -799,6 +799,7 @@ Tcl_UtfCharComplete( *--------------------------------------------------------------------------- */ +#undef Tcl_NumUtfChars size_t Tcl_NumUtfChars( const char *src, /* The UTF-8 string to measure. */ @@ -851,6 +852,58 @@ Tcl_NumUtfChars( return i; } +size_t +TclNumUtfChars( + const char *src, /* The UTF-8 string to measure. */ + size_t length) /* The length of the string in bytes, or + * TCL_INDEX_NONE for strlen(src). */ +{ + unsigned short ch = 0; + size_t i = 0; + + if (length == TCL_INDEX_NONE) { + /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ + while (*src != '\0') { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + } else { + /* Will return value between 0 and length. No overflow checks. */ + + /* Pointer to the end of string. Never read endPtr[0] */ + const char *endPtr = src + length; + /* Pointer to last byte where optimization still can be used */ + const char *optPtr = endPtr - 4; + + /* + * Optimize away the call in this loop. Justified because... + * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr) + * By initialization above (endPtr - optPtr) = TCL_UTF_MAX + * So (endPtr - src) >= TCL_UTF_MAX, and passing that to + * Tcl_UtfCharComplete we know will cause return of 1. + */ + while (src <= optPtr + /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + /* Loop over the remaining string where call must happen */ + while (src < endPtr) { + if (Tcl_UtfCharComplete(src, endPtr - src)) { + src += Tcl_UtfToChar16(src, &ch); + } else { + /* + * src points to incomplete UTF-8 sequence + * Treat first byte as character and count it + */ + src++; + } + i++; + } + } + return i; +} + /* *--------------------------------------------------------------------------- * @@ -1167,34 +1220,42 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ +#undef Tcl_UtfAtIndex const char * Tcl_UtfAtIndex( const char *src, /* The UTF-8 string. */ size_t index) /* The position of the desired character. */ { - Tcl_UniChar ch = 0; -#if TCL_UTF_MAX < 4 - size_t len = 0; -#endif + int ch = 0; if (index != TCL_INDEX_NONE) { while (index--) { -#if TCL_UTF_MAX < 4 - src += (len = TclUtfToUniChar(src, &ch)); -#else - src += TclUtfToUniChar(src, &ch); -#endif + src += Tcl_UtfToUniChar(src, &ch); } -#if TCL_UTF_MAX < 4 - if ((ch >= 0xD800) && (len < 3)) { - /* Index points at character following high Surrogate */ - src += TclUtfToUniChar(src, &ch); - } -#endif } return src; } +const char * +TclUtfAtIndex( + const char *src, /* The UTF-8 string. */ + size_t index) /* The position of the desired character. */ +{ + unsigned short ch = 0; + size_t len = 0; + + if (index != TCL_INDEX_NONE) { + while (index--) { + src += (len = Tcl_UtfToChar16(src, &ch)); + } + if ((ch >= 0xD800) && (len < 3)) { + /* Index points at character following high Surrogate */ + src += Tcl_UtfToChar16(src, &ch); + } + } + return src; +} + /* *--------------------------------------------------------------------------- * -- cgit v0.12