From b5e6f95ea90be59cb281c089806f0e446e1272bc Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 22 Mar 2022 15:38:25 +0000 Subject: Feature-complete --- generic/tcl.decls | 9 ++++++++ generic/tclDecls.h | 15 +++++++++++++ generic/tclInt.h | 12 +++++++---- generic/tclStringObj.c | 8 +++---- generic/tclStubInit.c | 3 +++ generic/tclUtf.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++- tests/string.test | 2 +- 7 files changed, 96 insertions(+), 10 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index f5b2e78..fa844e0 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2446,6 +2446,15 @@ declare 660 { declare 668 { int Tcl_UniCharLen(const int *uniStr) } +declare 669 { + int TclNumUtfChars(const char *src, int length) +} +declare 670 { + int TclGetCharLength(Tcl_Obj *objPtr) +} +declare 671 { + const char *TclUtfAtIndex(const char *src, int index) +} # ----- BASELINE -- FOR -- 8.7.0 ----- # diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 1952641..9f5e798 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1957,6 +1957,12 @@ EXTERN int Tcl_AsyncMarkFromSignal(Tcl_AsyncHandler async, /* Slot 667 is reserved */ /* 668 */ EXTERN int Tcl_UniCharLen(const int *uniStr); +/* 669 */ +EXTERN int TclNumUtfChars(const char *src, int length); +/* 670 */ +EXTERN int TclGetCharLength(Tcl_Obj *objPtr); +/* 671 */ +EXTERN const char * TclUtfAtIndex(const char *src, int index); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2661,6 +2667,9 @@ typedef struct TclStubs { void (*reserved666)(void); void (*reserved667)(void); int (*tcl_UniCharLen) (const int *uniStr); /* 668 */ + int (*tclNumUtfChars) (const char *src, int length); /* 669 */ + int (*tclGetCharLength) (Tcl_Obj *objPtr); /* 670 */ + const char * (*tclUtfAtIndex) (const char *src, int index); /* 671 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -4020,6 +4029,12 @@ extern const TclStubs *tclStubsPtr; /* Slot 667 is reserved */ #define Tcl_UniCharLen \ (tclStubsPtr->tcl_UniCharLen) /* 668 */ +#define TclNumUtfChars \ + (tclStubsPtr->tclNumUtfChars) /* 669 */ +#define TclGetCharLength \ + (tclStubsPtr->tclGetCharLength) /* 670 */ +#define TclUtfAtIndex \ + (tclStubsPtr->tclUtfAtIndex) /* 671 */ #endif /* defined(USE_TCL_STUBS) */ diff --git a/generic/tclInt.h b/generic/tclInt.h index 538bca3..73d6386 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3322,12 +3322,12 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); #if TCL_UTF_MAX > 3 MODULE_SCOPE int *TclGetUnicodeFromObj_(Tcl_Obj *, int *); MODULE_SCOPE Tcl_Obj *TclNewUnicodeObj(const int *, int); - MODULE_SCOPE int TclGetCharLength(Tcl_Obj *); MODULE_SCOPE void TclAppendUnicodeToObj(Tcl_Obj *, const int *, int); MODULE_SCOPE int TclUniCharNcasecmp(const int *, const int *, unsigned long); MODULE_SCOPE int TclUniCharCaseMatch(const int *, const int *, int); MODULE_SCOPE int TclUniCharNcmp(const int *, const int *, unsigned long); - MODULE_SCOPE const char *TclUtfAtIndex(const char *, int); +# undef Tcl_NumUtfChars +# define Tcl_NumUtfChars TclNumUtfChars # undef Tcl_GetCharLength # define Tcl_GetCharLength TclGetCharLength # undef Tcl_UtfAtIndex @@ -3335,11 +3335,15 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); #else # define TclGetUnicodeFromObj_ Tcl_GetUnicodeFromObj # define TclNewUnicodeObj Tcl_NewUnicodeObj -# define TclGetCharLength Tcl_GetCharLength # define TclAppendUnicodeToObj Tcl_AppendUnicodeToObj # define TclUniCharNcasecmp Tcl_UniCharNcasecmp # define TclUniCharCaseMatch Tcl_UniCharCaseMatch # define TclUniCharNcmp Tcl_UniCharNcmp +# undef TclNumUtfChars +# define TclNumUtfChars Tcl_NumUtfChars +# undef TclGetCharLength +# define TclGetCharLength Tcl_GetCharLength +# undef TclUtfAtIndex # define TclUtfAtIndex Tcl_UtfAtIndex #endif @@ -4764,7 +4768,7 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; *---------------------------------------------------------------- */ -#define TclNumUtfChars(numChars, bytes, numBytes) \ +#define TclNumUtfChars_UNUSED(numChars, bytes, numBytes) \ do { \ int _count, _i = (numBytes); \ unsigned char *_str = (unsigned char *) (bytes); \ diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 6417e1b..2b11877 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -637,7 +637,7 @@ TclGetCharLength( */ if (numChars == -1) { - TclNumUtfChars(numChars, objPtr->bytes, objPtr->length); + numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); stringPtr->numChars = numChars; } return numChars; @@ -782,7 +782,7 @@ Tcl_GetUniChar( */ if (stringPtr->numChars == -1) { - TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + stringPtr->numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { return (unsigned char) objPtr->bytes[index]; @@ -991,7 +991,7 @@ Tcl_GetRange( */ if (stringPtr->numChars == -1) { - TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + stringPtr->numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { if (last < 0 || last >= stringPtr->numChars) { @@ -4447,7 +4447,7 @@ ExtendUnicodeRepWithString( numOrigChars = stringPtr->numChars; } if (numAppendChars == -1) { - TclNumUtfChars(numAppendChars, bytes, numBytes); + numAppendChars = Tcl_NumUtfChars(bytes, numBytes); } needed = numOrigChars + numAppendChars; uniCharStringCheckLimits(needed); diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index e3ebb8b..09163c3 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -1937,6 +1937,9 @@ const TclStubs tclStubs = { 0, /* 666 */ 0, /* 667 */ Tcl_UniCharLen, /* 668 */ + TclNumUtfChars, /* 669 */ + TclGetCharLength, /* 670 */ + TclUtfAtIndex, /* 671 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 4dd1e09..eda317f 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -799,7 +799,7 @@ Tcl_UtfCharComplete( */ int -Tcl_NumUtfChars( +TclNumUtfChars( const char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ @@ -850,6 +850,61 @@ Tcl_NumUtfChars( return i; } +#if TCL_UTF_MAX > 3 +#undef Tcl_NumUtfChars +int +Tcl_NumUtfChars( + const char *src, /* The UTF-8 string to measure. */ + int length) /* The length of the string in bytes, or -1 + * for strlen(string). */ +{ + unsigned short ch = 0; + int i = 0; + + if (length < 0) { + /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ + while ((*src != '\0') && (i < INT_MAX)) { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + } else { + /* Will return value between 0 and length. No overflow checks. */ + + /* Pointer to the end of string. Never read endPtr[0] */ + const char *endPtr = src + length; + /* Pointer to last byte where optimization still can be used */ + const char *optPtr = endPtr - 4; + + /* + * Optimize away the call in this loop. Justified because... + * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr) + * By initialization above (endPtr - optPtr) = TCL_UTF_MAX + * So (endPtr - src) >= TCL_UTF_MAX, and passing that to + * Tcl_UtfCharComplete we know will cause return of 1. + */ + while (src <= optPtr + /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + /* Loop over the remaining string where call must happen */ + while (src < endPtr) { + if (Tcl_UtfCharComplete(src, endPtr - src)) { + src += Tcl_UtfToChar16(src, &ch); + } else { + /* + * src points to incomplete UTF-8 sequence + * Treat first byte as character and count it + */ + src++; + } + i++; + } + } + return i; +} +#endif + /* *--------------------------------------------------------------------------- * diff --git a/tests/string.test b/tests/string.test index 203d0c6..6863c23 100644 --- a/tests/string.test +++ b/tests/string.test @@ -422,7 +422,7 @@ test string-4.16.$noComp {string first, normal string vs pure unicode string} -b # Representation checks are canaries run {list [representationpoke $s] [representationpoke $m] \ [string first $m $s]} -} -result {{string 1} {string 0} 2} +} -match glob -result {{*string 1} {*string 0} 2} test string-4.17.$noComp {string first, corner case} -body { run {string first a aaa 4294967295} } -result {-1} -- cgit v0.12