From c079597c246630f1982fc12887cb5b199fe98961 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 11 Mar 2022 14:52:27 +0000 Subject: Experiment: full UTF for 8.7. (WIP) --- .github/workflows/linux-build.yml | 2 +- .github/workflows/win-build.yml | 3 +-- generic/tcl.h | 6 +++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index b410aab..cb93bd4 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -7,7 +7,7 @@ jobs: matrix: cfgopt: - "" - - "CFLAGS=-DTCL_UTF_MAX=4" + - "CFLAGS=-DTCL_UTF_MAX=3" - "CFLAGS=-DTCL_NO_DEPRECATED=1" - "--disable-shared" - "--enable-symbols" diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml index a8019ee..0ff696c 100644 --- a/.github/workflows/win-build.yml +++ b/.github/workflows/win-build.yml @@ -13,7 +13,6 @@ jobs: matrix: cfgopt: - "" - - "OPTS=utfmax" - "CHECKS=nodep" - "OPTS=static" - "OPTS=symbols" @@ -52,7 +51,7 @@ jobs: matrix: cfgopt: - "" - - "CFLAGS=-DTCL_UTF_MAX=4" + - "CFLAGS=-DTCL_UTF_MAX=3" - "CFLAGS=-DTCL_NO_DEPRECATED=1" - "--disable-shared" - "--enable-symbols" diff --git a/generic/tcl.h b/generic/tcl.h index b82cf0a..742a548 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2128,7 +2128,11 @@ typedef struct Tcl_EncodingType { */ #ifndef TCL_UTF_MAX -#define TCL_UTF_MAX 3 +# ifdef BUILD_tcl +# define TCL_UTF_MAX 4 +# else +# define TCL_UTF_MAX 3 +# endif #endif /* -- cgit v0.12 From 3ab43f6d833639cadd33faec67aefb712f5a1798 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 11 Mar 2022 22:11:45 +0000 Subject: Handle TclUniCharNcmp() --- generic/tcl.decls | 2 +- generic/tclCmdMZ.c | 4 ++-- generic/tclDecls.h | 6 +++--- generic/tclInt.h | 28 ++++++++++------------------ generic/tclStringObj.c | 2 +- generic/tclStubInit.c | 15 --------------- generic/tclUtf.c | 34 +++++++++++++++++++++++++++++----- 7 files changed, 46 insertions(+), 45 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index 8e21b1d..4a637ad 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1248,7 +1248,7 @@ declare 352 { int Tcl_Char16Len(const unsigned short *uniStr) } declare 353 {deprecated {Use Tcl_UtfNcmp}} { - int Tcl_UniCharNcmp(const Tcl_UniChar *ucs, const Tcl_UniChar *uct, + int Tcl_UniCharNcmp(const unsigned short *ucs, const unsigned short *uct, unsigned long numChars) } declare 354 { diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index f394035..5e95217 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -604,7 +604,7 @@ Tcl_RegsubObjCmd( numMatches = 0; nocase = (cflags & TCL_REG_NOCASE); - strCmpFn = nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp; + strCmpFn = nocase ? Tcl_UniCharNcasecmp : TclUniCharNcmp; wsrc = Tcl_GetUnicodeFromObj(objv[0], &slen); wstring = Tcl_GetUnicodeFromObj(objv[1], &wlen); @@ -2070,7 +2070,7 @@ StringMapCmd( } end = ustring1 + length1; - strCmpFn = (nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp); + strCmpFn = (nocase ? Tcl_UniCharNcasecmp : TclUniCharNcmp); /* * Force result to be Unicode diff --git a/generic/tclDecls.h b/generic/tclDecls.h index e84a7e8..49ce21d 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1066,8 +1066,8 @@ EXTERN int Tcl_UniCharIsWordChar(int ch); EXTERN int Tcl_Char16Len(const unsigned short *uniStr); /* 353 */ TCL_DEPRECATED("Use Tcl_UtfNcmp") -int Tcl_UniCharNcmp(const Tcl_UniChar *ucs, - const Tcl_UniChar *uct, +int Tcl_UniCharNcmp(const unsigned short *ucs, + const unsigned short *uct, unsigned long numChars); /* 354 */ EXTERN char * Tcl_Char16ToUtfDString(const unsigned short *uniStr, @@ -2345,7 +2345,7 @@ typedef struct TclStubs { int (*tcl_UniCharIsUpper) (int ch); /* 350 */ int (*tcl_UniCharIsWordChar) (int ch); /* 351 */ int (*tcl_Char16Len) (const unsigned short *uniStr); /* 352 */ - TCL_DEPRECATED_API("Use Tcl_UtfNcmp") int (*tcl_UniCharNcmp) (const Tcl_UniChar *ucs, const Tcl_UniChar *uct, unsigned long numChars); /* 353 */ + TCL_DEPRECATED_API("Use Tcl_UtfNcmp") int (*tcl_UniCharNcmp) (const unsigned short *ucs, const unsigned short *uct, unsigned long numChars); /* 353 */ char * (*tcl_Char16ToUtfDString) (const unsigned short *uniStr, int uniLength, Tcl_DString *dsPtr); /* 354 */ unsigned short * (*tcl_UtfToChar16DString) (const char *src, int length, Tcl_DString *dsPtr); /* 355 */ Tcl_RegExp (*tcl_GetRegExpFromObj) (Tcl_Interp *interp, Tcl_Obj *patObj, int flags); /* 356 */ diff --git a/generic/tclInt.h b/generic/tclInt.h index 2873ad3..a0f9622 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3319,6 +3319,16 @@ MODULE_SCOPE void TclErrorStackResetIf(Tcl_Interp *interp, MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); +MODULE_SCOPE int *TclGetUnicode(Tcl_Obj *); +MODULE_SCOPE int *TclGetUnicodeFromObj_(Tcl_Obj *, int *); +MODULE_SCOPE Tcl_Obj *TclNewUnicodeObj(const int *, int); +MODULE_SCOPE void TclSetUnicodeObj(Tcl_Obj *, const int *, int); +MODULE_SCOPE void TclAppendUnicodeToObj(Tcl_Obj *, const int *, int); +MODULE_SCOPE int TclUniCharNcasecmp(const int *, const int *, unsigned long); +MODULE_SCOPE int TclUniCharCaseMatch(const int *, const int *, int); +MODULE_SCOPE int TclUniCharNcmp(const int *, const int *, unsigned long); + + /* * Many parsing tasks need a common definition of whitespace. * Use this routine and macro to achieve that and place @@ -4777,24 +4787,6 @@ MODULE_SCOPE int TclIsPureByteArray(Tcl_Obj *objPtr); /* *---------------------------------------------------------------- - * Macro used by the Tcl core to compare Unicode strings. On big-endian - * systems we can use the more efficient memcmp, but this would not be - * lexically correct on little-endian systems. The ANSI C "prototype" for - * this macro is: - * - * MODULE_SCOPE int TclUniCharNcmp(const Tcl_UniChar *cs, - * const Tcl_UniChar *ct, unsigned long n); - *---------------------------------------------------------------- - */ - -#if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) -# define TclUniCharNcmp(cs,ct,n) memcmp((cs),(ct),(n)*sizeof(Tcl_UniChar)) -#else /* !WORDS_BIGENDIAN */ -# define TclUniCharNcmp Tcl_UniCharNcmp -#endif /* WORDS_BIGENDIAN */ - -/* - *---------------------------------------------------------------- * Macro used by the Tcl core to increment a namespace's export epoch * counter. The ANSI C "prototype" for this macro is: * diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 7d4aef3..521d13b 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -3492,7 +3492,7 @@ TclStringCmp( s1len *= sizeof(Tcl_UniChar); s2len *= sizeof(Tcl_UniChar); } else { - memCmpFn = (memCmpFn_t)(void *)Tcl_UniCharNcmp; + memCmpFn = (memCmpFn_t)(void *)TclUniCharNcmp; } } } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 1aec652..f9430cb 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -81,21 +81,6 @@ #define TclWinConvertError (void (*)(DWORD))(void *)Tcl_WinConvertError #endif - -#if TCL_UTF_MAX > 3 -static void uniCodePanic(void) { - Tcl_Panic("This extension uses a deprecated function, not available now: Tcl is compiled with -DTCL_UTF_MAX==%d", TCL_UTF_MAX); -} -# define Tcl_GetUnicode (int *(*)(Tcl_Obj *))(void *)uniCodePanic -# define Tcl_GetUnicodeFromObj (int *(*)(Tcl_Obj *, Tcl_UniChar *))(void *)uniCodePanic -# define Tcl_NewUnicodeObj (Tcl_Obj *(*)(const int *, Tcl_UniChar))(void *)uniCodePanic -# define Tcl_SetUnicodeObj (void(*)(Tcl_Obj *, const Tcl_UniChar *, int))(void *)uniCodePanic -# define Tcl_AppendUnicodeToObj (void(*)(Tcl_Obj *, const Tcl_UniChar *, int))(void *)uniCodePanic -# define Tcl_UniCharNcasecmp (int(*)(const Tcl_UniChar *, const Tcl_UniChar *, unsigned long))(void *)uniCodePanic -# define Tcl_UniCharCaseMatch (int(*)(const Tcl_UniChar *, const Tcl_UniChar *, int))(void *)uniCodePanic -# define Tcl_UniCharNcmp (int(*)(const Tcl_UniChar *, const Tcl_UniChar *, unsigned long))(void *)uniCodePanic -#endif - #define TclUtfCharComplete UtfCharComplete #define TclUtfNext UtfNext #define TclUtfPrev UtfPrev diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 169f240..e024b65 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1849,9 +1849,36 @@ Tcl_UniCharLen( */ int +TclUniCharNcmp( + const int *ucs, /* Unicode string to compare to uct. */ + const int *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ +{ +#if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) + /* + * We are definitely on a big-endian machine; memcmp() is safe + */ + + return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); + +#else /* !WORDS_BIGENDIAN */ + /* + * We can't simply call memcmp() because that is not lexically correct. + */ + + for ( ; numChars != 0; ucs++, uct++, numChars--) { + if (*ucs != *uct) { + return (*ucs - *uct); + } + } + return 0; +#endif /* WORDS_BIGENDIAN */ +} + +int Tcl_UniCharNcmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + const unsigned short *ucs, /* Unicode string to compare to uct. */ + const unsigned short *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { #if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) @@ -1868,21 +1895,18 @@ Tcl_UniCharNcmp( for ( ; numChars != 0; ucs++, uct++, numChars--) { if (*ucs != *uct) { -#if TCL_UTF_MAX < 4 /* special case for handling upper surrogates */ if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) { return 1; } else if (((*uct & 0xFC00) == 0xD800)) { return -1; } -#endif return (*ucs - *uct); } } return 0; #endif /* WORDS_BIGENDIAN */ } - /* *---------------------------------------------------------------------- * -- cgit v0.12 From 5f4d25842d6e58ab6e37998a654b80487ae80c29 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 11 Mar 2022 22:43:51 +0000 Subject: 2 more functions --- generic/tcl.decls | 6 +- generic/tclCmdMZ.c | 4 +- generic/tclDecls.h | 12 +-- generic/tclStringObj.c | 2 +- generic/tclUtf.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 208 insertions(+), 22 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index 4a637ad..5a32bfd 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1483,12 +1483,12 @@ declare 418 { int Tcl_IsChannelExisting(const char *channelName) } declare 419 {deprecated {Use Tcl_UtfNcasecmp}} { - int Tcl_UniCharNcasecmp(const Tcl_UniChar *ucs, const Tcl_UniChar *uct, + int Tcl_UniCharNcasecmp(const unsigned short *ucs, const unsigned short *uct, unsigned long numChars) } declare 420 {deprecated {Use Tcl_StringCaseMatch}} { - int Tcl_UniCharCaseMatch(const Tcl_UniChar *uniStr, - const Tcl_UniChar *uniPattern, int nocase) + int Tcl_UniCharCaseMatch(const unsigned short *uniStr, + const unsigned short *uniPattern, int nocase) } declare 421 { Tcl_HashEntry *Tcl_FindHashEntry(Tcl_HashTable *tablePtr, const void *key) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 5e95217..b178085 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -604,7 +604,7 @@ Tcl_RegsubObjCmd( numMatches = 0; nocase = (cflags & TCL_REG_NOCASE); - strCmpFn = nocase ? Tcl_UniCharNcasecmp : TclUniCharNcmp; + strCmpFn = nocase ? TclUniCharNcasecmp : TclUniCharNcmp; wsrc = Tcl_GetUnicodeFromObj(objv[0], &slen); wstring = Tcl_GetUnicodeFromObj(objv[1], &wlen); @@ -2070,7 +2070,7 @@ StringMapCmd( } end = ustring1 + length1; - strCmpFn = (nocase ? Tcl_UniCharNcasecmp : TclUniCharNcmp); + strCmpFn = (nocase ? TclUniCharNcasecmp : TclUniCharNcmp); /* * Force result to be Unicode diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 49ce21d..0a336f1 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1256,13 +1256,13 @@ EXTERN void Tcl_ClearChannelHandlers(Tcl_Channel channel); EXTERN int Tcl_IsChannelExisting(const char *channelName); /* 419 */ TCL_DEPRECATED("Use Tcl_UtfNcasecmp") -int Tcl_UniCharNcasecmp(const Tcl_UniChar *ucs, - const Tcl_UniChar *uct, +int Tcl_UniCharNcasecmp(const unsigned short *ucs, + const unsigned short *uct, unsigned long numChars); /* 420 */ TCL_DEPRECATED("Use Tcl_StringCaseMatch") -int Tcl_UniCharCaseMatch(const Tcl_UniChar *uniStr, - const Tcl_UniChar *uniPattern, int nocase); +int Tcl_UniCharCaseMatch(const unsigned short *uniStr, + const unsigned short *uniPattern, int nocase); /* 421 */ EXTERN Tcl_HashEntry * Tcl_FindHashEntry(Tcl_HashTable *tablePtr, const void *key); @@ -2411,8 +2411,8 @@ typedef struct TclStubs { void (*tcl_SpliceChannel) (Tcl_Channel channel); /* 416 */ void (*tcl_ClearChannelHandlers) (Tcl_Channel channel); /* 417 */ int (*tcl_IsChannelExisting) (const char *channelName); /* 418 */ - TCL_DEPRECATED_API("Use Tcl_UtfNcasecmp") int (*tcl_UniCharNcasecmp) (const Tcl_UniChar *ucs, const Tcl_UniChar *uct, unsigned long numChars); /* 419 */ - TCL_DEPRECATED_API("Use Tcl_StringCaseMatch") int (*tcl_UniCharCaseMatch) (const Tcl_UniChar *uniStr, const Tcl_UniChar *uniPattern, int nocase); /* 420 */ + TCL_DEPRECATED_API("Use Tcl_UtfNcasecmp") int (*tcl_UniCharNcasecmp) (const unsigned short *ucs, const unsigned short *uct, unsigned long numChars); /* 419 */ + TCL_DEPRECATED_API("Use Tcl_StringCaseMatch") int (*tcl_UniCharCaseMatch) (const unsigned short *uniStr, const unsigned short *uniPattern, int nocase); /* 420 */ Tcl_HashEntry * (*tcl_FindHashEntry) (Tcl_HashTable *tablePtr, const void *key); /* 421 */ Tcl_HashEntry * (*tcl_CreateHashEntry) (Tcl_HashTable *tablePtr, const void *key, int *newPtr); /* 422 */ void (*tcl_InitCustomHashTable) (Tcl_HashTable *tablePtr, int keyType, const Tcl_HashKeyType *typePtr); /* 423 */ diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 521d13b..4ab595f 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -3467,7 +3467,7 @@ TclStringCmp( if (nocase) { s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, &s1len); s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, &s2len); - memCmpFn = (memCmpFn_t)(void *)Tcl_UniCharNcasecmp; + memCmpFn = (memCmpFn_t)(void *)TclUniCharNcasecmp; } else { s1len = Tcl_GetCharLength(value1Ptr); s2len = Tcl_GetCharLength(value2Ptr); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e024b65..68a0e32 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1927,30 +1927,48 @@ Tcl_UniCharNcmp( int Tcl_UniCharNcasecmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + const unsigned short *ucs, /* Unicode string to compare to uct. */ + const unsigned short *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { for ( ; numChars != 0; numChars--, ucs++, uct++) { if (*ucs != *uct) { - Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); - Tcl_UniChar lct = Tcl_UniCharToLower(*uct); + unsigned short lcs = Tcl_UniCharToLower(*ucs); + unsigned short lct = Tcl_UniCharToLower(*uct); if (lcs != lct) { -#if TCL_UTF_MAX < 4 /* special case for handling upper surrogates */ if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) { return 1; } else if (((lct & 0xFC00) == 0xD800)) { return -1; } -#endif return (lcs - lct); } } } return 0; } + +int +TclUniCharNcasecmp( + const int *ucs, /* Unicode string to compare to uct. */ + const int *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ +{ + for ( ; numChars != 0; numChars--, ucs++, uct++) { + if (*ucs != *uct) { + int lcs = Tcl_UniCharToLower(*ucs); + int lct = Tcl_UniCharToLower(*uct); + + if (lcs != lct) { + return (lcs - lct); + } + } + } + return 0; +} + /* *---------------------------------------------------------------------- @@ -2314,14 +2332,181 @@ Tcl_UniCharIsWordChar( */ int +TclUniCharCaseMatch( + const int *uniStr, /* Unicode String. */ + const int *uniPattern, + /* Pattern, which may contain special + * characters. */ + int nocase) /* 0 for case sensitive, 1 for insensitive */ +{ + int ch1 = 0, p; + + while (1) { + p = *uniPattern; + + /* + * See if we're at the end of both the pattern and the string. If so, + * we succeeded. If we're at the end of the pattern but not at the end + * of the string, we failed. + */ + + if (p == 0) { + return (*uniStr == 0); + } + if ((*uniStr == 0) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + /* + * Skip all successive *'s in the pattern + */ + + while (*(++uniPattern) == '*') { + /* empty body */ + } + p = *uniPattern; + if (p == 0) { + return 1; + } + if (nocase) { + p = Tcl_UniCharToLower(p); + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character + */ + + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while (*uniStr && (p != *uniStr) + && (p != Tcl_UniCharToLower(*uniStr))) { + uniStr++; + } + } else { + while (*uniStr && (p != *uniStr)) { + uniStr++; + } + } + } + if (TclUniCharCaseMatch(uniStr, uniPattern, nocase)) { + return 1; + } + if (*uniStr == 0) { + return 0; + } + uniStr++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches any + * single character. + */ + + if (p == '?') { + uniPattern++; + uniStr++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed by a + * list of characters that are acceptable, or by a range (two + * characters separated by "-"). + */ + + if (p == '[') { + int startChar, endChar; + + uniPattern++; + ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); + uniStr++; + while (1) { + if ((*uniPattern == ']') || (*uniPattern == 0)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + : *uniPattern); + uniPattern++; + if (*uniPattern == '-') { + uniPattern++; + if (*uniPattern == 0) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + : *uniPattern); + uniPattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*uniPattern != ']') { + if (*uniPattern == 0) { + uniPattern--; + break; + } + uniPattern++; + } + uniPattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' so we + * do exact matching on the character that follows. + */ + + if (p == '\\') { + if (*(++uniPattern) == '\0') { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next bytes of + * each string match. + */ + + if (nocase) { + if (Tcl_UniCharToLower(*uniStr) != + Tcl_UniCharToLower(*uniPattern)) { + return 0; + } + } else if (*uniStr != *uniPattern) { + return 0; + } + uniStr++; + uniPattern++; + } +} + +int Tcl_UniCharCaseMatch( - const Tcl_UniChar *uniStr, /* Unicode String. */ - const Tcl_UniChar *uniPattern, + const unsigned short *uniStr, /* Unicode String. */ + const unsigned short *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - Tcl_UniChar ch1 = 0, p; + unsigned short ch1 = 0, p; while (1) { p = *uniPattern; @@ -2409,7 +2594,7 @@ Tcl_UniCharCaseMatch( */ if (p == '[') { - Tcl_UniChar startChar, endChar; + unsigned short startChar, endChar; uniPattern++; ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); @@ -2480,6 +2665,7 @@ Tcl_UniCharCaseMatch( } } + /* *---------------------------------------------------------------------- * -- cgit v0.12 From e709f7c5392a8506414be6727a3f8b6bdd7fbae4 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 12 Mar 2022 22:30:12 +0000 Subject: More progress --- .github/workflows/linux-build.yml | 2 +- .github/workflows/win-build.yml | 3 +- generic/tcl.decls | 8 ++-- generic/tclCmdMZ.c | 10 ++--- generic/tclDecls.h | 16 +++---- generic/tclExecute.c | 2 +- generic/tclInt.h | 2 - generic/tclStringObj.c | 88 ++++++++++++++++++++------------------- generic/tclTestObj.c | 4 +- 9 files changed, 68 insertions(+), 67 deletions(-) diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index cb93bd4..b410aab 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -7,7 +7,7 @@ jobs: matrix: cfgopt: - "" - - "CFLAGS=-DTCL_UTF_MAX=3" + - "CFLAGS=-DTCL_UTF_MAX=4" - "CFLAGS=-DTCL_NO_DEPRECATED=1" - "--disable-shared" - "--enable-symbols" diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml index 0ff696c..a8019ee 100644 --- a/.github/workflows/win-build.yml +++ b/.github/workflows/win-build.yml @@ -13,6 +13,7 @@ jobs: matrix: cfgopt: - "" + - "OPTS=utfmax" - "CHECKS=nodep" - "OPTS=static" - "OPTS=symbols" @@ -51,7 +52,7 @@ jobs: matrix: cfgopt: - "" - - "CFLAGS=-DTCL_UTF_MAX=3" + - "CFLAGS=-DTCL_UTF_MAX=4" - "CFLAGS=-DTCL_NO_DEPRECATED=1" - "--disable-shared" - "--enable-symbols" diff --git a/generic/tcl.decls b/generic/tcl.decls index 5a32bfd..9c83e81 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1338,10 +1338,10 @@ declare 377 { void Tcl_RegExpGetInfo(Tcl_RegExp regexp, Tcl_RegExpInfo *infoPtr) } declare 378 { - Tcl_Obj *Tcl_NewUnicodeObj(const Tcl_UniChar *unicode, int numChars) + Tcl_Obj *Tcl_NewUnicodeObj(const unsigned char *unicode, int numChars) } declare 379 { - void Tcl_SetUnicodeObj(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, + void Tcl_SetUnicodeObj(Tcl_Obj *objPtr, const unsigned short *unicode, int numChars) } declare 380 { @@ -1351,7 +1351,7 @@ declare 381 { int Tcl_GetUniChar(Tcl_Obj *objPtr, int index) } declare 382 {deprecated {No longer in use, changed to macro}} { - Tcl_UniChar *Tcl_GetUnicode(Tcl_Obj *objPtr) + unsigned short *Tcl_GetUnicode(Tcl_Obj *objPtr) } declare 383 { Tcl_Obj *Tcl_GetRange(Tcl_Obj *objPtr, int first, int last) @@ -2417,7 +2417,7 @@ declare 651 { char *TclGetStringFromObj(Tcl_Obj *objPtr, size_t *lengthPtr) } declare 652 { - Tcl_UniChar *TclGetUnicodeFromObj(Tcl_Obj *objPtr, size_t *lengthPtr) + unsigned short *TclGetUnicodeFromObj(Tcl_Obj *objPtr, size_t *lengthPtr) } declare 653 { unsigned char *TclGetByteArrayFromObj(Tcl_Obj *objPtr, size_t *numBytesPtr) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index b178085..11b383f 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -619,7 +619,7 @@ Tcl_RegsubObjCmd( */ if (wstring < wend) { - resultPtr = Tcl_NewUnicodeObj(wstring, 0); + resultPtr = TclNewUnicodeObj(wstring, 0); Tcl_IncrRefCount(resultPtr); for (; wstring < wend; wstring++) { Tcl_AppendUnicodeToObj(resultPtr, wsubspec, wsublen); @@ -636,7 +636,7 @@ Tcl_RegsubObjCmd( (slen==1 || (strCmpFn(wstring, wsrc, (unsigned long) slen) == 0))) { if (numMatches == 0) { - resultPtr = Tcl_NewUnicodeObj(wstring, 0); + resultPtr = TclNewUnicodeObj(wstring, 0); Tcl_IncrRefCount(resultPtr); } if (p != wstring) { @@ -742,7 +742,7 @@ Tcl_RegsubObjCmd( break; } if (numMatches == 0) { - resultPtr = Tcl_NewUnicodeObj(wstring, 0); + resultPtr = TclNewUnicodeObj(wstring, 0); Tcl_IncrRefCount(resultPtr); if (offset > 0) { /* @@ -785,7 +785,7 @@ Tcl_RegsubObjCmd( subStart = info.matches[idx].start; subEnd = info.matches[idx].end; if ((subStart >= 0) && (subEnd >= 0)) { - args[idx + numParts] = Tcl_NewUnicodeObj( + args[idx + numParts] = TclNewUnicodeObj( wstring + offset + subStart, subEnd - subStart); } else { TclNewObj(args[idx + numParts]); @@ -2076,7 +2076,7 @@ StringMapCmd( * Force result to be Unicode */ - resultPtr = Tcl_NewUnicodeObj(ustring1, 0); + resultPtr = TclNewUnicodeObj(ustring1, 0); if (mapElemc == 2) { /* diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 0a336f1..4217e9c 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1142,18 +1142,18 @@ EXTERN int Tcl_RegExpExecObj(Tcl_Interp *interp, EXTERN void Tcl_RegExpGetInfo(Tcl_RegExp regexp, Tcl_RegExpInfo *infoPtr); /* 378 */ -EXTERN Tcl_Obj * Tcl_NewUnicodeObj(const Tcl_UniChar *unicode, +EXTERN Tcl_Obj * Tcl_NewUnicodeObj(const unsigned char *unicode, int numChars); /* 379 */ EXTERN void Tcl_SetUnicodeObj(Tcl_Obj *objPtr, - const Tcl_UniChar *unicode, int numChars); + const unsigned short *unicode, int numChars); /* 380 */ EXTERN int Tcl_GetCharLength(Tcl_Obj *objPtr); /* 381 */ EXTERN int Tcl_GetUniChar(Tcl_Obj *objPtr, int index); /* 382 */ TCL_DEPRECATED("No longer in use, changed to macro") -Tcl_UniChar * Tcl_GetUnicode(Tcl_Obj *objPtr); +unsigned short * Tcl_GetUnicode(Tcl_Obj *objPtr); /* 383 */ EXTERN Tcl_Obj * Tcl_GetRange(Tcl_Obj *objPtr, int first, int last); /* 384 */ @@ -1930,7 +1930,7 @@ EXTERN unsigned char * Tcl_GetBytesFromObj(Tcl_Interp *interp, EXTERN char * TclGetStringFromObj(Tcl_Obj *objPtr, size_t *lengthPtr); /* 652 */ -EXTERN Tcl_UniChar * TclGetUnicodeFromObj(Tcl_Obj *objPtr, +EXTERN unsigned short * TclGetUnicodeFromObj(Tcl_Obj *objPtr, size_t *lengthPtr); /* 653 */ EXTERN unsigned char * TclGetByteArrayFromObj(Tcl_Obj *objPtr, @@ -2370,11 +2370,11 @@ typedef struct TclStubs { int (*tcl_UniCharIsPunct) (int ch); /* 375 */ int (*tcl_RegExpExecObj) (Tcl_Interp *interp, Tcl_RegExp regexp, Tcl_Obj *textObj, int offset, int nmatches, int flags); /* 376 */ void (*tcl_RegExpGetInfo) (Tcl_RegExp regexp, Tcl_RegExpInfo *infoPtr); /* 377 */ - Tcl_Obj * (*tcl_NewUnicodeObj) (const Tcl_UniChar *unicode, int numChars); /* 378 */ - void (*tcl_SetUnicodeObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); /* 379 */ + Tcl_Obj * (*tcl_NewUnicodeObj) (const unsigned char *unicode, int numChars); /* 378 */ + void (*tcl_SetUnicodeObj) (Tcl_Obj *objPtr, const unsigned short *unicode, int numChars); /* 379 */ int (*tcl_GetCharLength) (Tcl_Obj *objPtr); /* 380 */ int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ - TCL_DEPRECATED_API("No longer in use, changed to macro") Tcl_UniChar * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */ + TCL_DEPRECATED_API("No longer in use, changed to macro") unsigned short * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */ Tcl_Obj * (*tcl_GetRange) (Tcl_Obj *objPtr, int first, int last); /* 383 */ TCL_DEPRECATED_API("Use Tcl_AppendStringsToObj") void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int length); /* 384 */ int (*tcl_RegExpMatchObj) (Tcl_Interp *interp, Tcl_Obj *textObj, Tcl_Obj *patternObj); /* 385 */ @@ -2644,7 +2644,7 @@ typedef struct TclStubs { unsigned char * (*tclGetBytesFromObj) (Tcl_Interp *interp, Tcl_Obj *objPtr, int *numBytesPtr); /* 649 */ unsigned char * (*tcl_GetBytesFromObj) (Tcl_Interp *interp, Tcl_Obj *objPtr, size_t *numBytesPtr); /* 650 */ char * (*tclGetStringFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 651 */ - Tcl_UniChar * (*tclGetUnicodeFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 652 */ + unsigned short * (*tclGetUnicodeFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 652 */ unsigned char * (*tclGetByteArrayFromObj) (Tcl_Obj *objPtr, size_t *numBytesPtr); /* 653 */ int (*tcl_UtfCharComplete) (const char *src, int length); /* 654 */ const char * (*tcl_UtfNext) (const char *src); /* 655 */ diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 0279218..7cc002f 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5514,7 +5514,7 @@ TEBCresume( } ustring3 = Tcl_GetUnicodeFromObj(value3Ptr, &length3); - objResultPtr = Tcl_NewUnicodeObj(ustring1, 0); + objResultPtr = TclNewUnicodeObj(ustring1, 0); p = ustring1; end = ustring1 + length; for (; ustring1 < end; ustring1++) { diff --git a/generic/tclInt.h b/generic/tclInt.h index a0f9622..ed607cd 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3319,10 +3319,8 @@ MODULE_SCOPE void TclErrorStackResetIf(Tcl_Interp *interp, MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); -MODULE_SCOPE int *TclGetUnicode(Tcl_Obj *); MODULE_SCOPE int *TclGetUnicodeFromObj_(Tcl_Obj *, int *); MODULE_SCOPE Tcl_Obj *TclNewUnicodeObj(const int *, int); -MODULE_SCOPE void TclSetUnicodeObj(Tcl_Obj *, const int *, int); MODULE_SCOPE void TclAppendUnicodeToObj(Tcl_Obj *, const int *, int); MODULE_SCOPE int TclUniCharNcasecmp(const int *, const int *, unsigned long); MODULE_SCOPE int TclUniCharCaseMatch(const int *, const int *, int); diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 4ab595f..972eef7 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -374,8 +374,8 @@ Tcl_DbNewStringObj( */ Tcl_Obj * -Tcl_NewUnicodeObj( - const Tcl_UniChar *unicode, /* The unicode string used to initialize the +TclNewUnicodeObj( + const int *unicode, /* The unicode string used to initialize the * new object. */ int numChars) /* Number of characters in the unicode * string. */ @@ -387,6 +387,22 @@ Tcl_NewUnicodeObj( return objPtr; } +Tcl_Obj * +Tcl_NewUnicodeObj( + const unsigned char *unicode, /* The unicode string used to initialize the + * new object. */ + int numChars) /* Number of characters in the unicode + * string. */ +{ + Tcl_Obj *objPtr; + (void)unicode; + (void)numChars; + + TclNewObj(objPtr); + /* TODO JN */ + return objPtr; +} + /* *---------------------------------------------------------------------- * @@ -612,12 +628,12 @@ Tcl_GetUniChar( #undef Tcl_GetUnicodeFromObj #ifndef TCL_NO_DEPRECATED #undef Tcl_GetUnicode -Tcl_UniChar * +unsigned short * Tcl_GetUnicode( Tcl_Obj *objPtr) /* The object to find the unicode string * for. */ { - return Tcl_GetUnicodeFromObj(objPtr, (int *)NULL); + return TclGetUnicodeFromObj(objPtr, NULL); } #endif /* TCL_NO_DEPRECATED */ @@ -663,7 +679,7 @@ Tcl_GetUnicodeFromObj( } return stringPtr->unicode; } -Tcl_UniChar * +unsigned short * TclGetUnicodeFromObj( Tcl_Obj *objPtr, /* The object to find the unicode string * for. */ @@ -671,24 +687,10 @@ TclGetUnicodeFromObj( * rep's unichar length should be stored. If * NULL, no length is stored. */ { - String *stringPtr; - - SetStringFromAny(NULL, objPtr); - stringPtr = GET_STRING(objPtr); - - if (stringPtr->hasUnicode == 0) { - FillUnicodeRep(objPtr); - stringPtr = GET_STRING(objPtr); - } - - if (lengthPtr != NULL) { -#if TCL_MAJOR_VERSION > 8 - *lengthPtr = stringPtr->numChars; -#else - *lengthPtr = ((size_t)(unsigned)(stringPtr->numChars + 1)) - 1; -#endif - } - return stringPtr->unicode; + (void)objPtr; + (void)lengthPtr; + /* TODO JN */ + return NULL; } /* @@ -797,7 +799,7 @@ Tcl_GetRange( ++last; } #endif - return Tcl_NewUnicodeObj(stringPtr->unicode + first, last - first + 1); + return TclNewUnicodeObj(stringPtr->unicode + first, last - first + 1); } /* @@ -1092,16 +1094,16 @@ Tcl_AttemptSetObjLength( void Tcl_SetUnicodeObj( Tcl_Obj *objPtr, /* The object to set the string of. */ - const Tcl_UniChar *unicode, /* The unicode string used to initialize the + const unsigned short *unicode, /* The unicode string used to initialize the * object. */ int numChars) /* Number of characters in the unicode * string. */ { - if (Tcl_IsShared(objPtr)) { - Tcl_Panic("%s called with shared object", "Tcl_SetUnicodeObj"); - } - TclFreeInternalRep(objPtr); - SetUnicodeObj(objPtr, unicode, numChars); + (void)objPtr; + (void)unicode; + (void)numChars; + + /* TODO JN */ } static int @@ -1228,7 +1230,7 @@ Tcl_AppendLimitedToObj( /* If appended string starts with a continuation byte or a lower surrogate, * force objPtr to unicode representation. See [7f1162a867] */ if (bytes && ISCONTINUATION(bytes)) { - Tcl_GetUnicode(objPtr); + Tcl_GetUnicodeFromObj(objPtr, NULL); stringPtr = GET_STRING(objPtr); } if (stringPtr->hasUnicode && stringPtr->numChars > 0) { @@ -1432,7 +1434,7 @@ Tcl_AppendObjToObj( * force objPtr to unicode representation. See [7f1162a867] * This fixes append-3.4, append-3.7 and utf-1.18 testcases. */ if (ISCONTINUATION(TclGetString(appendObjPtr))) { - Tcl_GetUnicode(objPtr); + Tcl_GetUnicodeFromObj(objPtr, NULL); stringPtr = GET_STRING(objPtr); } /* @@ -2976,7 +2978,7 @@ TclStringRepeat( */ if (!inPlace || Tcl_IsShared(objPtr)) { - objResultPtr = Tcl_NewUnicodeObj(Tcl_GetUnicode(objPtr), length); + objResultPtr = TclNewUnicodeObj(Tcl_GetUnicodeFromObj(objPtr, NULL), length); } else { TclInvalidateStringRep(objPtr); objResultPtr = objPtr; @@ -2997,7 +2999,7 @@ TclStringRepeat( Tcl_AppendObjToObj(objResultPtr, objResultPtr); done *= 2; } - Tcl_AppendUnicodeToObj(objResultPtr, Tcl_GetUnicode(objResultPtr), + Tcl_AppendUnicodeToObj(objResultPtr, Tcl_GetUnicodeFromObj(objResultPtr, NULL), (count - done) * length); } else { /* @@ -3318,12 +3320,12 @@ TclStringCat( } return NULL; } - dst = Tcl_GetUnicode(objResultPtr) + start; + dst = Tcl_GetUnicodeFromObj(objResultPtr, NULL) + start; } else { Tcl_UniChar ch = 0; /* Ugly interface! No scheme to init array size. */ - objResultPtr = Tcl_NewUnicodeObj(&ch, 0); /* PANIC? */ + objResultPtr = TclNewUnicodeObj(&ch, 0); /* PANIC? */ if (0 == Tcl_AttemptSetObjLength(objResultPtr, length)) { Tcl_DecrRefCount(objResultPtr); if (interp) { @@ -3335,7 +3337,7 @@ TclStringCat( } return NULL; } - dst = Tcl_GetUnicode(objResultPtr); + dst = Tcl_GetUnicodeFromObj(objResultPtr, NULL); } while (objc--) { Tcl_Obj *objPtr = *objv++; @@ -3479,8 +3481,8 @@ TclStringCmp( s2 = value2Ptr->bytes; memCmpFn = memcmp; } else { - s1 = (char *) Tcl_GetUnicode(value1Ptr); - s2 = (char *) Tcl_GetUnicode(value2Ptr); + s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, NULL); + s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, NULL); if ( #if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) 1 @@ -3854,7 +3856,7 @@ TclStringReverse( stringPtr = GET_STRING(objPtr); if (stringPtr->hasUnicode) { - Tcl_UniChar *from = Tcl_GetUnicode(objPtr); + Tcl_UniChar *from = Tcl_GetUnicodeFromObj(objPtr, NULL); stringPtr = GET_STRING(objPtr); Tcl_UniChar *src = from + stringPtr->numChars; Tcl_UniChar *to; @@ -3865,9 +3867,9 @@ TclStringReverse( * Tcl_SetObjLength into growing the unicode rep buffer. */ - objPtr = Tcl_NewUnicodeObj(&ch, 1); + objPtr = TclNewUnicodeObj(&ch, 1); Tcl_SetObjLength(objPtr, stringPtr->numChars); - to = Tcl_GetUnicode(objPtr); + to = Tcl_GetUnicodeFromObj(objPtr, NULL); stringPtr = GET_STRING(objPtr); while (--src >= from) { #if TCL_UTF_MAX < 4 @@ -4101,7 +4103,7 @@ TclStringReplace( /* TODO: Is there an in-place option worth pursuing here? */ - result = Tcl_NewUnicodeObj(ustring, first); + result = TclNewUnicodeObj(ustring, first); if (insertPtr) { Tcl_AppendObjToObj(result, insertPtr); } diff --git a/generic/tclTestObj.c b/generic/tclTestObj.c index a235002..9814cfe 100644 --- a/generic/tclTestObj.c +++ b/generic/tclTestObj.c @@ -13,7 +13,7 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. */ - +#undef BUILD_tcl #ifndef USE_TCL_STUBS # define USE_TCL_STUBS #endif @@ -1153,7 +1153,7 @@ TeststringobjCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - Tcl_UniChar *unicode; + unsigned short *unicode; size_t varIndex; int size, option, i; Tcl_WideInt length; -- cgit v0.12 From d05ac0b98bbc039639b7784e313620b8d3757ddf Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 12 Mar 2022 23:21:12 +0000 Subject: Start defining "utf32string" type --- generic/tcl.decls | 4 +- generic/tclCmdMZ.c | 62 +++++++++++----------- generic/tclDecls.h | 8 +-- generic/tclExecute.c | 18 +++---- generic/tclRegexp.c | 2 +- generic/tclStringObj.c | 139 ++++++++++++++++++++++++++++++++++--------------- generic/tclTestObj.c | 1 + generic/tclUtil.c | 4 +- tests/string.test | 2 +- 9 files changed, 149 insertions(+), 91 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index 9c83e81..6dbb457 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1357,7 +1357,7 @@ declare 383 { Tcl_Obj *Tcl_GetRange(Tcl_Obj *objPtr, int first, int last) } declare 384 {deprecated {Use Tcl_AppendStringsToObj}} { - void Tcl_AppendUnicodeToObj(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, + void Tcl_AppendUnicodeToObj(Tcl_Obj *objPtr, const unsigned short *unicode, int length) } declare 385 { @@ -1541,7 +1541,7 @@ declare 433 { # introduced in 8.4a3 declare 434 { - Tcl_UniChar *Tcl_GetUnicodeFromObj(Tcl_Obj *objPtr, int *lengthPtr) + unsigned short *Tcl_GetUnicodeFromObj(Tcl_Obj *objPtr, int *lengthPtr) } # TIP#15 (math function introspection) dkf diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 11b383f..db4002a 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -606,9 +606,9 @@ Tcl_RegsubObjCmd( nocase = (cflags & TCL_REG_NOCASE); strCmpFn = nocase ? TclUniCharNcasecmp : TclUniCharNcmp; - wsrc = Tcl_GetUnicodeFromObj(objv[0], &slen); - wstring = Tcl_GetUnicodeFromObj(objv[1], &wlen); - wsubspec = Tcl_GetUnicodeFromObj(objv[2], &wsublen); + wsrc = TclGetUnicodeFromObj_(objv[0], &slen); + wstring = TclGetUnicodeFromObj_(objv[1], &wlen); + wsubspec = TclGetUnicodeFromObj_(objv[2], &wsublen); wend = wstring + wlen - (slen ? slen - 1 : 0); result = TCL_OK; @@ -622,8 +622,8 @@ Tcl_RegsubObjCmd( resultPtr = TclNewUnicodeObj(wstring, 0); Tcl_IncrRefCount(resultPtr); for (; wstring < wend; wstring++) { - Tcl_AppendUnicodeToObj(resultPtr, wsubspec, wsublen); - Tcl_AppendUnicodeToObj(resultPtr, wstring, 1); + TclAppendUnicodeToObj(resultPtr, wsubspec, wsublen); + TclAppendUnicodeToObj(resultPtr, wstring, 1); numMatches++; } wlen = 0; @@ -640,14 +640,14 @@ Tcl_RegsubObjCmd( Tcl_IncrRefCount(resultPtr); } if (p != wstring) { - Tcl_AppendUnicodeToObj(resultPtr, p, wstring - p); + TclAppendUnicodeToObj(resultPtr, p, wstring - p); p = wstring + slen; } else { p += slen; } wstring = p - 1; - Tcl_AppendUnicodeToObj(resultPtr, wsubspec, wsublen); + TclAppendUnicodeToObj(resultPtr, wsubspec, wsublen); numMatches++; } } @@ -699,14 +699,14 @@ Tcl_RegsubObjCmd( } else { objPtr = objv[1]; } - wstring = Tcl_GetUnicodeFromObj(objPtr, &wlen); + wstring = TclGetUnicodeFromObj_(objPtr, &wlen); if (objv[2] == objv[0]) { subPtr = Tcl_DuplicateObj(objv[2]); } else { subPtr = objv[2]; } if (!command) { - wsubspec = Tcl_GetUnicodeFromObj(subPtr, &wsublen); + wsubspec = TclGetUnicodeFromObj_(subPtr, &wsublen); } result = TCL_OK; @@ -750,7 +750,7 @@ Tcl_RegsubObjCmd( * specified. */ - Tcl_AppendUnicodeToObj(resultPtr, wstring, offset); + TclAppendUnicodeToObj(resultPtr, wstring, offset); } } numMatches++; @@ -763,7 +763,7 @@ Tcl_RegsubObjCmd( Tcl_RegExpGetInfo(regExpr, &info); start = info.matches[0].start; end = info.matches[0].end; - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, start); + TclAppendUnicodeToObj(resultPtr, wstring + offset, start); /* * In command-prefix mode, the substitutions are added as quoted @@ -826,7 +826,7 @@ Tcl_RegsubObjCmd( * the user code. */ - wstring = Tcl_GetUnicodeFromObj(objPtr, &wlen); + wstring = TclGetUnicodeFromObj_(objPtr, &wlen); offset += end; if (end == 0 || start == end) { @@ -838,7 +838,7 @@ Tcl_RegsubObjCmd( */ if (offset < wlen) { - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1); + TclAppendUnicodeToObj(resultPtr, wstring + offset, 1); } offset++; } @@ -867,7 +867,7 @@ Tcl_RegsubObjCmd( idx = ch - '0'; } else if ((ch == '\\') || (ch == '&')) { *wsrc = ch; - Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, + TclAppendUnicodeToObj(resultPtr, wfirstChar, wsrc - wfirstChar + 1); *wsrc = '\\'; wfirstChar = wsrc + 2; @@ -881,7 +881,7 @@ Tcl_RegsubObjCmd( } if (wfirstChar != wsrc) { - Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, + TclAppendUnicodeToObj(resultPtr, wfirstChar, wsrc - wfirstChar); } @@ -889,7 +889,7 @@ Tcl_RegsubObjCmd( subStart = info.matches[idx].start; subEnd = info.matches[idx].end; if ((subStart >= 0) && (subEnd >= 0)) { - Tcl_AppendUnicodeToObj(resultPtr, + TclAppendUnicodeToObj(resultPtr, wstring + offset + subStart, subEnd - subStart); } } @@ -901,7 +901,7 @@ Tcl_RegsubObjCmd( } if (wfirstChar != wsrc) { - Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, wsrc - wfirstChar); + TclAppendUnicodeToObj(resultPtr, wfirstChar, wsrc - wfirstChar); } if (end == 0) { @@ -911,7 +911,7 @@ Tcl_RegsubObjCmd( */ if (offset < wlen) { - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1); + TclAppendUnicodeToObj(resultPtr, wstring + offset, 1); } offset++; } else { @@ -923,7 +923,7 @@ Tcl_RegsubObjCmd( */ if (offset < wlen) { - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1); + TclAppendUnicodeToObj(resultPtr, wstring + offset, 1); } offset++; } @@ -948,7 +948,7 @@ Tcl_RegsubObjCmd( resultPtr = objv[1]; Tcl_IncrRefCount(resultPtr); } else if (offset < wlen) { - Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, wlen - offset); + TclAppendUnicodeToObj(resultPtr, wstring + offset, wlen - offset); } if (objc == 4) { if (Tcl_ObjSetVar2(interp, objv[3], NULL, resultPtr, @@ -2060,7 +2060,7 @@ StringMapCmd( } else { sourceObj = objv[objc-1]; } - ustring1 = Tcl_GetUnicodeFromObj(sourceObj, &length1); + ustring1 = TclGetUnicodeFromObj_(sourceObj, &length1); if (length1 == 0) { /* * Empty input string, just stop now. @@ -2089,7 +2089,7 @@ StringMapCmd( int mapLen, u2lc; Tcl_UniChar *mapString; - ustring2 = Tcl_GetUnicodeFromObj(mapElemv[0], &length2); + ustring2 = TclGetUnicodeFromObj_(mapElemv[0], &length2); p = ustring1; if ((length2 > length1) || (length2 == 0)) { /* @@ -2098,7 +2098,7 @@ StringMapCmd( ustring1 = end; } else { - mapString = Tcl_GetUnicodeFromObj(mapElemv[1], &mapLen); + mapString = TclGetUnicodeFromObj_(mapElemv[1], &mapLen); u2lc = (nocase ? Tcl_UniCharToLower(*ustring2) : 0); for (; ustring1 < end; ustring1++) { if (((*ustring1 == *ustring2) || @@ -2106,14 +2106,14 @@ StringMapCmd( (length2==1 || strCmpFn(ustring1, ustring2, (unsigned long) length2) == 0)) { if (p != ustring1) { - Tcl_AppendUnicodeToObj(resultPtr, p, ustring1-p); + TclAppendUnicodeToObj(resultPtr, p, ustring1-p); p = ustring1 + length2; } else { p += length2; } ustring1 = p - 1; - Tcl_AppendUnicodeToObj(resultPtr, mapString, mapLen); + TclAppendUnicodeToObj(resultPtr, mapString, mapLen); } } } @@ -2134,7 +2134,7 @@ StringMapCmd( u2lc = (int *)TclStackAlloc(interp, mapElemc * sizeof(int)); } for (index = 0; index < mapElemc; index++) { - mapStrings[index] = Tcl_GetUnicodeFromObj(mapElemv[index], + mapStrings[index] = TclGetUnicodeFromObj_(mapElemv[index], mapLens+index); if (nocase && ((index % 2) == 0)) { u2lc[index/2] = Tcl_UniCharToLower(*mapStrings[index]); @@ -2158,7 +2158,7 @@ StringMapCmd( * Put the skipped chars onto the result first. */ - Tcl_AppendUnicodeToObj(resultPtr, p, ustring1-p); + TclAppendUnicodeToObj(resultPtr, p, ustring1-p); p = ustring1 + length2; } else { p += length2; @@ -2174,7 +2174,7 @@ StringMapCmd( * Append the map value to the unicode string. */ - Tcl_AppendUnicodeToObj(resultPtr, + TclAppendUnicodeToObj(resultPtr, mapStrings[index+1], mapLens[index+1]); break; } @@ -2191,7 +2191,7 @@ StringMapCmd( * Put the rest of the unmapped chars onto result. */ - Tcl_AppendUnicodeToObj(resultPtr, p, ustring1 - p); + TclAppendUnicodeToObj(resultPtr, p, ustring1 - p); } Tcl_SetObjResult(interp, resultPtr); done: @@ -2506,7 +2506,7 @@ StringStartCmd( return TCL_ERROR; } - string = Tcl_GetUnicodeFromObj(objv[1], &length); + string = TclGetUnicodeFromObj_(objv[1], &length); if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } @@ -2576,7 +2576,7 @@ StringEndCmd( return TCL_ERROR; } - string = Tcl_GetUnicodeFromObj(objv[1], &length); + string = TclGetUnicodeFromObj_(objv[1], &length); if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 4217e9c..bf15862 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1159,7 +1159,7 @@ EXTERN Tcl_Obj * Tcl_GetRange(Tcl_Obj *objPtr, int first, int last); /* 384 */ TCL_DEPRECATED("Use Tcl_AppendStringsToObj") void Tcl_AppendUnicodeToObj(Tcl_Obj *objPtr, - const Tcl_UniChar *unicode, int length); + const unsigned short *unicode, int length); /* 385 */ EXTERN int Tcl_RegExpMatchObj(Tcl_Interp *interp, Tcl_Obj *textObj, Tcl_Obj *patternObj); @@ -1304,7 +1304,7 @@ EXTERN int Tcl_AttemptSetObjLength(Tcl_Obj *objPtr, int length); /* 433 */ EXTERN Tcl_ThreadId Tcl_GetChannelThread(Tcl_Channel channel); /* 434 */ -EXTERN Tcl_UniChar * Tcl_GetUnicodeFromObj(Tcl_Obj *objPtr, +EXTERN unsigned short * Tcl_GetUnicodeFromObj(Tcl_Obj *objPtr, int *lengthPtr); /* 435 */ TCL_DEPRECATED("") @@ -2376,7 +2376,7 @@ typedef struct TclStubs { int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ TCL_DEPRECATED_API("No longer in use, changed to macro") unsigned short * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */ Tcl_Obj * (*tcl_GetRange) (Tcl_Obj *objPtr, int first, int last); /* 383 */ - TCL_DEPRECATED_API("Use Tcl_AppendStringsToObj") void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int length); /* 384 */ + TCL_DEPRECATED_API("Use Tcl_AppendStringsToObj") void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const unsigned short *unicode, int length); /* 384 */ int (*tcl_RegExpMatchObj) (Tcl_Interp *interp, Tcl_Obj *textObj, Tcl_Obj *patternObj); /* 385 */ void (*tcl_SetNotifier) (const Tcl_NotifierProcs *notifierProcPtr); /* 386 */ Tcl_Mutex * (*tcl_GetAllocMutex) (void); /* 387 */ @@ -2426,7 +2426,7 @@ typedef struct TclStubs { char * (*tcl_AttemptDbCkrealloc) (char *ptr, unsigned int size, const char *file, int line); /* 431 */ int (*tcl_AttemptSetObjLength) (Tcl_Obj *objPtr, int length); /* 432 */ Tcl_ThreadId (*tcl_GetChannelThread) (Tcl_Channel channel); /* 433 */ - Tcl_UniChar * (*tcl_GetUnicodeFromObj) (Tcl_Obj *objPtr, int *lengthPtr); /* 434 */ + unsigned short * (*tcl_GetUnicodeFromObj) (Tcl_Obj *objPtr, int *lengthPtr); /* 434 */ TCL_DEPRECATED_API("") int (*tcl_GetMathFuncInfo) (Tcl_Interp *interp, const char *name, int *numArgsPtr, Tcl_ValueType **argTypesPtr, Tcl_MathProc **procPtr, ClientData *clientDataPtr); /* 435 */ TCL_DEPRECATED_API("") Tcl_Obj * (*tcl_ListMathFuncs) (Tcl_Interp *interp, const char *pattern); /* 436 */ Tcl_Obj * (*tcl_SubstObj) (Tcl_Interp *interp, Tcl_Obj *objPtr, int flags); /* 437 */ diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 7cc002f..7a1025d 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5495,12 +5495,12 @@ TEBCresume( objResultPtr = value3Ptr; goto doneStringMap; } - ustring1 = Tcl_GetUnicodeFromObj(valuePtr, &length); + ustring1 = TclGetUnicodeFromObj_(valuePtr, &length); if (length == 0) { objResultPtr = valuePtr; goto doneStringMap; } - ustring2 = Tcl_GetUnicodeFromObj(value2Ptr, &length2); + ustring2 = TclGetUnicodeFromObj_(value2Ptr, &length2); if (length2 > length || length2 == 0) { objResultPtr = valuePtr; goto doneStringMap; @@ -5512,7 +5512,7 @@ TEBCresume( } goto doneStringMap; } - ustring3 = Tcl_GetUnicodeFromObj(value3Ptr, &length3); + ustring3 = TclGetUnicodeFromObj_(value3Ptr, &length3); objResultPtr = TclNewUnicodeObj(ustring1, 0); p = ustring1; @@ -5524,14 +5524,14 @@ TEBCresume( memcmp(ustring1, ustring2, sizeof(Tcl_UniChar) * length2) == 0)) { if (p != ustring1) { - Tcl_AppendUnicodeToObj(objResultPtr, p, ustring1-p); + TclAppendUnicodeToObj(objResultPtr, p, ustring1-p); p = ustring1 + length2; } else { p += length2; } ustring1 = p - 1; - Tcl_AppendUnicodeToObj(objResultPtr, ustring3, length3); + TclAppendUnicodeToObj(objResultPtr, ustring3, length3); } } if (p != ustring1) { @@ -5539,7 +5539,7 @@ TEBCresume( * Put the rest of the unmapped chars onto result. */ - Tcl_AppendUnicodeToObj(objResultPtr, p, ustring1 - p); + TclAppendUnicodeToObj(objResultPtr, p, ustring1 - p); } doneStringMap: TRACE_WITH_OBJ(("%.20s %.20s %.20s => ", @@ -5565,7 +5565,7 @@ TEBCresume( valuePtr = OBJ_AT_TOS; TRACE(("%s \"%.30s\" => ", tclStringClassTable[opnd].name, O2S(valuePtr))); - ustring1 = Tcl_GetUnicodeFromObj(valuePtr, &length); + ustring1 = TclGetUnicodeFromObj_(valuePtr, &length); match = 1; if (length > 0) { int ch; @@ -5596,8 +5596,8 @@ TEBCresume( || TclHasInternalRep(value2Ptr, &tclStringType)) { Tcl_UniChar *ustring1, *ustring2; - ustring1 = Tcl_GetUnicodeFromObj(valuePtr, &length); - ustring2 = Tcl_GetUnicodeFromObj(value2Ptr, &length2); + ustring1 = TclGetUnicodeFromObj_(valuePtr, &length); + ustring2 = TclGetUnicodeFromObj_(value2Ptr, &length2); match = TclUniCharMatch(ustring1, length, ustring2, length2, nocase); } else if (TclIsPureByteArray(valuePtr) && !nocase) { diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index 8e588ac..bb8a6ad 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -482,7 +482,7 @@ Tcl_RegExpExecObj( regexpPtr->string = NULL; regexpPtr->objPtr = textObj; - udata = Tcl_GetUnicodeFromObj(textObj, &length); + udata = TclGetUnicodeFromObj_(textObj, &length); if (offset > length) { offset = length; diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 972eef7..a723586 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -80,14 +80,41 @@ static void UpdateStringOfString(Tcl_Obj *objPtr); * functions that can be invoked by generic object code. */ -const Tcl_ObjType tclStringType = { - "string", /* name */ +static const Tcl_ObjType utf32StringType = { + "utf32string", /* name */ FreeStringInternalRep, /* freeIntRepPro */ DupStringInternalRep, /* dupIntRepProc */ UpdateStringOfString, /* updateStringProc */ SetStringFromAny /* setFromAnyProc */ }; +typedef struct { + int numChars; /* The number of chars in the string. */ + int allocated; /* The amount of space actually allocated for + * the UTF-16 string (minus 1 byte for the + * termination char). */ + int maxChars; /* Max number of chars that can fit in the + * space allocated for the UTF-16 array. */ + int hasUnicode; /* Boolean determining whether the string has + * a UTF-16 representation. Always 1 */ + unsigned short unicode[TCLFLEXARRAY]; /* The array of Unicode chars. The actual size + * of this field depends on the 'maxChars' + * field above. */ +} UTF16String; + +const Tcl_ObjType tclStringType = { + "string", /* name */ + FreeStringInternalRep, /* freeIntRepPro */ +#if 0 + /* TODO JN */ + DupUTF16StringInternalRep, /* dupIntRepProc */ + UpdateUTF16StringOfString, /* updateStringProc */ + SetUTF16StringFromAny /* setFromAnyProc */ +#endif + NULL, NULL, NULL +}; + + /* * TCL STRING GROWTH ALGORITHM * @@ -656,8 +683,8 @@ Tcl_GetUnicode( *---------------------------------------------------------------------- */ -Tcl_UniChar * -Tcl_GetUnicodeFromObj( +int * +TclGetUnicodeFromObj_( Tcl_Obj *objPtr, /* The object to find the unicode string * for. */ int *lengthPtr) /* If non-NULL, the location where the string @@ -679,6 +706,22 @@ Tcl_GetUnicodeFromObj( } return stringPtr->unicode; } + +unsigned short * +Tcl_GetUnicodeFromObj( + Tcl_Obj *objPtr, /* The object to find the unicode string + * for. */ + int *lengthPtr) /* If non-NULL, the location where the string + * rep's unichar length should be stored. If + * NULL, no length is stored. */ +{ + (void)objPtr; + (void)lengthPtr; + + /* TODO JN */ + return NULL; +} + unsigned short * TclGetUnicodeFromObj( Tcl_Obj *objPtr, /* The object to find the unicode string @@ -1142,7 +1185,7 @@ SetUnicodeObj( stringCheckLimits(numChars); stringPtr = stringAlloc(numChars); SET_STRING(objPtr, stringPtr); - objPtr->typePtr = &tclStringType; + objPtr->typePtr = &utf32StringType; stringPtr->maxChars = numChars; memcpy(stringPtr->unicode, unicode, numChars * sizeof(Tcl_UniChar)); @@ -1230,7 +1273,7 @@ Tcl_AppendLimitedToObj( /* If appended string starts with a continuation byte or a lower surrogate, * force objPtr to unicode representation. See [7f1162a867] */ if (bytes && ISCONTINUATION(bytes)) { - Tcl_GetUnicodeFromObj(objPtr, NULL); + TclGetUnicodeFromObj_(objPtr, NULL); stringPtr = GET_STRING(objPtr); } if (stringPtr->hasUnicode && stringPtr->numChars > 0) { @@ -1298,9 +1341,9 @@ Tcl_AppendToObj( */ void -Tcl_AppendUnicodeToObj( +TclAppendUnicodeToObj( Tcl_Obj *objPtr, /* Points to the object to append to. */ - const Tcl_UniChar *unicode, /* The unicode string to append to the + const int *unicode, /* The unicode string to append to the * object. */ int length) /* Number of chars in "unicode". */ { @@ -1330,6 +1373,20 @@ Tcl_AppendUnicodeToObj( } } +void +Tcl_AppendUnicodeToObj( + Tcl_Obj *objPtr, /* Points to the object to append to. */ + const unsigned short *unicode, /* The unicode string to append to the + * object. */ + int length) /* Number of chars in "unicode". */ +{ + (void)objPtr; + (void)unicode; + (void)length; + + /* TODO JN */ +} + /* *---------------------------------------------------------------------- * @@ -1434,7 +1491,7 @@ Tcl_AppendObjToObj( * force objPtr to unicode representation. See [7f1162a867] * This fixes append-3.4, append-3.7 and utf-1.18 testcases. */ if (ISCONTINUATION(TclGetString(appendObjPtr))) { - Tcl_GetUnicodeFromObj(objPtr, NULL); + TclGetUnicodeFromObj_(objPtr, NULL); stringPtr = GET_STRING(objPtr); } /* @@ -1447,9 +1504,9 @@ Tcl_AppendObjToObj( * If appendObjPtr is not of the "String" type, don't convert it. */ - if (TclHasInternalRep(appendObjPtr, &tclStringType)) { + if (TclHasInternalRep(appendObjPtr, &utf32StringType)) { Tcl_UniChar *unicode = - Tcl_GetUnicodeFromObj(appendObjPtr, &numChars); + TclGetUnicodeFromObj_(appendObjPtr, &numChars); AppendUnicodeToUnicodeRep(objPtr, unicode, numChars); } else { @@ -1468,7 +1525,7 @@ Tcl_AppendObjToObj( bytes = TclGetStringFromObj(appendObjPtr, &length); numChars = stringPtr->numChars; - if ((numChars >= 0) && TclHasInternalRep(appendObjPtr, &tclStringType)) { + if ((numChars >= 0) && TclHasInternalRep(appendObjPtr, &utf32StringType)) { String *appendStringPtr = GET_STRING(appendObjPtr); appendNumChars = appendStringPtr->numChars; @@ -2877,7 +2934,7 @@ TclGetStringStorage( { String *stringPtr; - if (!TclHasInternalRep(objPtr, &tclStringType) || objPtr->bytes == NULL) { + if (!TclHasInternalRep(objPtr, &utf32StringType) || objPtr->bytes == NULL) { return TclGetStringFromObj(objPtr, (int *)sizePtr); } @@ -2925,7 +2982,7 @@ TclStringRepeat( */ if (!binary) { - if (TclHasInternalRep(objPtr, &tclStringType)) { + if (TclHasInternalRep(objPtr, &utf32StringType)) { String *stringPtr = GET_STRING(objPtr); if (stringPtr->hasUnicode) { unichar = 1; @@ -2938,7 +2995,7 @@ TclStringRepeat( Tcl_GetByteArrayFromObj(objPtr, &length); } else if (unichar) { /* Result will be pure Tcl_UniChar array. Pre-size it. */ - Tcl_GetUnicodeFromObj(objPtr, &length); + TclGetUnicodeFromObj_(objPtr, &length); } else { /* Result will be concat of string reps. Pre-size it. */ Tcl_GetStringFromObj(objPtr, &length); @@ -2978,7 +3035,7 @@ TclStringRepeat( */ if (!inPlace || Tcl_IsShared(objPtr)) { - objResultPtr = TclNewUnicodeObj(Tcl_GetUnicodeFromObj(objPtr, NULL), length); + objResultPtr = TclNewUnicodeObj(TclGetUnicodeFromObj_(objPtr, NULL), length); } else { TclInvalidateStringRep(objPtr); objResultPtr = objPtr; @@ -2999,7 +3056,7 @@ TclStringRepeat( Tcl_AppendObjToObj(objResultPtr, objResultPtr); done *= 2; } - Tcl_AppendUnicodeToObj(objResultPtr, Tcl_GetUnicodeFromObj(objResultPtr, NULL), + TclAppendUnicodeToObj(objResultPtr, TclGetUnicodeFromObj_(objResultPtr, NULL), (count - done) * length); } else { /* @@ -3096,7 +3153,7 @@ TclStringCat( binary = 0; if (ov > objv+1 && ISCONTINUATION(TclGetString(objPtr))) { forceUniChar = 1; - } else if ((objPtr->typePtr) && (objPtr->typePtr != &tclStringType)) { + } else if ((objPtr->typePtr) && (objPtr->typePtr != &utf32StringType)) { /* Prevent shimmer of non-string types. */ allowUniChar = 0; } @@ -3104,7 +3161,7 @@ TclStringCat( } else { /* assert (objPtr->typePtr != NULL) -- stork! */ binary = 0; - if (TclHasInternalRep(objPtr, &tclStringType)) { + if (TclHasInternalRep(objPtr, &utf32StringType)) { /* Have a pure Unicode value; ask to preserve it */ requestUniChar = 1; } else { @@ -3158,7 +3215,7 @@ TclStringCat( if ((objPtr->bytes == NULL) || (objPtr->length)) { int numChars; - Tcl_GetUnicodeFromObj(objPtr, &numChars); /* PANIC? */ + TclGetUnicodeFromObj_(objPtr, &numChars); /* PANIC? */ if (numChars) { last = objc - oc; if (length == 0) { @@ -3308,7 +3365,7 @@ TclStringCat( objResultPtr = *objv++; objc--; /* Ugly interface! Force resize of the unicode array. */ - Tcl_GetUnicodeFromObj(objResultPtr, &start); + TclGetUnicodeFromObj_(objResultPtr, &start); Tcl_InvalidateStringRep(objResultPtr); if (0 == Tcl_AttemptSetObjLength(objResultPtr, length)) { if (interp) { @@ -3320,7 +3377,7 @@ TclStringCat( } return NULL; } - dst = Tcl_GetUnicodeFromObj(objResultPtr, NULL) + start; + dst = TclGetUnicodeFromObj_(objResultPtr, NULL) + start; } else { Tcl_UniChar ch = 0; @@ -3337,14 +3394,14 @@ TclStringCat( } return NULL; } - dst = Tcl_GetUnicodeFromObj(objResultPtr, NULL); + dst = TclGetUnicodeFromObj_(objResultPtr, NULL); } while (objc--) { Tcl_Obj *objPtr = *objv++; if ((objPtr->bytes == NULL) || (objPtr->length)) { int more; - Tcl_UniChar *src = Tcl_GetUnicodeFromObj(objPtr, &more); + Tcl_UniChar *src = TclGetUnicodeFromObj_(objPtr, &more); memcpy(dst, src, more * sizeof(Tcl_UniChar)); dst += more; } @@ -3457,8 +3514,8 @@ TclStringCmp( s1 = (char *) Tcl_GetByteArrayFromObj(value1Ptr, &s1len); s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len); memCmpFn = memcmp; - } else if (TclHasInternalRep(value1Ptr, &tclStringType) - && TclHasInternalRep(value2Ptr, &tclStringType)) { + } else if (TclHasInternalRep(value1Ptr, &utf32StringType) + && TclHasInternalRep(value2Ptr, &utf32StringType)) { /* * Do a unicode-specific comparison if both of the args are of * String type. If the char length == byte length, we can do a @@ -3467,8 +3524,8 @@ TclStringCmp( */ if (nocase) { - s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, &s1len); - s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, &s2len); + s1 = (char *) TclGetUnicodeFromObj_(value1Ptr, &s1len); + s2 = (char *) TclGetUnicodeFromObj_(value2Ptr, &s2len); memCmpFn = (memCmpFn_t)(void *)TclUniCharNcasecmp; } else { s1len = Tcl_GetCharLength(value1Ptr); @@ -3481,8 +3538,8 @@ TclStringCmp( s2 = value2Ptr->bytes; memCmpFn = memcmp; } else { - s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, NULL); - s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, NULL); + s1 = (char *) TclGetUnicodeFromObj_(value1Ptr, NULL); + s2 = (char *) TclGetUnicodeFromObj_(value2Ptr, NULL); if ( #if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) 1 @@ -3680,8 +3737,8 @@ TclStringFirst( * do only the well-defined Tcl_UniChar array search. */ - un = Tcl_GetUnicodeFromObj(needle, &ln); - uh = Tcl_GetUnicodeFromObj(haystack, &lh); + un = TclGetUnicodeFromObj_(needle, &ln); + uh = TclGetUnicodeFromObj_(haystack, &lh); if ((lh < ln) || (start > lh - ln)) { /* Don't start the loop if there cannot be a valid answer */ goto firstEnd; @@ -3763,8 +3820,8 @@ TclStringLast( goto lastEnd; } - uh = Tcl_GetUnicodeFromObj(haystack, &lh); - un = Tcl_GetUnicodeFromObj(needle, &ln); + uh = TclGetUnicodeFromObj_(haystack, &lh); + un = TclGetUnicodeFromObj_(needle, &ln); if (last >= lh) { last = lh - 1; @@ -3856,7 +3913,7 @@ TclStringReverse( stringPtr = GET_STRING(objPtr); if (stringPtr->hasUnicode) { - Tcl_UniChar *from = Tcl_GetUnicodeFromObj(objPtr, NULL); + Tcl_UniChar *from = TclGetUnicodeFromObj_(objPtr, NULL); stringPtr = GET_STRING(objPtr); Tcl_UniChar *src = from + stringPtr->numChars; Tcl_UniChar *to; @@ -3869,7 +3926,7 @@ TclStringReverse( objPtr = TclNewUnicodeObj(&ch, 1); Tcl_SetObjLength(objPtr, stringPtr->numChars); - to = Tcl_GetUnicodeFromObj(objPtr, NULL); + to = TclGetUnicodeFromObj_(objPtr, NULL); stringPtr = GET_STRING(objPtr); while (--src >= from) { #if TCL_UTF_MAX < 4 @@ -4099,7 +4156,7 @@ TclStringReplace( /* The traditional implementation... */ { int numChars; - Tcl_UniChar *ustring = Tcl_GetUnicodeFromObj(objPtr, &numChars); + Tcl_UniChar *ustring = TclGetUnicodeFromObj_(objPtr, &numChars); /* TODO: Is there an in-place option worth pursuing here? */ @@ -4108,7 +4165,7 @@ TclStringReplace( Tcl_AppendObjToObj(result, insertPtr); } if (first + count < numChars) { - Tcl_AppendUnicodeToObj(result, ustring + first + count, + TclAppendUnicodeToObj(result, ustring + first + count, numChars - first - count); } @@ -4267,7 +4324,7 @@ DupStringInternalRep( copyStringPtr->allocated = copyPtr->bytes ? copyPtr->length : 0; SET_STRING(copyPtr, copyStringPtr); - copyPtr->typePtr = &tclStringType; + copyPtr->typePtr = &utf32StringType; } /* @@ -4292,7 +4349,7 @@ SetStringFromAny( TCL_UNUSED(Tcl_Interp *), Tcl_Obj *objPtr) /* The object to convert. */ { - if (!TclHasInternalRep(objPtr, &tclStringType)) { + if (!TclHasInternalRep(objPtr, &utf32StringType)) { String *stringPtr = stringAlloc(0); /* @@ -4312,7 +4369,7 @@ SetStringFromAny( stringPtr->maxChars = 0; stringPtr->hasUnicode = 0; SET_STRING(objPtr, stringPtr); - objPtr->typePtr = &tclStringType; + objPtr->typePtr = &utf32StringType; } return TCL_OK; } diff --git a/generic/tclTestObj.c b/generic/tclTestObj.c index 9814cfe..e99d4c1 100644 --- a/generic/tclTestObj.c +++ b/generic/tclTestObj.c @@ -1076,6 +1076,7 @@ TestobjCmd( #ifndef TCL_WIDE_INT_IS_LONG if (!strcmp(typeName, "wideInt")) typeName = "int"; #endif + if (!strcmp(typeName, "utf32string")) typeName = "string"; Tcl_SetObjResult(interp, Tcl_NewStringObj(typeName, -1)); } } else if (strcmp(subCmd, "refcount") == 0) { diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 66d1009..9cc82cb 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -2594,8 +2594,8 @@ TclStringMatchObj( if (TclHasInternalRep(strObj, &tclStringType) || (strObj->typePtr == NULL)) { Tcl_UniChar *udata, *uptn; - udata = Tcl_GetUnicodeFromObj(strObj, &length); - uptn = Tcl_GetUnicodeFromObj(ptnObj, &plen); + udata = TclGetUnicodeFromObj_(strObj, &length); + uptn = TclGetUnicodeFromObj_(ptnObj, &plen); match = TclUniCharMatch(udata, length, uptn, plen, flags); } else if (TclIsPureByteArray(strObj) && TclIsPureByteArray(ptnObj) && !flags) { diff --git a/tests/string.test b/tests/string.test index 203d0c6..9cac73d 100644 --- a/tests/string.test +++ b/tests/string.test @@ -422,7 +422,7 @@ test string-4.16.$noComp {string first, normal string vs pure unicode string} -b # Representation checks are canaries run {list [representationpoke $s] [representationpoke $m] \ [string first $m $s]} -} -result {{string 1} {string 0} 2} +} -result {{utf32string 1} {utf32string 0} 2} test string-4.17.$noComp {string first, corner case} -body { run {string first a aaa 4294967295} } -result {-1} -- cgit v0.12 From 46ed0441c3f458d86557c1813efb3b34c389a0e3 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 14 Mar 2022 16:06:23 +0000 Subject: More progress --- .github/workflows/linux-build.yml | 2 +- .github/workflows/win-build.yml | 4 ++-- generic/tcl.decls | 2 +- generic/tclDecls.h | 4 ++-- generic/tclInt.h | 21 +++++++++++++----- generic/tclStringObj.c | 34 +++++++++++++++++++++-------- generic/tclTestObj.c | 3 ++- generic/tclUtf.c | 46 ++++++++++++++++++++++----------------- tests/stringObj.test | 44 ++++++++++++++++++------------------- win/makefile.vc | 4 ++-- win/rules.vc | 6 ++--- 11 files changed, 101 insertions(+), 69 deletions(-) diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index b410aab..cb93bd4 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -7,7 +7,7 @@ jobs: matrix: cfgopt: - "" - - "CFLAGS=-DTCL_UTF_MAX=4" + - "CFLAGS=-DTCL_UTF_MAX=3" - "CFLAGS=-DTCL_NO_DEPRECATED=1" - "--disable-shared" - "--enable-symbols" diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml index a8019ee..547d27e 100644 --- a/.github/workflows/win-build.yml +++ b/.github/workflows/win-build.yml @@ -13,7 +13,7 @@ jobs: matrix: cfgopt: - "" - - "OPTS=utfmax" + - "OPTS=utf16" - "CHECKS=nodep" - "OPTS=static" - "OPTS=symbols" @@ -52,7 +52,7 @@ jobs: matrix: cfgopt: - "" - - "CFLAGS=-DTCL_UTF_MAX=4" + - "CFLAGS=-DTCL_UTF_MAX=3" - "CFLAGS=-DTCL_NO_DEPRECATED=1" - "--disable-shared" - "--enable-symbols" diff --git a/generic/tcl.decls b/generic/tcl.decls index 6dbb457..f5b2e78 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1338,7 +1338,7 @@ declare 377 { void Tcl_RegExpGetInfo(Tcl_RegExp regexp, Tcl_RegExpInfo *infoPtr) } declare 378 { - Tcl_Obj *Tcl_NewUnicodeObj(const unsigned char *unicode, int numChars) + Tcl_Obj *Tcl_NewUnicodeObj(const unsigned short *unicode, int numChars) } declare 379 { void Tcl_SetUnicodeObj(Tcl_Obj *objPtr, const unsigned short *unicode, diff --git a/generic/tclDecls.h b/generic/tclDecls.h index bf15862..1952641 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1142,7 +1142,7 @@ EXTERN int Tcl_RegExpExecObj(Tcl_Interp *interp, EXTERN void Tcl_RegExpGetInfo(Tcl_RegExp regexp, Tcl_RegExpInfo *infoPtr); /* 378 */ -EXTERN Tcl_Obj * Tcl_NewUnicodeObj(const unsigned char *unicode, +EXTERN Tcl_Obj * Tcl_NewUnicodeObj(const unsigned short *unicode, int numChars); /* 379 */ EXTERN void Tcl_SetUnicodeObj(Tcl_Obj *objPtr, @@ -2370,7 +2370,7 @@ typedef struct TclStubs { int (*tcl_UniCharIsPunct) (int ch); /* 375 */ int (*tcl_RegExpExecObj) (Tcl_Interp *interp, Tcl_RegExp regexp, Tcl_Obj *textObj, int offset, int nmatches, int flags); /* 376 */ void (*tcl_RegExpGetInfo) (Tcl_RegExp regexp, Tcl_RegExpInfo *infoPtr); /* 377 */ - Tcl_Obj * (*tcl_NewUnicodeObj) (const unsigned char *unicode, int numChars); /* 378 */ + Tcl_Obj * (*tcl_NewUnicodeObj) (const unsigned short *unicode, int numChars); /* 378 */ void (*tcl_SetUnicodeObj) (Tcl_Obj *objPtr, const unsigned short *unicode, int numChars); /* 379 */ int (*tcl_GetCharLength) (Tcl_Obj *objPtr); /* 380 */ int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ diff --git a/generic/tclInt.h b/generic/tclInt.h index ed607cd..2a04aca 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3319,12 +3319,21 @@ MODULE_SCOPE void TclErrorStackResetIf(Tcl_Interp *interp, MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); -MODULE_SCOPE int *TclGetUnicodeFromObj_(Tcl_Obj *, int *); -MODULE_SCOPE Tcl_Obj *TclNewUnicodeObj(const int *, int); -MODULE_SCOPE void TclAppendUnicodeToObj(Tcl_Obj *, const int *, int); -MODULE_SCOPE int TclUniCharNcasecmp(const int *, const int *, unsigned long); -MODULE_SCOPE int TclUniCharCaseMatch(const int *, const int *, int); -MODULE_SCOPE int TclUniCharNcmp(const int *, const int *, unsigned long); +#if TCL_UTF_MAX > 3 + MODULE_SCOPE int *TclGetUnicodeFromObj_(Tcl_Obj *, int *); + MODULE_SCOPE Tcl_Obj *TclNewUnicodeObj(const int *, int); + MODULE_SCOPE void TclAppendUnicodeToObj(Tcl_Obj *, const int *, int); + MODULE_SCOPE int TclUniCharNcasecmp(const int *, const int *, unsigned long); + MODULE_SCOPE int TclUniCharCaseMatch(const int *, const int *, int); + MODULE_SCOPE int TclUniCharNcmp(const int *, const int *, unsigned long); +#else +# define TclGetUnicodeFromObj_ Tcl_GetUnicodeFromObj +# define TclNewUnicodeObj Tcl_NewUnicodeObj +# define TclAppendUnicodeToObj Tcl_AppendUnicodeToObj +# define TclUniCharNcasecmp Tcl_UniCharNcasecmp +# define TclUniCharCaseMatch Tcl_UniCharCaseMatch +# define TclUniCharNcmp Tcl_UniCharNcmp +#endif /* diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 8730331..eb5103d 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -55,8 +55,6 @@ static void AppendUtfToUtfRep(Tcl_Obj *objPtr, const char *bytes, int numBytes); static void DupStringInternalRep(Tcl_Obj *objPtr, Tcl_Obj *copyPtr); -static void DupUTF16StringInternalRep(Tcl_Obj *objPtr, - Tcl_Obj *copyPtr); static int ExtendStringRepWithUnicode(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); static void ExtendUnicodeRepWithString(Tcl_Obj *objPtr, @@ -67,12 +65,16 @@ static void FreeStringInternalRep(Tcl_Obj *objPtr); static void GrowStringBuffer(Tcl_Obj *objPtr, int needed, int flag); static void GrowUnicodeBuffer(Tcl_Obj *objPtr, int needed); static int SetStringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); -static int SetUTF16StringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); static void SetUnicodeObj(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); static int UnicodeLength(const Tcl_UniChar *unicode); static void UpdateStringOfString(Tcl_Obj *objPtr); +#if TCL_UTF_MAX < 4 +static void DupUTF16StringInternalRep(Tcl_Obj *objPtr, + Tcl_Obj *copyPtr); +static int SetUTF16StringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); static void UpdateStringOfUTF16String(Tcl_Obj *objPtr); +#endif #define ISCONTINUATION(bytes) (\ ((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \ @@ -84,6 +86,12 @@ static void UpdateStringOfUTF16String(Tcl_Obj *objPtr); * functions that can be invoked by generic object code. */ +#if TCL_UTF_MAX > 3 + +#define utf32StringType tclStringType + +#else + static const Tcl_ObjType utf32StringType = { "utf32string", /* name */ FreeStringInternalRep, /* freeIntRepPro */ @@ -125,7 +133,7 @@ DupUTF16StringInternalRep( size_t size = offsetof(UTF16String, unicode) + (((srcStringPtr->numChars) + 1U) * sizeof(unsigned short)); UTF16String *copyStringPtr = (UTF16String *)ckalloc(size); memcpy(copyStringPtr, srcStringPtr, size); - copyStringPtr->allocated = srcStringPtr->numChars; + copyStringPtr->allocated = srcStringPtr->numChars + 1; copyStringPtr->maxChars = srcStringPtr->numChars; copyPtr->internalRep.twoPtrValue.ptr1 = copyStringPtr; @@ -156,7 +164,7 @@ SetUTF16StringFromAny( */ stringPtr->numChars = 0; - stringPtr->allocated = objPtr->length; + stringPtr->allocated = objPtr->length + 1; stringPtr->maxChars = objPtr->length; stringPtr->hasUnicode = 1; objPtr->internalRep.twoPtrValue.ptr1 = stringPtr; @@ -172,6 +180,8 @@ UpdateStringOfUTF16String( (void)objPtr; } +#endif + /* * TCL STRING GROWTH ALGORITHM * @@ -459,7 +469,7 @@ Tcl_DbNewStringObj( Tcl_Obj * TclNewUnicodeObj( - const int *unicode, /* The unicode string used to initialize the + const Tcl_UniChar *unicode, /* The unicode string used to initialize the * new object. */ int numChars) /* Number of characters in the unicode * string. */ @@ -471,9 +481,10 @@ TclNewUnicodeObj( return objPtr; } +#if TCL_UTF_MAX > 3 Tcl_Obj * Tcl_NewUnicodeObj( - const unsigned char *unicode, /* The unicode string used to initialize the + const unsigned short *unicode, /* The unicode string used to initialize the * new object. */ int numChars) /* Number of characters in the unicode * string. */ @@ -486,6 +497,7 @@ Tcl_NewUnicodeObj( /* TODO JN */ return objPtr; } +#endif /* *---------------------------------------------------------------------- @@ -740,7 +752,7 @@ Tcl_GetUnicode( *---------------------------------------------------------------------- */ -int * +Tcl_UniChar * TclGetUnicodeFromObj_( Tcl_Obj *objPtr, /* The object to find the unicode string * for. */ @@ -764,6 +776,7 @@ TclGetUnicodeFromObj_( return stringPtr->unicode; } +#if TCL_UTF_MAX > 3 unsigned short * Tcl_GetUnicodeFromObj( Tcl_Obj *objPtr, /* The object to find the unicode string @@ -778,6 +791,7 @@ Tcl_GetUnicodeFromObj( /* TODO JN */ return NULL; } +#endif unsigned short * TclGetUnicodeFromObj( @@ -1400,7 +1414,7 @@ Tcl_AppendToObj( void TclAppendUnicodeToObj( Tcl_Obj *objPtr, /* Points to the object to append to. */ - const int *unicode, /* The unicode string to append to the + const Tcl_UniChar *unicode, /* The unicode string to append to the * object. */ int length) /* Number of chars in "unicode". */ { @@ -1430,6 +1444,7 @@ TclAppendUnicodeToObj( } } +#if TCL_UTF_MAX > 3 void Tcl_AppendUnicodeToObj( Tcl_Obj *objPtr, /* Points to the object to append to. */ @@ -1443,6 +1458,7 @@ Tcl_AppendUnicodeToObj( /* TODO JN */ } +#endif /* *---------------------------------------------------------------------- diff --git a/generic/tclTestObj.c b/generic/tclTestObj.c index 9814cfe..9884a9a 100644 --- a/generic/tclTestObj.c +++ b/generic/tclTestObj.c @@ -1073,8 +1073,9 @@ TestobjCmd( Tcl_SetObjResult(interp, Tcl_NewStringObj("none", -1)); } else { typeName = objv[2]->typePtr->name; + if (!strcmp(typeName, "utf32string")) typeName = "string"; #ifndef TCL_WIDE_INT_IS_LONG - if (!strcmp(typeName, "wideInt")) typeName = "int"; + else if (!strcmp(typeName, "wideInt")) typeName = "int"; #endif Tcl_SetObjResult(interp, Tcl_NewStringObj(typeName, -1)); } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 68a0e32..02f4358 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1850,8 +1850,8 @@ Tcl_UniCharLen( int TclUniCharNcmp( - const int *ucs, /* Unicode string to compare to uct. */ - const int *uct, /* Unicode string ucs is compared to. */ + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { #if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) @@ -1875,6 +1875,7 @@ TclUniCharNcmp( #endif /* WORDS_BIGENDIAN */ } +#if TCL_UTF_MAX > 3 int Tcl_UniCharNcmp( const unsigned short *ucs, /* Unicode string to compare to uct. */ @@ -1907,6 +1908,7 @@ Tcl_UniCharNcmp( return 0; #endif /* WORDS_BIGENDIAN */ } +#endif /* *---------------------------------------------------------------------- * @@ -1926,23 +1928,17 @@ Tcl_UniCharNcmp( */ int -Tcl_UniCharNcasecmp( - const unsigned short *ucs, /* Unicode string to compare to uct. */ - const unsigned short *uct, /* Unicode string ucs is compared to. */ +TclUniCharNcasecmp( + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { for ( ; numChars != 0; numChars--, ucs++, uct++) { if (*ucs != *uct) { - unsigned short lcs = Tcl_UniCharToLower(*ucs); - unsigned short lct = Tcl_UniCharToLower(*uct); + int lcs = Tcl_UniCharToLower(*ucs); + int lct = Tcl_UniCharToLower(*uct); if (lcs != lct) { - /* special case for handling upper surrogates */ - if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) { - return 1; - } else if (((lct & 0xFC00) == 0xD800)) { - return -1; - } return (lcs - lct); } } @@ -1950,24 +1946,32 @@ Tcl_UniCharNcasecmp( return 0; } +#if TCL_UTF_MAX > 3 int -TclUniCharNcasecmp( - const int *ucs, /* Unicode string to compare to uct. */ - const int *uct, /* Unicode string ucs is compared to. */ +Tcl_UniCharNcasecmp( + const unsigned short *ucs, /* Unicode string to compare to uct. */ + const unsigned short *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { for ( ; numChars != 0; numChars--, ucs++, uct++) { if (*ucs != *uct) { - int lcs = Tcl_UniCharToLower(*ucs); - int lct = Tcl_UniCharToLower(*uct); + unsigned short lcs = Tcl_UniCharToLower(*ucs); + unsigned short lct = Tcl_UniCharToLower(*uct); if (lcs != lct) { + /* special case for handling upper surrogates */ + if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) { + return 1; + } else if (((lct & 0xFC00) == 0xD800)) { + return -1; + } return (lcs - lct); } } } return 0; } +#endif /* @@ -2333,8 +2337,8 @@ Tcl_UniCharIsWordChar( int TclUniCharCaseMatch( - const int *uniStr, /* Unicode String. */ - const int *uniPattern, + const Tcl_UniChar *uniStr, /* Unicode String. */ + const Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ @@ -2498,6 +2502,7 @@ TclUniCharCaseMatch( } } +#if TCL_UTF_MAX > 3 int Tcl_UniCharCaseMatch( const unsigned short *uniStr, /* Unicode String. */ @@ -2664,6 +2669,7 @@ Tcl_UniCharCaseMatch( uniPattern++; } } +#endif /* diff --git a/tests/stringObj.test b/tests/stringObj.test index a2bdf95..abe02b2 100644 --- a/tests/stringObj.test +++ b/tests/stringObj.test @@ -190,12 +190,12 @@ test stringObj-7.3 {SetStringFromAny called with non-string obj} testobj { set x 2345 list [incr x] [testobj objtype $x] [string index $x end] \ [testobj objtype $x] -} {2346 int 6 utf32string} +} {2346 int 6 string} test stringObj-7.4 {SetStringFromAny called with string obj} testobj { set x "abcdef" list [string length $x] [testobj objtype $x] \ [string length $x] [testobj objtype $x] -} {6 utf32string 6 utf32string} +} {6 string 6 string} test stringObj-8.1 {DupStringInternalRep procedure} testobj { testobj freeallvars @@ -213,28 +213,28 @@ test stringObj-8.2 {DupUnicodeInternalRep, mixed width chars} testobj { set y $x list [testobj objtype $x] [testobj objtype $y] [append x "\xAE\xBF\xEF"] \ [set y] [testobj objtype $x] [testobj objtype $y] -} "utf32string utf32string abc\xEF\xBF\xAEghi\xAE\xBF\xEF abc\xEF\xBF\xAEghi utf32string utf32string" +} "string string abc\xEF\xBF\xAEghi\xAE\xBF\xEF abc\xEF\xBF\xAEghi string string" test stringObj-8.3 {DupUnicodeInternalRep, mixed width chars} testobj { set x abc\xEF\xBF\xAEghi set y $x string length $x list [testobj objtype $x] [testobj objtype $y] [append x "\xAE\xBF\xEF"] \ [set y] [testobj objtype $x] [testobj objtype $y] -} "utf32string utf32string abc\xEF\xBF\xAEghi\xAE\xBF\xEF abc\xEF\xBF\xAEghi utf32string utf32string" +} "string string abc\xEF\xBF\xAEghi\xAE\xBF\xEF abc\xEF\xBF\xAEghi string string" test stringObj-8.4 {DupUnicodeInternalRep, all byte-size chars} testobj { set x abcdefghi string length $x set y $x list [testobj objtype $x] [testobj objtype $y] [append x jkl] \ [set y] [testobj objtype $x] [testobj objtype $y] -} {utf32string utf32string abcdefghijkl abcdefghi utf32string utf32string} +} {string string abcdefghijkl abcdefghi string string} test stringObj-8.5 {DupUnicodeInternalRep, all byte-size chars} testobj { set x abcdefghi set y $x string length $x list [testobj objtype $x] [testobj objtype $y] [append x jkl] \ [set y] [testobj objtype $x] [testobj objtype $y] -} {utf32string utf32string abcdefghijkl abcdefghi utf32string utf32string} +} {string string abcdefghijkl abcdefghi string string} test stringObj-9.1 {TclAppendObjToObj, mixed src & dest} {testobj testdstring} { set x abc\xEF\xBF\xAEghi @@ -244,15 +244,15 @@ test stringObj-9.1 {TclAppendObjToObj, mixed src & dest} {testobj testdstring} { string length $x list [testobj objtype $x] [testobj objtype $y] [append x $y] \ [set y] [testobj objtype $x] [testobj objtype $y] -} "utf32string none abc\xEF\xBF\xAEghi\xAE\xBF\xEF \xAE\xBF\xEF utf32string none" +} "string none abc\xEF\xBF\xAEghi\xAE\xBF\xEF \xAE\xBF\xEF string none" test stringObj-9.2 {TclAppendObjToObj, mixed src & dest} testobj { set x abc\xEF\xBF\xAEghi string length $x list [testobj objtype $x] [append x $x] [testobj objtype $x] \ [append x $x] [testobj objtype $x] -} "utf32string abc\xEF\xBF\xAEghiabc\xEF\xBF\xAEghi utf32string\ +} "string abc\xEF\xBF\xAEghiabc\xEF\xBF\xAEghi string\ abc\xEF\xBF\xAEghiabc\xEF\xBF\xAEghiabc\xEF\xBF\xAEghiabc\xEF\xBF\xAEghi\ -utf32string" +string" test stringObj-9.3 {TclAppendObjToObj, mixed src & 1-byte dest} {testobj testdstring} { set x abcdefghi testdstring free @@ -261,7 +261,7 @@ test stringObj-9.3 {TclAppendObjToObj, mixed src & 1-byte dest} {testobj testdst string length $x list [testobj objtype $x] [testobj objtype $y] [append x $y] \ [set y] [testobj objtype $x] [testobj objtype $y] -} "utf32string none abcdefghi\xAE\xBF\xEF \xAE\xBF\xEF utf32string none" +} "string none abcdefghi\xAE\xBF\xEF \xAE\xBF\xEF string none" test stringObj-9.4 {TclAppendObjToObj, 1-byte src & dest} {testobj testdstring} { set x abcdefghi testdstring free @@ -270,14 +270,14 @@ test stringObj-9.4 {TclAppendObjToObj, 1-byte src & dest} {testobj testdstring} string length $x list [testobj objtype $x] [testobj objtype $y] [append x $y] \ [set y] [testobj objtype $x] [testobj objtype $y] -} {utf32string none abcdefghijkl jkl utf32string none} +} {string none abcdefghijkl jkl string none} test stringObj-9.5 {TclAppendObjToObj, 1-byte src & dest} testobj { set x abcdefghi string length $x list [testobj objtype $x] [append x $x] [testobj objtype $x] \ [append x $x] [testobj objtype $x] -} {utf32string abcdefghiabcdefghi utf32string abcdefghiabcdefghiabcdefghiabcdefghi\ -utf32string} +} {string abcdefghiabcdefghi string abcdefghiabcdefghiabcdefghiabcdefghi\ +string} test stringObj-9.6 {TclAppendObjToObj, 1-byte src & mixed dest} {testobj testdstring} { set x abc\xEF\xBF\xAEghi testdstring free @@ -286,33 +286,33 @@ test stringObj-9.6 {TclAppendObjToObj, 1-byte src & mixed dest} {testobj testdst string length $x list [testobj objtype $x] [testobj objtype $y] [append x $y] \ [set y] [testobj objtype $x] [testobj objtype $y] -} "utf32string none abc\xEF\xBF\xAEghijkl jkl utf32string none" +} "string none abc\xEF\xBF\xAEghijkl jkl string none" test stringObj-9.7 {TclAppendObjToObj, integer src & dest} testobj { set x [expr {4 * 5}] set y [expr {4 + 5}] list [testobj objtype $x] [testobj objtype $y] [append x $y] \ [testobj objtype $x] [append x $y] [testobj objtype $x] \ [testobj objtype $y] -} {int int 209 utf32string 2099 utf32string int} +} {int int 209 string 2099 string int} test stringObj-9.8 {TclAppendObjToObj, integer src & dest} testobj { set x [expr {4 * 5}] list [testobj objtype $x] [append x $x] [testobj objtype $x] \ [append x $x] [testobj objtype $x] -} {int 2020 utf32string 20202020 utf32string} +} {int 2020 string 20202020 string} test stringObj-9.9 {TclAppendObjToObj, integer src & 1-byte dest} testobj { set x abcdefghi set y [expr {4 + 5}] string length $x list [testobj objtype $x] [testobj objtype $y] [append x $y] \ [set y] [testobj objtype $x] [testobj objtype $y] -} {utf32string int abcdefghi9 9 utf32string int} +} {string int abcdefghi9 9 string int} test stringObj-9.10 {TclAppendObjToObj, integer src & mixed dest} testobj { set x abc\xEF\xBF\xAEghi set y [expr {4 + 5}] string length $x list [testobj objtype $x] [testobj objtype $y] [append x $y] \ [set y] [testobj objtype $x] [testobj objtype $y] -} "utf32string int abc\xEF\xBF\xAEghi9 9 utf32string int" +} "string int abc\xEF\xBF\xAEghi9 9 string int" test stringObj-9.11 {TclAppendObjToObj, mixed src & 1-byte dest index check} testobj { # bug 2678, in <=8.2.0, the second obj (the one to append) in # Tcl_AppendObjToObj was not correctly checked to see if it was all one @@ -336,20 +336,20 @@ test stringObj-10.1 {Tcl_GetRange with all byte-size chars} {testobj testdstring set x [testdstring get] list [testobj objtype $x] [set y [string range $x 1 end-1]] \ [testobj objtype $x] [testobj objtype $y] -} [list none bcde utf32string utf32string] +} [list none bcde string string] test stringObj-10.2 {Tcl_GetRange with some mixed width chars} {testobj testdstring} { testdstring free testdstring append "abcïïdef" -1 set x [testdstring get] list [testobj objtype $x] [set y [string range $x 1 end-1]] \ [testobj objtype $x] [testobj objtype $y] -} [list none "bcïïde" utf32string utf32string] +} [list none "bcïïde" string string] test stringObj-10.3 {Tcl_GetRange with some mixed width chars} testobj { set x "abcïïdef" string length $x list [testobj objtype $x] [set y [string range $x 1 end-1]] \ [testobj objtype $x] [testobj objtype $y] -} [list utf32string "bcïïde" utf32string utf32string] +} [list string "bcïïde" string string] test stringObj-10.4 {Tcl_GetRange with some mixed width chars} testobj { set a "ïa¿b®cï¿d®" set result [list] @@ -368,7 +368,7 @@ test stringObj-11.1 {UpdateStringOfString} testobj { set x 2345 list [string index $x end] [testobj objtype $x] [incr x] \ [testobj objtype $x] -} {5 utf32string 2346 int} +} {5 string 2346 int} test stringObj-12.1 {Tcl_GetUniChar with byte-size chars} testobj { set x "abcdefghi" diff --git a/win/makefile.vc b/win/makefile.vc index 1ef64f2..6ff6118 100644 --- a/win/makefile.vc +++ b/win/makefile.vc @@ -52,7 +52,7 @@ # turn on the 64-bit compiler, if your SDK has it. # # Basic macros and options usable on the commandline (see rules.vc for more info): -# OPTS=msvcrt,noembed,nothreads,pdbs,profile,static,symbols,thrdalloc,time64bit,unchecked,utfmax,none +# OPTS=msvcrt,noembed,nothreads,pdbs,profile,static,symbols,thrdalloc,time64bit,unchecked,utf16,none # Sets special options for the core. The default is for none. # Any combination of the above may be used (comma separated). # 'none' will over-ride everything to nothing. @@ -80,7 +80,7 @@ # unchecked = Allows a symbols build to not use the debug # enabled runtime (msvcrt.dll not msvcrtd.dll # or libcmt.lib not libcmtd.lib). -# utfmax = Forces a build using UTF-32 representation internally. +# utf16 = Forces a build using UTF-16 representation internally. # # STATS=compdbg,memdbg,none # Sets optional memory and bytecode compiler debugging code added diff --git a/win/rules.vc b/win/rules.vc index 372d70a..713e7f9 100644 --- a/win/rules.vc +++ b/win/rules.vc @@ -884,9 +884,9 @@ USE_THREAD_ALLOC= 0 _USE_64BIT_TIME_T = 1 !endif -!if [nmakehlp -f $(OPTS) "utfmax"] -!message *** Force allowing 4-byte UTF-8 sequences internally -TCL_UTF_MAX = 4 +!if [nmakehlp -f $(OPTS) "utf16"] +!message *** Force UTF-16 internally +TCL_UTF_MAX = 3 !endif !endif -- cgit v0.12 From 4dd5be298811da7ad60f1cd93d3ff09e4baf34d0 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 15 Mar 2022 16:17:42 +0000 Subject: update rules.vc --- win/rules.vc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/win/rules.vc b/win/rules.vc index 713e7f9..6d9bbf8 100644 --- a/win/rules.vc +++ b/win/rules.vc @@ -816,8 +816,7 @@ DOTSEPARATED=$(DOTSEPARATED:b=.) # configuration (ignored for Tcl itself) # _USE_64BIT_TIME_T - forces a build using 64-bit time_t for 32-bit build # (CRT library should support this, not needed for Tcl 9.x) -# TCL_UTF_MAX=4 - forces a build allowing 4-byte UTF-8 sequences internally. -# (Not needed for Tcl 9.x) +# TCL_UTF_MAX=3 - forces a build using UTF-16 internally (not recommended). # Further, LINKERFLAGS are modified based on above. # Default values for all the above @@ -1423,13 +1422,13 @@ OPTDEFINES = $(OPTDEFINES) /DNO_STRTOI64=1 !if "$(_USE_64BIT_TIME_T)" == "1" OPTDEFINES = $(OPTDEFINES) /D_USE_64BIT_TIME_T=1 !endif -!if "$(TCL_UTF_MAX)" == "4" -OPTDEFINES = $(OPTDEFINES) /DTCL_UTF_MAX=4 -!endif # _ATL_XP_TARGETING - Newer SDK's need this to build for XP COMPILERFLAGS = /D_ATL_XP_TARGETING !endif +!if "$(TCL_UTF_MAX)" == "3" +OPTDEFINES = $(OPTDEFINES) /DTCL_UTF_MAX=3 +!endif # Like the TEA system only set this non empty for non-Tk extensions # Note: some extensions use PACKAGE_NAME and others use PACKAGE_TCLNAME -- cgit v0.12 From 459642686539ca9da0448167dbbf7fef7435f864 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 16 Mar 2022 22:21:39 +0000 Subject: Handle Tcl_GetCharLength --- generic/tclCmdMZ.c | 22 ++++++++--------- generic/tclCompCmdsSZ.c | 2 +- generic/tclExecute.c | 10 ++++---- generic/tclIO.c | 2 +- generic/tclInt.h | 2 ++ generic/tclProc.c | 2 +- generic/tclStringObj.c | 64 ++++++++++++++++++++++++++++++++++++++++++------- 7 files changed, 76 insertions(+), 28 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index db4002a..c2b4eb3 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -256,7 +256,7 @@ Tcl_RegexpObjCmd( */ objPtr = objv[1]; - stringLength = Tcl_GetCharLength(objPtr); + stringLength = TclGetCharLength(objPtr); if (startIndex) { TclGetIntForIndexM(interp, startIndex, stringLength, &offset); @@ -581,7 +581,7 @@ Tcl_RegsubObjCmd( objv += idx; if (startIndex) { - int stringLength = Tcl_GetCharLength(objv[1]); + int stringLength = TclGetCharLength(objv[1]); TclGetIntForIndexM(interp, startIndex, stringLength, &offset); Tcl_DecrRefCount(startIndex); @@ -1316,7 +1316,7 @@ StringFirstCmd( } if (objc == 4) { - int size = Tcl_GetCharLength(objv[2]); + int size = TclGetCharLength(objv[2]); if (TCL_OK != TclGetIntForIndexM(interp, objv[3], size - 1, &start)) { return TCL_ERROR; @@ -1360,7 +1360,7 @@ StringLastCmd( } if (objc == 4) { - int size = Tcl_GetCharLength(objv[2]); + int size = TclGetCharLength(objv[2]); if (TCL_OK != TclGetIntForIndexM(interp, objv[3], size - 1, &last)) { return TCL_ERROR; @@ -1406,7 +1406,7 @@ StringIndexCmd( * Get the char length to calculate what 'end' means. */ - length = Tcl_GetCharLength(objv[1]); + length = TclGetCharLength(objv[1]); if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } @@ -1474,7 +1474,7 @@ StringInsertCmd( return TCL_ERROR; } - length = Tcl_GetCharLength(objv[1]); + length = TclGetCharLength(objv[1]); if (TclGetIntForIndexM(interp, objv[2], length, &index) != TCL_OK) { return TCL_ERROR; } @@ -1669,7 +1669,7 @@ StringIsCmd( p++; } TclNewStringObj(tmpStr, string1, p-string1); - failat = Tcl_GetCharLength(tmpStr); + failat = TclGetCharLength(tmpStr); TclDecrRefCount(tmpStr); break; } @@ -1849,7 +1849,7 @@ StringIsCmd( p++; } TclNewStringObj(tmpStr, string1, p-string1); - failat = Tcl_GetCharLength(tmpStr); + failat = TclGetCharLength(tmpStr); TclDecrRefCount(tmpStr); break; } @@ -2293,7 +2293,7 @@ StringRangeCmd( * 'end' refers to the last character, not one past it. */ - length = Tcl_GetCharLength(objv[1]) - 1; + length = TclGetCharLength(objv[1]) - 1; if (TclGetIntForIndexM(interp, objv[2], length, &first) != TCL_OK || TclGetIntForIndexM(interp, objv[3], length, &last) != TCL_OK) { @@ -2394,7 +2394,7 @@ StringRplcCmd( return TCL_ERROR; } - length = Tcl_GetCharLength(objv[1]); + length = TclGetCharLength(objv[1]); end = length - 1; if (TclGetIntForIndexM(interp, objv[2], end, &first) != TCL_OK || @@ -2880,7 +2880,7 @@ StringLenCmd( return TCL_ERROR; } - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(Tcl_GetCharLength(objv[1]))); + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(TclGetCharLength(objv[1]))); return TCL_OK; } diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c index aa2d13e..62909eb 100644 --- a/generic/tclCompCmdsSZ.c +++ b/generic/tclCompCmdsSZ.c @@ -890,7 +890,7 @@ TclCompileStringLenCmd( */ char buf[TCL_INTEGER_SPACE]; - int len = Tcl_GetCharLength(objPtr); + int len = TclGetCharLength(objPtr); len = sprintf(buf, "%d", len); PushLiteral(envPtr, buf, len); diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 67df5f4..f09f75c 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5244,7 +5244,7 @@ TEBCresume( case INST_STR_LEN: valuePtr = OBJ_AT_TOS; - length = Tcl_GetCharLength(valuePtr); + length = TclGetCharLength(valuePtr); TclNewIntObj(objResultPtr, length); TRACE(("\"%.20s\" => %d\n", O2S(valuePtr), length)); NEXT_INST_F(1, 1, 1); @@ -5310,7 +5310,7 @@ TEBCresume( * Get char length to calulate what 'end' means. */ - length = Tcl_GetCharLength(valuePtr); + length = TclGetCharLength(valuePtr); DECACHE_STACK_INFO(); if (TclGetIntForIndexM(interp, value2Ptr, length-1, &index)!=TCL_OK) { CACHE_STACK_INFO(); @@ -5353,7 +5353,7 @@ TEBCresume( case INST_STR_RANGE: TRACE(("\"%.20s\" %.20s %.20s =>", O2S(OBJ_AT_DEPTH(2)), O2S(OBJ_UNDER_TOS), O2S(OBJ_AT_TOS))); - length = Tcl_GetCharLength(OBJ_AT_DEPTH(2)) - 1; + length = TclGetCharLength(OBJ_AT_DEPTH(2)) - 1; DECACHE_STACK_INFO(); if (TclGetIntForIndexM(interp, OBJ_UNDER_TOS, length, @@ -5382,7 +5382,7 @@ TEBCresume( valuePtr = OBJ_AT_TOS; fromIdx = TclGetInt4AtPtr(pc+1); toIdx = TclGetInt4AtPtr(pc+5); - length = Tcl_GetCharLength(valuePtr); + length = TclGetCharLength(valuePtr); TRACE(("\"%.20s\" %d %d => ", O2S(valuePtr), fromIdx, toIdx)); /* Every range of an empty value is an empty value */ @@ -5428,7 +5428,7 @@ TEBCresume( case INST_STR_REPLACE: value3Ptr = POP_OBJECT(); valuePtr = OBJ_AT_DEPTH(2); - endIdx = Tcl_GetCharLength(valuePtr) - 1; + endIdx = TclGetCharLength(valuePtr) - 1; TRACE(("\"%.20s\" %s %s \"%.20s\" => ", O2S(valuePtr), O2S(OBJ_UNDER_TOS), O2S(OBJ_AT_TOS), O2S(value3Ptr))); DECACHE_STACK_INFO(); diff --git a/generic/tclIO.c b/generic/tclIO.c index 9d88948..f41e481 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -3556,7 +3556,7 @@ Tcl_Close( result = flushcode; } if ((result != 0) && (result != TCL_ERROR) && (interp != NULL) - && 0 == Tcl_GetCharLength(Tcl_GetObjResult(interp))) { + && 0 == TclGetCharLength(Tcl_GetObjResult(interp))) { Tcl_SetErrno(result); Tcl_SetObjResult(interp, Tcl_NewStringObj(Tcl_PosixError(interp), -1)); diff --git a/generic/tclInt.h b/generic/tclInt.h index 75f8858..2d828e8 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3322,6 +3322,7 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); #if TCL_UTF_MAX > 3 MODULE_SCOPE int *TclGetUnicodeFromObj_(Tcl_Obj *, int *); MODULE_SCOPE Tcl_Obj *TclNewUnicodeObj(const int *, int); + MODULE_SCOPE int TclGetCharLength(Tcl_Obj *); MODULE_SCOPE void TclAppendUnicodeToObj(Tcl_Obj *, const int *, int); MODULE_SCOPE int TclUniCharNcasecmp(const int *, const int *, unsigned long); MODULE_SCOPE int TclUniCharCaseMatch(const int *, const int *, int); @@ -3329,6 +3330,7 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); #else # define TclGetUnicodeFromObj_ Tcl_GetUnicodeFromObj # define TclNewUnicodeObj Tcl_NewUnicodeObj +# define TclGetCharLength Tcl_GetCharLength # define TclAppendUnicodeToObj Tcl_AppendUnicodeToObj # define TclUniCharNcasecmp Tcl_UniCharNcasecmp # define TclUniCharCaseMatch Tcl_UniCharCaseMatch diff --git a/generic/tclProc.c b/generic/tclProc.c index 45d1afd..75687f0 100644 --- a/generic/tclProc.c +++ b/generic/tclProc.c @@ -529,7 +529,7 @@ TclCreateProc( "FORMALARGUMENTFORMAT", NULL); goto procError; } - if ((fieldCount == 0) || (Tcl_GetCharLength(fieldValues[0]) == 0)) { + if ((fieldCount == 0) || (TclGetCharLength(fieldValues[0]) == 0)) { Tcl_SetObjResult(interp, Tcl_NewStringObj( "argument with no name", -1)); Tcl_SetErrorCode(interp, "TCL", "OPERATION", "PROC", diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 1c24716..2dc79eb 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -590,7 +590,7 @@ Tcl_NewUnicodeObj( */ int -Tcl_GetCharLength( +TclGetCharLength( Tcl_Obj *objPtr) /* The String object to get the num chars * of. */ { @@ -643,6 +643,52 @@ Tcl_GetCharLength( return numChars; } +#if TCL_UTF_MAX > 3 +int +Tcl_GetCharLength( + Tcl_Obj *objPtr) /* The String object to get the num chars + * of. */ +{ + String *stringPtr; + int numChars; + + /* + * Quick, no-shimmer return for short string reps. + */ + + if ((objPtr->bytes) && (objPtr->length < 2)) { + /* 0 bytes -> 0 chars; 1 byte -> 1 char */ + return objPtr->length; + } + + /* + * Optimize the case where we're really dealing with a bytearray object; + * we don't need to convert to a string to perform the get-length operation. + * + * Starting in Tcl 8.7, we check for a "pure" bytearray, because the + * machinery behind that test is using a proper bytearray ObjType. We + * could also compute length of an improper bytearray without shimmering + * but there's no value in that. We *want* to shimmer an improper bytearray + * because improper bytearrays have worthless internal reps. + */ + + if (TclIsPureByteArray(objPtr)) { + int length; + + (void) Tcl_GetByteArrayFromObj(objPtr, &length); + return length; + } + + /* + * OK, need to work with the object as a string. + */ + + SetUTF16StringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + return stringPtr->numChars; +} +#endif + /* *---------------------------------------------------------------------- * @@ -2336,7 +2382,7 @@ Tcl_AppendFormatToObj( goto errorMsg; case 's': if (gotPrecision) { - numChars = Tcl_GetCharLength(segment); + numChars = TclGetCharLength(segment); if (precision < numChars) { if (precision < 1) { TclNewObj(segment); @@ -2521,7 +2567,7 @@ Tcl_AppendFormatToObj( gotZero = 0; } if (gotZero) { - length += Tcl_GetCharLength(segment); + length += TclGetCharLength(segment); if (length < width) { segmentLimit -= width - length; } @@ -2652,7 +2698,7 @@ Tcl_AppendFormatToObj( gotZero = 0; } if (gotZero) { - length += Tcl_GetCharLength(segment); + length += TclGetCharLength(segment); if (length < width) { segmentLimit -= width - length; } @@ -2763,7 +2809,7 @@ Tcl_AppendFormatToObj( } if (width>0 && numChars<0) { - numChars = Tcl_GetCharLength(segment); + numChars = TclGetCharLength(segment); } if (!gotMinus && width>0) { if (numChars < width) { @@ -3720,8 +3766,8 @@ TclStringCmp( s2 = (char *) TclGetUnicodeFromObj_(value2Ptr, &s2len); memCmpFn = (memCmpFn_t)(void *)TclUniCharNcasecmp; } else { - s1len = Tcl_GetCharLength(value1Ptr); - s2len = Tcl_GetCharLength(value2Ptr); + s1len = TclGetCharLength(value1Ptr); + s2len = TclGetCharLength(value2Ptr); if ((s1len == value1Ptr->length) && (value1Ptr->bytes != NULL) && (s2len == value2Ptr->length) @@ -3866,7 +3912,7 @@ TclStringFirst( Tcl_Obj *haystack, int start) { - int lh, ln = Tcl_GetCharLength(needle); + int lh, ln = TclGetCharLength(needle); Tcl_Obj *result; int value = -1; Tcl_UniChar *checkStr, *endStr, *uh, *un; @@ -3973,7 +4019,7 @@ TclStringLast( Tcl_Obj *haystack, int last) { - int lh, ln = Tcl_GetCharLength(needle); + int lh, ln = TclGetCharLength(needle); Tcl_Obj *result; int value = -1; Tcl_UniChar *checkStr, *uh, *un; -- cgit v0.12 From 17098d70c767e79c2deb65981af371cb209cd159 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 16 Mar 2022 23:08:28 +0000 Subject: Handle Tcl_UtfAtIndex --- generic/tclBinary.c | 4 ++-- generic/tclCmdMZ.c | 12 ++++++------ generic/tclEncoding.c | 2 +- generic/tclIO.c | 2 +- generic/tclInt.h | 6 ++++-- generic/tclRegexp.c | 4 ++-- generic/tclUtf.c | 38 +++++++++++++++++++++++++++++++++----- 7 files changed, 49 insertions(+), 19 deletions(-) diff --git a/generic/tclBinary.c b/generic/tclBinary.c index 4717b05..bc17232 100644 --- a/generic/tclBinary.c +++ b/generic/tclBinary.c @@ -434,7 +434,7 @@ TclGetBytesFromObj( irPtr = TclFetchInternalRep(objPtr, &tclByteArrayType); baPtr = GET_BYTEARRAY(irPtr); - nonbyte = Tcl_UtfAtIndex(Tcl_GetString(objPtr), baPtr->bad); + nonbyte = TclUtfAtIndex(Tcl_GetString(objPtr), baPtr->bad); TclUtfToUCS4(nonbyte, &ucs4); Tcl_SetObjResult(interp, Tcl_ObjPrintf( @@ -473,7 +473,7 @@ Tcl_GetBytesFromObj( irPtr = TclFetchInternalRep(objPtr, &tclByteArrayType); baPtr = GET_BYTEARRAY(irPtr); - nonbyte = Tcl_UtfAtIndex(Tcl_GetString(objPtr), baPtr->bad); + nonbyte = TclUtfAtIndex(Tcl_GetString(objPtr), baPtr->bad); TclUtfToUCS4(nonbyte, &ucs4); Tcl_SetObjResult(interp, Tcl_ObjPrintf( diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index c2b4eb3..bd5745b 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2954,8 +2954,8 @@ StringLowerCmd( } string1 = TclGetStringFromObj(objv[1], &length1); - start = Tcl_UtfAtIndex(string1, first); - end = Tcl_UtfAtIndex(start, last - first + 1); + start = TclUtfAtIndex(string1, first); + end = TclUtfAtIndex(start, last - first + 1); resultPtr = Tcl_NewStringObj(string1, end - string1); string2 = TclGetString(resultPtr) + (start - string1); @@ -3039,8 +3039,8 @@ StringUpperCmd( } string1 = TclGetStringFromObj(objv[1], &length1); - start = Tcl_UtfAtIndex(string1, first); - end = Tcl_UtfAtIndex(start, last - first + 1); + start = TclUtfAtIndex(string1, first); + end = TclUtfAtIndex(start, last - first + 1); resultPtr = Tcl_NewStringObj(string1, end - string1); string2 = TclGetString(resultPtr) + (start - string1); @@ -3124,8 +3124,8 @@ StringTitleCmd( } string1 = TclGetStringFromObj(objv[1], &length1); - start = Tcl_UtfAtIndex(string1, first); - end = Tcl_UtfAtIndex(start, last - first + 1); + start = TclUtfAtIndex(string1, first); + end = TclUtfAtIndex(start, last - first + 1); resultPtr = Tcl_NewStringObj(string1, end - string1); string2 = TclGetString(resultPtr) + (start - string1); diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 4630a02..6890d3a 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -1294,7 +1294,7 @@ Tcl_ExternalToUtf( if (*dstCharsPtr <= maxChars) { break; } - dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); + dstLen = TclUtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); *statePtr = savedState; } while (1); if (!noTerminate) { diff --git a/generic/tclIO.c b/generic/tclIO.c index f41e481..59aa50f 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -6379,7 +6379,7 @@ ReadChars( * bytes demanded by the Tcl_ExternalToUtf() call! */ - dstLimit = Tcl_UtfAtIndex(dst, charsToRead) - dst + (TCL_UTF_MAX - 1); + dstLimit = TclUtfAtIndex(dst, charsToRead) - dst + (TCL_UTF_MAX - 1); statePtr->flags = savedFlags; statePtr->inputEncodingFlags = savedIEFlags; statePtr->inputEncodingState = savedState; diff --git a/generic/tclInt.h b/generic/tclInt.h index 2d828e8..0705c1d 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3327,6 +3327,7 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); MODULE_SCOPE int TclUniCharNcasecmp(const int *, const int *, unsigned long); MODULE_SCOPE int TclUniCharCaseMatch(const int *, const int *, int); MODULE_SCOPE int TclUniCharNcmp(const int *, const int *, unsigned long); + MODULE_SCOPE const char *TclUtfAtIndex(const char *, int); #else # define TclGetUnicodeFromObj_ Tcl_GetUnicodeFromObj # define TclNewUnicodeObj Tcl_NewUnicodeObj @@ -3335,6 +3336,7 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); # define TclUniCharNcasecmp Tcl_UniCharNcasecmp # define TclUniCharCaseMatch Tcl_UniCharCaseMatch # define TclUniCharNcmp Tcl_UniCharNcmp +# define TclUtfAtIndex Tcl_UtfAtIndex #endif @@ -4741,8 +4743,8 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; : Tcl_UtfToUniChar(str, chPtr)) #else #define TclUtfToUniChar(str, chPtr) \ - ((((unsigned char) *(str)) < 0x80) ? \ - ((*(chPtr) = (unsigned char) *(str)), 1) \ + (((UCHAR(*(str))) < 0x80) ? \ + ((*(chPtr) = UCHAR(*(str))), 1) \ : Tcl_UtfToChar16(str, chPtr)) #endif diff --git a/generic/tclRegexp.c b/generic/tclRegexp.c index bb8a6ad..ff7c72c 100644 --- a/generic/tclRegexp.c +++ b/generic/tclRegexp.c @@ -271,8 +271,8 @@ Tcl_RegExpRange( } else { string = regexpPtr->string; } - *startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so); - *endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo); + *startPtr = TclUtfAtIndex(string, regexpPtr->matches[index].rm_so); + *endPtr = TclUtfAtIndex(string, regexpPtr->matches[index].rm_eo); } } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 02f4358..c47ee97 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1166,27 +1166,55 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ +#if TCL_UTF_MAX < 4 +# undef Tcl_UtfToUniChar +# define Tcl_UtfToUniChar Tcl_UtfToChar16 +#endif + const char * -Tcl_UtfAtIndex( +TclUtfAtIndex( const char *src, /* The UTF-8 string. */ int index) /* The position of the desired character. */ { - Tcl_UniChar ch = 0; + Tcl_UniChar ch = 0; int len = 0; while (index-- > 0) { - len = TclUtfToUniChar(src, &ch); + len = (Tcl_UtfToUniChar)(src, &ch); src += len; } #if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (len < 3)) { /* Index points at character following high Surrogate */ - src += TclUtfToUniChar(src, &ch); + src += (Tcl_UtfToUniChar)(src, &ch); } #endif return src; } +#if TCL_UTF_MAX > 3 +const char * +Tcl_UtfAtIndex( + const char *src, /* The UTF-8 string. */ + int index) /* The position of the desired character. */ +{ + unsigned short ch = 0; + int len = 0; + + while (index-- > 0) { + len = Tcl_UtfToChar16(src, &ch); + src += len; + } + if ((ch >= 0xD800) && (len < 3)) { + /* Index points at character following high Surrogate */ + src += Tcl_UtfToChar16(src, &ch); + } + return src; +} + + +#endif + /* *--------------------------------------------------------------------------- * @@ -2896,7 +2924,7 @@ TclUtfToUCS4( int *ucs4Ptr) /* Filled with the UCS4 codepoint represented * by the UTF-8 string. */ { - /* Make use of the #undef Tcl_UtfToUniChar above, which already handles UCS4. */ +# undef Tcl_UtfToUniChar return Tcl_UtfToUniChar(src, ucs4Ptr); } -- cgit v0.12 From 7d893da1b984ded235163f3ec8018195d9058f2a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 22 Mar 2022 11:25:13 +0000 Subject: More progress --- generic/tclInt.h | 4 ++++ generic/tclStringObj.c | 17 +++++------------ generic/tclStubInit.c | 2 ++ generic/tclUtf.c | 1 + tests/string.test | 2 +- tests/utf.test | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/generic/tclInt.h b/generic/tclInt.h index 0705c1d..538bca3 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3328,6 +3328,10 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); MODULE_SCOPE int TclUniCharCaseMatch(const int *, const int *, int); MODULE_SCOPE int TclUniCharNcmp(const int *, const int *, unsigned long); MODULE_SCOPE const char *TclUtfAtIndex(const char *, int); +# undef Tcl_GetCharLength +# define Tcl_GetCharLength TclGetCharLength +# undef Tcl_UtfAtIndex +# define Tcl_UtfAtIndex TclUtfAtIndex #else # define TclGetUnicodeFromObj_ Tcl_GetUnicodeFromObj # define TclNewUnicodeObj Tcl_NewUnicodeObj diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 2dc79eb..6417e1b 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -644,12 +644,12 @@ TclGetCharLength( } #if TCL_UTF_MAX > 3 +#undef Tcl_GetCharLength int Tcl_GetCharLength( Tcl_Obj *objPtr) /* The String object to get the num chars * of. */ { - String *stringPtr; int numChars; /* @@ -673,19 +673,12 @@ Tcl_GetCharLength( */ if (TclIsPureByteArray(objPtr)) { - int length; - (void) Tcl_GetByteArrayFromObj(objPtr, &length); - return length; + (void) Tcl_GetByteArrayFromObj(objPtr, &numChars); + } else { + numChars = Tcl_NumUtfChars(Tcl_GetString(objPtr), -1); } - - /* - * OK, need to work with the object as a string. - */ - - SetUTF16StringFromAny(NULL, objPtr); - stringPtr = GET_STRING(objPtr); - return stringPtr->numChars; + return numChars; } #endif diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index f9430cb..e3ebb8b 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -76,6 +76,8 @@ #undef Tcl_MacOSXOpenBundleResources #undef TclWinConvertWSAError #undef TclWinConvertError +#undef Tcl_GetCharLength +#undef Tcl_UtfAtIndex #if defined(_WIN32) || defined(__CYGWIN__) #define TclWinConvertWSAError (void (*)(DWORD))(void *)Tcl_WinConvertError #define TclWinConvertError (void (*)(DWORD))(void *)Tcl_WinConvertError diff --git a/generic/tclUtf.c b/generic/tclUtf.c index c47ee97..4dd1e09 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1193,6 +1193,7 @@ TclUtfAtIndex( } #if TCL_UTF_MAX > 3 +#undef Tcl_UtfAtIndex const char * Tcl_UtfAtIndex( const char *src, /* The UTF-8 string. */ diff --git a/tests/string.test b/tests/string.test index 9cac73d..203d0c6 100644 --- a/tests/string.test +++ b/tests/string.test @@ -422,7 +422,7 @@ test string-4.16.$noComp {string first, normal string vs pure unicode string} -b # Representation checks are canaries run {list [representationpoke $s] [representationpoke $m] \ [string first $m $s]} -} -result {{utf32string 1} {utf32string 0} 2} +} -result {{string 1} {string 0} 2} test string-4.17.$noComp {string first, corner case} -body { run {string first a aaa 4294967295} } -result {-1} diff --git a/tests/utf.test b/tests/utf.test index 477216c..c79492e 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -1230,7 +1230,7 @@ test utf-19.1 {TclUniCharLen} -body { test utf-20.1 {TclUniCharNcmp} ucs4 { string compare [string range [format %c 0xFFFF] 0 0] [string range [format %c 0x10000] 0 0] } -1 -test utf-20.2 {[4c591fa487] TclUniCharNcmp/TclUtfNcmp} { +test utf-20.2 {[4c591fa487] TclUniCharNcmp/TclUtfNcmp} ucs4 { set one [format %c 0xFFFF] set two [format %c 0x10000] set first [string compare $one $two] -- cgit v0.12 From b5e6f95ea90be59cb281c089806f0e446e1272bc Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 22 Mar 2022 15:38:25 +0000 Subject: Feature-complete --- generic/tcl.decls | 9 ++++++++ generic/tclDecls.h | 15 +++++++++++++ generic/tclInt.h | 12 +++++++---- generic/tclStringObj.c | 8 +++---- generic/tclStubInit.c | 3 +++ generic/tclUtf.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++- tests/string.test | 2 +- 7 files changed, 96 insertions(+), 10 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index f5b2e78..fa844e0 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2446,6 +2446,15 @@ declare 660 { declare 668 { int Tcl_UniCharLen(const int *uniStr) } +declare 669 { + int TclNumUtfChars(const char *src, int length) +} +declare 670 { + int TclGetCharLength(Tcl_Obj *objPtr) +} +declare 671 { + const char *TclUtfAtIndex(const char *src, int index) +} # ----- BASELINE -- FOR -- 8.7.0 ----- # diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 1952641..9f5e798 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1957,6 +1957,12 @@ EXTERN int Tcl_AsyncMarkFromSignal(Tcl_AsyncHandler async, /* Slot 667 is reserved */ /* 668 */ EXTERN int Tcl_UniCharLen(const int *uniStr); +/* 669 */ +EXTERN int TclNumUtfChars(const char *src, int length); +/* 670 */ +EXTERN int TclGetCharLength(Tcl_Obj *objPtr); +/* 671 */ +EXTERN const char * TclUtfAtIndex(const char *src, int index); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2661,6 +2667,9 @@ typedef struct TclStubs { void (*reserved666)(void); void (*reserved667)(void); int (*tcl_UniCharLen) (const int *uniStr); /* 668 */ + int (*tclNumUtfChars) (const char *src, int length); /* 669 */ + int (*tclGetCharLength) (Tcl_Obj *objPtr); /* 670 */ + const char * (*tclUtfAtIndex) (const char *src, int index); /* 671 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -4020,6 +4029,12 @@ extern const TclStubs *tclStubsPtr; /* Slot 667 is reserved */ #define Tcl_UniCharLen \ (tclStubsPtr->tcl_UniCharLen) /* 668 */ +#define TclNumUtfChars \ + (tclStubsPtr->tclNumUtfChars) /* 669 */ +#define TclGetCharLength \ + (tclStubsPtr->tclGetCharLength) /* 670 */ +#define TclUtfAtIndex \ + (tclStubsPtr->tclUtfAtIndex) /* 671 */ #endif /* defined(USE_TCL_STUBS) */ diff --git a/generic/tclInt.h b/generic/tclInt.h index 538bca3..73d6386 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3322,12 +3322,12 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); #if TCL_UTF_MAX > 3 MODULE_SCOPE int *TclGetUnicodeFromObj_(Tcl_Obj *, int *); MODULE_SCOPE Tcl_Obj *TclNewUnicodeObj(const int *, int); - MODULE_SCOPE int TclGetCharLength(Tcl_Obj *); MODULE_SCOPE void TclAppendUnicodeToObj(Tcl_Obj *, const int *, int); MODULE_SCOPE int TclUniCharNcasecmp(const int *, const int *, unsigned long); MODULE_SCOPE int TclUniCharCaseMatch(const int *, const int *, int); MODULE_SCOPE int TclUniCharNcmp(const int *, const int *, unsigned long); - MODULE_SCOPE const char *TclUtfAtIndex(const char *, int); +# undef Tcl_NumUtfChars +# define Tcl_NumUtfChars TclNumUtfChars # undef Tcl_GetCharLength # define Tcl_GetCharLength TclGetCharLength # undef Tcl_UtfAtIndex @@ -3335,11 +3335,15 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); #else # define TclGetUnicodeFromObj_ Tcl_GetUnicodeFromObj # define TclNewUnicodeObj Tcl_NewUnicodeObj -# define TclGetCharLength Tcl_GetCharLength # define TclAppendUnicodeToObj Tcl_AppendUnicodeToObj # define TclUniCharNcasecmp Tcl_UniCharNcasecmp # define TclUniCharCaseMatch Tcl_UniCharCaseMatch # define TclUniCharNcmp Tcl_UniCharNcmp +# undef TclNumUtfChars +# define TclNumUtfChars Tcl_NumUtfChars +# undef TclGetCharLength +# define TclGetCharLength Tcl_GetCharLength +# undef TclUtfAtIndex # define TclUtfAtIndex Tcl_UtfAtIndex #endif @@ -4764,7 +4768,7 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; *---------------------------------------------------------------- */ -#define TclNumUtfChars(numChars, bytes, numBytes) \ +#define TclNumUtfChars_UNUSED(numChars, bytes, numBytes) \ do { \ int _count, _i = (numBytes); \ unsigned char *_str = (unsigned char *) (bytes); \ diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 6417e1b..2b11877 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -637,7 +637,7 @@ TclGetCharLength( */ if (numChars == -1) { - TclNumUtfChars(numChars, objPtr->bytes, objPtr->length); + numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); stringPtr->numChars = numChars; } return numChars; @@ -782,7 +782,7 @@ Tcl_GetUniChar( */ if (stringPtr->numChars == -1) { - TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + stringPtr->numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { return (unsigned char) objPtr->bytes[index]; @@ -991,7 +991,7 @@ Tcl_GetRange( */ if (stringPtr->numChars == -1) { - TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + stringPtr->numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { if (last < 0 || last >= stringPtr->numChars) { @@ -4447,7 +4447,7 @@ ExtendUnicodeRepWithString( numOrigChars = stringPtr->numChars; } if (numAppendChars == -1) { - TclNumUtfChars(numAppendChars, bytes, numBytes); + numAppendChars = Tcl_NumUtfChars(bytes, numBytes); } needed = numOrigChars + numAppendChars; uniCharStringCheckLimits(needed); diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index e3ebb8b..09163c3 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -1937,6 +1937,9 @@ const TclStubs tclStubs = { 0, /* 666 */ 0, /* 667 */ Tcl_UniCharLen, /* 668 */ + TclNumUtfChars, /* 669 */ + TclGetCharLength, /* 670 */ + TclUtfAtIndex, /* 671 */ }; /* !END!: Do not edit above this line. */ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 4dd1e09..eda317f 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -799,7 +799,7 @@ Tcl_UtfCharComplete( */ int -Tcl_NumUtfChars( +TclNumUtfChars( const char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ @@ -850,6 +850,61 @@ Tcl_NumUtfChars( return i; } +#if TCL_UTF_MAX > 3 +#undef Tcl_NumUtfChars +int +Tcl_NumUtfChars( + const char *src, /* The UTF-8 string to measure. */ + int length) /* The length of the string in bytes, or -1 + * for strlen(string). */ +{ + unsigned short ch = 0; + int i = 0; + + if (length < 0) { + /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ + while ((*src != '\0') && (i < INT_MAX)) { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + } else { + /* Will return value between 0 and length. No overflow checks. */ + + /* Pointer to the end of string. Never read endPtr[0] */ + const char *endPtr = src + length; + /* Pointer to last byte where optimization still can be used */ + const char *optPtr = endPtr - 4; + + /* + * Optimize away the call in this loop. Justified because... + * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr) + * By initialization above (endPtr - optPtr) = TCL_UTF_MAX + * So (endPtr - src) >= TCL_UTF_MAX, and passing that to + * Tcl_UtfCharComplete we know will cause return of 1. + */ + while (src <= optPtr + /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + /* Loop over the remaining string where call must happen */ + while (src < endPtr) { + if (Tcl_UtfCharComplete(src, endPtr - src)) { + src += Tcl_UtfToChar16(src, &ch); + } else { + /* + * src points to incomplete UTF-8 sequence + * Treat first byte as character and count it + */ + src++; + } + i++; + } + } + return i; +} +#endif + /* *--------------------------------------------------------------------------- * diff --git a/tests/string.test b/tests/string.test index 203d0c6..6863c23 100644 --- a/tests/string.test +++ b/tests/string.test @@ -422,7 +422,7 @@ test string-4.16.$noComp {string first, normal string vs pure unicode string} -b # Representation checks are canaries run {list [representationpoke $s] [representationpoke $m] \ [string first $m $s]} -} -result {{string 1} {string 0} 2} +} -match glob -result {{*string 1} {*string 0} 2} test string-4.17.$noComp {string first, corner case} -body { run {string first a aaa 4294967295} } -result {-1} -- cgit v0.12 From d4c27c94668a30f48edee251104255b27230107e Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 22 Mar 2022 16:12:20 +0000 Subject: ucs4 -> utf32 --- tests/stringObj.test | 17 +++++++++-------- tests/utf.test | 46 +++++++++++++++++++++++----------------------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/tests/stringObj.test b/tests/stringObj.test index bae61ab..c11bf7f 100644 --- a/tests/stringObj.test +++ b/tests/stringObj.test @@ -25,7 +25,8 @@ testConstraint testobj [llength [info commands testobj]] testConstraint testbytestring [llength [info commands testbytestring]] testConstraint testdstring [llength [info commands testdstring]] testConstraint tip389 [expr {[string length \U010000] == 2}] - +testConstraint utf32 [expr {[string length [format %c 0x10000]] == 1}] + test stringObj-1.1 {string type registration} testobj { set t [testobj types] set first [string first "string" $t] @@ -57,7 +58,7 @@ test stringObj-3.2 {Tcl_SetStringObj, existing non-"empty string" object} testob lappend result [testobj refcount 1] } {{} 512 foo string 2} -test stringObj-4.1 {Tcl_SetObjLength procedure, string gets shorter} {testobj fullutf} { +test stringObj-4.1 {Tcl_SetObjLength procedure, string gets shorter} {testobj utf32} { testobj freeallvars teststringobj set 1 test teststringobj setlength 1 3 @@ -70,7 +71,7 @@ test stringObj-4.2 {Tcl_SetObjLength procedure, string gets longer} testobj { teststringobj setlength 1 10 list [teststringobj length 1] [teststringobj length2 1] } {10 10} -test stringObj-4.3 {Tcl_SetObjLength procedure, string gets longer} {testobj fullutf} { +test stringObj-4.3 {Tcl_SetObjLength procedure, string gets longer} {testobj utf32} { testobj freeallvars teststringobj set 1 abcdef teststringobj append 1 xyzq -1 @@ -97,7 +98,7 @@ test stringObj-5.2 {Tcl_AppendToObj procedure, length calculation} testobj { teststringobj append 1 123 -1 teststringobj get 1 } {x y bbCC123} -test stringObj-5.3 {Tcl_AppendToObj procedure, reallocating space} {testobj fullutf} { +test stringObj-5.3 {Tcl_AppendToObj procedure, reallocating space} {testobj utf32} { testobj freeallvars teststringobj set 1 xyz teststringobj setlength 1 15 @@ -135,7 +136,7 @@ test stringObj-6.4 {Tcl_AppendStringsToObj procedure, counting space} testobj { teststringobj appendstrings 1 { 123 } abcdefg list [teststringobj length 1] [teststringobj get 1] } {15 {abc 123 abcdefg}} -test stringObj-6.5 {Tcl_AppendStringsToObj procedure, don't double space if initial string empty} {testobj fullutf} { +test stringObj-6.5 {Tcl_AppendStringsToObj procedure, don't double space if initial string empty} {testobj utf32} { testobj freeallvars testobj newobj 1 teststringobj appendstrings 1 123 abcdefg @@ -150,7 +151,7 @@ test stringObj-6.6 {Tcl_AppendStringsToObj procedure, space reallocation} testob list [teststringobj length 1] [teststringobj length2 1] \ [teststringobj get 1] } {10 10 ab34567890} -test stringObj-6.7 {Tcl_AppendStringsToObj procedure, space reallocation} {testobj fullutf} { +test stringObj-6.7 {Tcl_AppendStringsToObj procedure, space reallocation} {testobj utf32} { testobj freeallvars teststringobj set 1 abc teststringobj setlength 1 10 @@ -172,7 +173,7 @@ test stringObj-6.9 {Tcl_AppendStringToObj, pure unicode} testobj { teststringobj get 1 } adcfoobarsoom -test stringObj-7.1 {SetStringFromAny procedure} {testobj fullutf} { +test stringObj-7.1 {SetStringFromAny procedure} {testobj utf32} { testobj freeallvars teststringobj set2 1 [list a b] teststringobj append 1 x -1 @@ -197,7 +198,7 @@ test stringObj-7.4 {SetStringFromAny called with string obj} testobj { [string length $x] [testobj objtype $x] } {6 string 6 string} -test stringObj-8.1 {DupStringInternalRep procedure} {testobj fullutf} { +test stringObj-8.1 {DupStringInternalRep procedure} {testobj utf32} { testobj freeallvars teststringobj set 1 {} teststringobj append 1 abcde -1 diff --git a/tests/utf.test b/tests/utf.test index c79492e..c0d64e2 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -19,7 +19,7 @@ catch [list package require -exact tcl::test [info patchlevel]] testConstraint ucs2 [expr {[format %c 0x010000] eq "\uFFFD"}] testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] testConstraint utf16 [expr {[string length [format %c 0x10000]] == 2}] -testConstraint ucs4 [expr {[testConstraint fullutf] +testConstraint utf32 [expr {[testConstraint fullutf] && [string length [format %c 0x10000]] == 1}] testConstraint Uesc [expr {"\U0041" eq "A"}] @@ -131,7 +131,7 @@ test utf-2.8.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {ucs2 testb test utf-2.8.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} utf16 { string length 𐀀 } 2 -test utf-2.8.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} ucs4 { +test utf-2.8.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} utf32 { string length 𐀀 } 1 test utf-2.9.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {ucs2 testbytestring} { @@ -140,7 +140,7 @@ test utf-2.9.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} {ucs2 testb test utf-2.9.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} utf16 { string length \U10FFFF } 2 -test utf-2.9.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} ucs4 { +test utf-2.9.2 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} utf32 { string length \U10FFFF } 1 test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { @@ -194,7 +194,7 @@ test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfc test utf-4.12.0 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring ucs2} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end } 2 -test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring ucs4} { +test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring utf32} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] end } 1 test utf-4.13 {Tcl_NumUtfChars: end of string} {testnumutfchars testbytestring} { @@ -878,7 +878,7 @@ test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { test utf-8.5.0 {Tcl_UniCharAtIndex: high surrogate} ucs2 { string index \uD842 0 } \uD842 -test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} ucs4 { +test utf-8.5.1 {Tcl_UniCharAtIndex: high surrogate} utf32 { string index \uD842 0 } \uD842 test utf-8.5.2 {Tcl_UniCharAtIndex: high surrogate} utf16 { @@ -890,7 +890,7 @@ test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { test utf-8.7.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 0 } \uD83D -test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { +test utf-8.7.1 {Tcl_UniCharAtIndex: Emoji} utf32 { string index 😀G 0 } 😀 test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} utf16 { @@ -899,7 +899,7 @@ test utf-8.7.2 {Tcl_UniCharAtIndex: Emoji} utf16 { test utf-8.8.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 1 } \uDE00 -test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { +test utf-8.8.1 {Tcl_UniCharAtIndex: Emoji} utf32 { string index 😀G 1 } G test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} utf16 { @@ -908,7 +908,7 @@ test utf-8.8.2 {Tcl_UniCharAtIndex: Emoji} utf16 { test utf-8.9.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index \uD83D\uDE00G 2 } G -test utf-8.9.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { +test utf-8.9.1 {Tcl_UniCharAtIndex: Emoji} utf32 { string index 😀G 2 } {} test utf-8.9.2 {Tcl_UniCharAtIndex: Emoji} utf16 { @@ -917,7 +917,7 @@ test utf-8.9.2 {Tcl_UniCharAtIndex: Emoji} utf16 { test utf-8.10.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index 😀G 0 } \uFFFD -test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { +test utf-8.10.1 {Tcl_UniCharAtIndex: Emoji} utf32 { string index 😀G 0 } 😀 test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} utf16 { @@ -926,7 +926,7 @@ test utf-8.10.2 {Tcl_UniCharAtIndex: Emoji} utf16 { test utf-8.11.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index 😀G 1 } G -test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { +test utf-8.11.1 {Tcl_UniCharAtIndex: Emoji} utf32 { string index 😀G 1 } G test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} utf16 { @@ -935,7 +935,7 @@ test utf-8.11.2 {Tcl_UniCharAtIndex: Emoji} utf16 { test utf-8.12.0 {Tcl_UniCharAtIndex: Emoji} ucs2 { string index 😀G 2 } {} -test utf-8.12.1 {Tcl_UniCharAtIndex: Emoji} ucs4 { +test utf-8.12.1 {Tcl_UniCharAtIndex: Emoji} utf32 { string index 😀G 2 } {} test utf-8.12.2 {Tcl_UniCharAtIndex: Emoji} utf16 { @@ -951,7 +951,7 @@ test utf-9.2 {Tcl_UtfAtIndex: index > 0} { test utf-9.3.0 {Tcl_UtfAtIndex: index = 0, Emoji} ucs2 { string range \uD83D\uDE00G 0 0 } \uD83D -test utf-9.3.1 {Tcl_UtfAtIndex: index = 0, Emoji} ucs4 { +test utf-9.3.1 {Tcl_UtfAtIndex: index = 0, Emoji} utf32 { string range 😀G 0 0 } 😀 test utf-9.3.2 {Tcl_UtfAtIndex: index = 0, Emoji} utf16 { @@ -960,7 +960,7 @@ test utf-9.3.2 {Tcl_UtfAtIndex: index = 0, Emoji} utf16 { test utf-9.4.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { string range \uD83D\uDE00G 1 1 } \uDE00 -test utf-9.4.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 { +test utf-9.4.1 {Tcl_UtfAtIndex: index > 0, Emoji} utf32 { string range 😀G 1 1 } G test utf-9.4.2 {Tcl_UtfAtIndex: index > 0, Emoji} utf16 { @@ -969,7 +969,7 @@ test utf-9.4.2 {Tcl_UtfAtIndex: index > 0, Emoji} utf16 { test utf-9.5.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { string range \uD83D\uDE00G 2 2 } G -test utf-9.5.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 { +test utf-9.5.1 {Tcl_UtfAtIndex: index > 0, Emoji} utf32 { string range 😀G 2 2 } {} test utf-9.5.2 {Tcl_UtfAtIndex: index > 0, Emoji} utf16 { @@ -978,7 +978,7 @@ test utf-9.5.2 {Tcl_UtfAtIndex: index > 0, Emoji} utf16 { test utf-9.6.0 {Tcl_UtfAtIndex: index = 0, Emoji} ucs2 { string range 😀G 0 0 } \uFFFD -test utf-9.6.1 {Tcl_UtfAtIndex: index = 0, Emoji} ucs4 { +test utf-9.6.1 {Tcl_UtfAtIndex: index = 0, Emoji} utf32 { string range 😀G 0 0 } 😀 test utf-9.6.2 {Tcl_UtfAtIndex: index = 0, Emoji} utf16 { @@ -987,7 +987,7 @@ test utf-9.6.2 {Tcl_UtfAtIndex: index = 0, Emoji} utf16 { test utf-9.7.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { string range 😀G 1 1 } G -test utf-9.7.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 { +test utf-9.7.1 {Tcl_UtfAtIndex: index > 0, Emoji} utf32 { string range 😀G 1 1 } G test utf-9.7.2 {Tcl_UtfAtIndex: index > 0, Emoji} utf16 { @@ -996,7 +996,7 @@ test utf-9.7.2 {Tcl_UtfAtIndex: index > 0, Emoji} utf16 { test utf-9.8.0 {Tcl_UtfAtIndex: index > 0, Emoji} ucs2 { string range 😀G 2 2 } {} -test utf-9.8.1 {Tcl_UtfAtIndex: index > 0, Emoji} ucs4 { +test utf-9.8.1 {Tcl_UtfAtIndex: index > 0, Emoji} utf32 { string range 😀G 2 2 } {} test utf-9.8.2 {Tcl_UtfAtIndex: index > 0, Emoji} utf16 { @@ -1227,10 +1227,10 @@ test utf-19.1 {TclUniCharLen} -body { unset -nocomplain foo } -result {1 4} -test utf-20.1 {TclUniCharNcmp} ucs4 { +test utf-20.1 {TclUniCharNcmp} utf32 { string compare [string range [format %c 0xFFFF] 0 0] [string range [format %c 0x10000] 0 0] } -1 -test utf-20.2 {[4c591fa487] TclUniCharNcmp/TclUtfNcmp} ucs4 { +test utf-20.2 {[4c591fa487] TclUniCharNcmp/TclUtfNcmp} utf32 { set one [format %c 0xFFFF] set two [format %c 0x10000] set first [string compare $one $two] @@ -1357,10 +1357,10 @@ UniCharCaseCmpTest < a b UniCharCaseCmpTest > b a UniCharCaseCmpTest > B a UniCharCaseCmpTest > aBcB abca -UniCharCaseCmpTest < \uFFFF [format %c 0x10000] ucs4 -UniCharCaseCmpTest < \uFFFF \U10000 ucs4 -UniCharCaseCmpTest > [format %c 0x10000] \uFFFF ucs4 -UniCharCaseCmpTest > \U10000 \uFFFF ucs4 +UniCharCaseCmpTest < \uFFFF [format %c 0x10000] utf32 +UniCharCaseCmpTest < \uFFFF \U10000 utf32 +UniCharCaseCmpTest > [format %c 0x10000] \uFFFF utf32 +UniCharCaseCmpTest > \U10000 \uFFFF utf32 test utf-26.1 {Tcl_UniCharDString} -setup { -- cgit v0.12 From 1f3d3d7671748e4eb36cc0846e3e953bdb29c227 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 23 Mar 2022 13:31:43 +0000 Subject: Fix crash in (compabitility) "string" objType --- generic/tclStringObj.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index d43c507..9655479 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -208,13 +208,14 @@ SetUTF16StringFromAny( Tcl_DStringInit(&ds); unsigned short *utf16string = Tcl_UtfToChar16DString(objPtr->bytes, objPtr->length, &ds); - size_t size = Tcl_DStringLength(&ds); - String *stringPtr = (String *)ckalloc((offsetof(String, unicode) + 2U) + size); + int size = Tcl_DStringLength(&ds); + String *stringPtr = (String *)ckalloc((offsetof(String, unicode) + sizeof(unsigned short)) + size); + memcpy(stringPtr->unicode, utf16string, size); - stringPtr->unicode[size] = 0; Tcl_DStringFree(&ds); - size /= sizeof(unsigned short); + stringPtr->unicode[size] = 0; + stringPtr->numChars = size; stringPtr->allocated = size; stringPtr->maxChars = size; @@ -222,7 +223,7 @@ SetUTF16StringFromAny( objPtr->internalRep.twoPtrValue.ptr1 = stringPtr; objPtr->typePtr = &tclStringType; } - return TCL_OK; + return TCL_OK; } static void @@ -237,10 +238,10 @@ UpdateStringOfUTF16String( char *bytes = (char *)ckalloc(Tcl_DStringLength(&ds) + 1U); memcpy(bytes, string, Tcl_DStringLength(&ds)); + bytes[Tcl_DStringLength(&ds)] = 0; Tcl_DStringFree(&ds); objPtr->bytes = bytes; objPtr->length = Tcl_DStringLength(&ds); - printf("UpdateStringOfUTF16String %d %d\n", stringPtr->unicode[0], stringPtr->unicode[1]); } #endif -- cgit v0.12 From 369bc9b1594ea304387a97eef5658a5998699534 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 23 Mar 2022 14:32:08 +0000 Subject: Fix Tcl_UniCharAtIndex() for UTF-16 compabitility layer --- generic/tclUtf.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index eda317f..cfd9915 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1182,22 +1182,20 @@ Tcl_UniCharAtIndex( const char *src, /* The UTF-8 string to dereference. */ int index) /* The position of the desired character. */ { - Tcl_UniChar ch = 0; + unsigned short ch = 0; int i = 0; if (index < 0) { return -1; } while (index-- > 0) { - i = TclUtfToUniChar(src, &ch); + i = Tcl_UtfToChar16(src, &ch); src += i; } -#if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (i < 3)) { /* Index points at character following high Surrogate */ return -1; } -#endif TclUtfToUCS4(src, &i); return i; } -- cgit v0.12 From af426f828e2e3391d7b4ed399040300821a156e9 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 23 Mar 2022 16:19:54 +0000 Subject: Put back TclNumUtfChars() macro as TclNumUtfCharsM() --- generic/tclInt.h | 4 ++-- generic/tclStringObj.c | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/generic/tclInt.h b/generic/tclInt.h index 73d6386..3aa2626 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -4768,14 +4768,14 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; *---------------------------------------------------------------- */ -#define TclNumUtfChars_UNUSED(numChars, bytes, numBytes) \ +#define TclNumUtfCharsM(numChars, bytes, numBytes) \ do { \ int _count, _i = (numBytes); \ unsigned char *_str = (unsigned char *) (bytes); \ while (_i && (*_str < 0xC0)) { _i--; _str++; } \ _count = (numBytes) - _i; \ if (_i) { \ - _count += Tcl_NumUtfChars((bytes) + _count, _i); \ + _count += TclNumUtfChars((bytes) + _count, _i); \ } \ (numChars) = _count; \ } while (0); diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 9655479..784ed44 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -55,14 +55,14 @@ static void AppendUtfToUtfRep(Tcl_Obj *objPtr, const char *bytes, int numBytes); static void DupStringInternalRep(Tcl_Obj *objPtr, Tcl_Obj *copyPtr); -static int ExtendUniCharStringRepWithUniCharString(Tcl_Obj *objPtr, +static int ExtendStringRepWithUnicode(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); static void ExtendUnicodeRepWithString(Tcl_Obj *objPtr, const char *bytes, int numBytes, int numAppendChars); static void FillUnicodeRep(Tcl_Obj *objPtr); static void FreeStringInternalRep(Tcl_Obj *objPtr); -static void GrowUniCharStringBuffer(Tcl_Obj *objPtr, int needed, int flag); +static void GrowStringBuffer(Tcl_Obj *objPtr, int needed, int flag); static void GrowUnicodeBuffer(Tcl_Obj *objPtr, int needed); static int SetStringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); static void SetUnicodeObj(Tcl_Obj *objPtr, @@ -284,7 +284,7 @@ UpdateStringOfUTF16String( #endif static void -GrowUniCharStringBuffer( +GrowStringBuffer( Tcl_Obj *objPtr, int needed, int flag) @@ -638,7 +638,7 @@ TclGetCharLength( */ if (numChars == -1) { - numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); + TclNumUtfCharsM(numChars, objPtr->bytes, objPtr->length); stringPtr->numChars = numChars; } return numChars; @@ -783,7 +783,7 @@ Tcl_GetUniChar( */ if (stringPtr->numChars == -1) { - stringPtr->numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); + TclNumUtfCharsM(stringPtr->numChars, objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { return (unsigned char) objPtr->bytes[index]; @@ -928,7 +928,7 @@ TclGetUnicodeFromObj( } return stringPtr->unicode; } - + /* *---------------------------------------------------------------------- * @@ -992,7 +992,7 @@ Tcl_GetRange( */ if (stringPtr->numChars == -1) { - stringPtr->numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); + TclNumUtfCharsM(stringPtr->numChars, objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { if (last < 0 || last >= stringPtr->numChars) { @@ -1884,7 +1884,7 @@ AppendUnicodeToUtfRep( { UniCharString *stringPtr = GET_UNICHAR_STRING(objPtr); - numChars = ExtendUniCharStringRepWithUniCharString(objPtr, unicode, numChars); + numChars = ExtendStringRepWithUnicode(objPtr, unicode, numChars); if (stringPtr->numChars != -1) { stringPtr->numChars += numChars; @@ -1992,7 +1992,7 @@ AppendUtfToUtfRep( * would make test stringObj-8.1 fail. */ - GrowUniCharStringBuffer(objPtr, newLength, 0); + GrowStringBuffer(objPtr, newLength, 0); /* * Relocate bytes if needed; see above. @@ -4448,7 +4448,7 @@ ExtendUnicodeRepWithString( numOrigChars = stringPtr->numChars; } if (numAppendChars == -1) { - numAppendChars = Tcl_NumUtfChars(bytes, numBytes); + TclNumUtfCharsM(numAppendChars, bytes, numBytes); } needed = numOrigChars + numAppendChars; uniCharStringCheckLimits(needed); @@ -4643,13 +4643,13 @@ UpdateStringOfString( if (stringPtr->numChars == 0) { TclInitStringRep(objPtr, NULL, 0); } else { - (void) ExtendUniCharStringRepWithUniCharString(objPtr, stringPtr->unicode, + (void) ExtendStringRepWithUnicode(objPtr, stringPtr->unicode, stringPtr->numChars); } } static int -ExtendUniCharStringRepWithUniCharString( +ExtendStringRepWithUnicode( Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars) @@ -4696,7 +4696,7 @@ ExtendUniCharStringRepWithUniCharString( */ if (size > stringPtr->allocated) { - GrowUniCharStringBuffer(objPtr, size, 1); + GrowStringBuffer(objPtr, size, 1); } copyBytes: -- cgit v0.12 From 66cbd6ac71e01082f9e8b0e088cd829defdf9886 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 23 Mar 2022 16:33:41 +0000 Subject: Fix for UpdateStringOfUTF16String() --- generic/tclStringObj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 784ed44..17c7067 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -239,9 +239,9 @@ UpdateStringOfUTF16String( char *bytes = (char *)ckalloc(Tcl_DStringLength(&ds) + 1U); memcpy(bytes, string, Tcl_DStringLength(&ds)); bytes[Tcl_DStringLength(&ds)] = 0; - Tcl_DStringFree(&ds); objPtr->bytes = bytes; objPtr->length = Tcl_DStringLength(&ds); + Tcl_DStringFree(&ds); } #endif -- cgit v0.12 From 4e3426067cddf752f0c15e8a2ce29f5f9052341f Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 24 Mar 2022 10:48:44 +0000 Subject: When compiled with TCL_NO_DEPRECATED, remove the UTF16 compatibility layer. So, we make sure that it is never used internally for the Core. This means that extensions using the compatibility layer won't work any more in this mode; extensions should be compiled using TCL_UTF_MAX=4 then they work again. --- generic/tclExecute.c | 4 ++-- generic/tclInt.h | 2 ++ generic/tclObj.c | 2 ++ generic/tclStringObj.c | 45 ++++++++++++++++++++++++++------------------- generic/tclStubInit.c | 9 +++++++++ generic/tclTestObj.c | 24 ++++++++++++++++-------- generic/tclUtf.c | 4 ++-- generic/tclUtil.c | 2 +- tests/stringObj.test | 26 +++++++++++++------------- 9 files changed, 73 insertions(+), 45 deletions(-) diff --git a/generic/tclExecute.c b/generic/tclExecute.c index f09f75c..0456146 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5592,8 +5592,8 @@ TEBCresume( * both. */ - if (TclHasInternalRep(valuePtr, &tclStringType) - || TclHasInternalRep(value2Ptr, &tclStringType)) { + if (TclHasInternalRep(valuePtr, &tclUniCharStringType) + || TclHasInternalRep(value2Ptr, &tclUniCharStringType)) { Tcl_UniChar *ustring1, *ustring2; ustring1 = TclGetUnicodeFromObj_(valuePtr, &length); diff --git a/generic/tclInt.h b/generic/tclInt.h index 3aa2626..6804996 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2768,6 +2768,7 @@ MODULE_SCOPE const Tcl_ObjType tclListType; MODULE_SCOPE const Tcl_ObjType tclDictType; MODULE_SCOPE const Tcl_ObjType tclProcBodyType; MODULE_SCOPE const Tcl_ObjType tclStringType; +MODULE_SCOPE const Tcl_ObjType tclUniCharStringType; MODULE_SCOPE const Tcl_ObjType tclEnsembleCmdType; MODULE_SCOPE const Tcl_ObjType tclRegexpType; MODULE_SCOPE Tcl_ObjType tclCmdNameType; @@ -3333,6 +3334,7 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); # undef Tcl_UtfAtIndex # define Tcl_UtfAtIndex TclUtfAtIndex #else +# define tclUniCharStringType tclStringType # define TclGetUnicodeFromObj_ Tcl_GetUnicodeFromObj # define TclNewUnicodeObj Tcl_NewUnicodeObj # define TclAppendUnicodeToObj Tcl_AppendUnicodeToObj diff --git a/generic/tclObj.c b/generic/tclObj.c index a06b8fd..7596880 100644 --- a/generic/tclObj.c +++ b/generic/tclObj.c @@ -387,7 +387,9 @@ TclInitObjSubsystem(void) Tcl_RegisterObjType(&tclByteArrayType); Tcl_RegisterObjType(&tclDoubleType); +#if (TCL_UTF_MAX) > 3 && !defined(TCL_NO_DEPRECATED) Tcl_RegisterObjType(&tclStringType); +#endif Tcl_RegisterObjType(&tclListType); Tcl_RegisterObjType(&tclDictType); Tcl_RegisterObjType(&tclByteCodeType); diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 17c7067..627fadc 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -69,7 +69,7 @@ static void SetUnicodeObj(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); static int UnicodeLength(const Tcl_UniChar *unicode); static void UpdateStringOfString(Tcl_Obj *objPtr); -#if TCL_UTF_MAX > 3 +#if (TCL_UTF_MAX) > 3 && !defined(TCL_NO_DEPRECATED) static void DupUTF16StringInternalRep(Tcl_Obj *objPtr, Tcl_Obj *copyPtr); static int SetUTF16StringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); @@ -88,7 +88,7 @@ static void UpdateStringOfUTF16String(Tcl_Obj *objPtr); #if TCL_UTF_MAX < 4 -#define uniCharStringType tclStringType +#define tclUniCharStringType tclStringType #define GET_UNICHAR_STRING GET_STRING #define UniCharString String #define UNICHAR_STRING_MAXCHARS STRING_MAXCHARS @@ -110,6 +110,8 @@ const Tcl_ObjType tclStringType = { #else +#define tclStringType xxx +#ifndef TCL_NO_DEPRECATED const Tcl_ObjType tclStringType = { "string", /* name */ FreeStringInternalRep, /* freeIntRepPro */ @@ -117,8 +119,9 @@ const Tcl_ObjType tclStringType = { UpdateStringOfUTF16String, /* updateStringProc */ SetUTF16StringFromAny /* setFromAnyProc */ }; +#endif -static const Tcl_ObjType uniCharStringType = { +const Tcl_ObjType tclUniCharStringType = { "utf32string", /* name */ FreeStringInternalRep, /* freeIntRepPro */ DupStringInternalRep, /* dupIntRepProc */ @@ -170,6 +173,7 @@ typedef struct { ((objPtr)->internalRep.twoPtrValue.ptr1 = (void *) (stringPtr)) +#ifndef TCL_NO_DEPRECATED static void DupUTF16StringInternalRep( Tcl_Obj *srcPtr, /* Object with internal rep to copy. Must have @@ -243,6 +247,7 @@ UpdateStringOfUTF16String( objPtr->length = Tcl_DStringLength(&ds); Tcl_DStringFree(&ds); } +#endif #endif @@ -545,7 +550,7 @@ TclNewUnicodeObj( return objPtr; } -#if TCL_UTF_MAX > 3 +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) Tcl_Obj * Tcl_NewUnicodeObj( const unsigned short *unicode, /* The unicode string used to initialize the @@ -644,7 +649,7 @@ TclGetCharLength( return numChars; } -#if TCL_UTF_MAX > 3 +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) #undef Tcl_GetCharLength int Tcl_GetCharLength( @@ -889,7 +894,7 @@ TclGetUnicodeFromObj_( return stringPtr->unicode; } -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 3 && !defined(TCL_NO_DEPRECATED) unsigned short * Tcl_GetUnicodeFromObj( Tcl_Obj *objPtr, /* The object to find the unicode string @@ -1327,6 +1332,7 @@ Tcl_AttemptSetObjLength( *--------------------------------------------------------------------------- */ +#if !defined(TCL_NO_DEPRECATED) void Tcl_SetUnicodeObj( Tcl_Obj *objPtr, /* The object to set the string of. */ @@ -1366,6 +1372,7 @@ Tcl_SetUnicodeObj( TclInvalidateStringRep(objPtr); stringPtr->allocated = numChars; } +#endif static int UnicodeLength( @@ -1403,7 +1410,7 @@ SetUnicodeObj( uniCharStringCheckLimits(numChars); stringPtr = uniCharStringAlloc(numChars); SET_UNICHAR_STRING(objPtr, stringPtr); - objPtr->typePtr = &uniCharStringType; + objPtr->typePtr = &tclUniCharStringType; stringPtr->maxChars = numChars; memcpy(stringPtr->unicode, unicode, numChars * sizeof(Tcl_UniChar)); @@ -1591,7 +1598,7 @@ TclAppendUnicodeToObj( } } -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 3 && !defined(TCL_NO_DEPRECATED) void Tcl_AppendUnicodeToObj( Tcl_Obj *objPtr, /* Points to the object to append to. */ @@ -1736,7 +1743,7 @@ Tcl_AppendObjToObj( * If appendObjPtr is not of the "String" type, don't convert it. */ - if (TclHasInternalRep(appendObjPtr, &uniCharStringType)) { + if (TclHasInternalRep(appendObjPtr, &tclUniCharStringType)) { Tcl_UniChar *unicode = TclGetUnicodeFromObj_(appendObjPtr, &numChars); @@ -1757,7 +1764,7 @@ Tcl_AppendObjToObj( bytes = TclGetStringFromObj(appendObjPtr, &length); numChars = stringPtr->numChars; - if ((numChars >= 0) && TclHasInternalRep(appendObjPtr, &uniCharStringType)) { + if ((numChars >= 0) && TclHasInternalRep(appendObjPtr, &tclUniCharStringType)) { UniCharString *appendStringPtr = GET_UNICHAR_STRING(appendObjPtr); appendNumChars = appendStringPtr->numChars; @@ -3166,7 +3173,7 @@ TclGetStringStorage( { UniCharString *stringPtr; - if (!TclHasInternalRep(objPtr, &uniCharStringType) || objPtr->bytes == NULL) { + if (!TclHasInternalRep(objPtr, &tclUniCharStringType) || objPtr->bytes == NULL) { return TclGetStringFromObj(objPtr, (int *)sizePtr); } @@ -3214,7 +3221,7 @@ TclStringRepeat( */ if (!binary) { - if (TclHasInternalRep(objPtr, &uniCharStringType)) { + if (TclHasInternalRep(objPtr, &tclUniCharStringType)) { UniCharString *stringPtr = GET_UNICHAR_STRING(objPtr); if (stringPtr->hasUnicode) { unichar = 1; @@ -3385,7 +3392,7 @@ TclStringCat( binary = 0; if (ov > objv+1 && ISCONTINUATION(TclGetString(objPtr))) { forceUniChar = 1; - } else if ((objPtr->typePtr) && (objPtr->typePtr != &uniCharStringType)) { + } else if ((objPtr->typePtr) && (objPtr->typePtr != &tclUniCharStringType)) { /* Prevent shimmer of non-string types. */ allowUniChar = 0; } @@ -3393,7 +3400,7 @@ TclStringCat( } else { /* assert (objPtr->typePtr != NULL) -- stork! */ binary = 0; - if (TclHasInternalRep(objPtr, &uniCharStringType)) { + if (TclHasInternalRep(objPtr, &tclUniCharStringType)) { /* Have a pure Unicode value; ask to preserve it */ requestUniChar = 1; } else { @@ -3746,8 +3753,8 @@ TclStringCmp( s1 = (char *) Tcl_GetByteArrayFromObj(value1Ptr, &s1len); s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len); memCmpFn = memcmp; - } else if (TclHasInternalRep(value1Ptr, &uniCharStringType) - && TclHasInternalRep(value2Ptr, &uniCharStringType)) { + } else if (TclHasInternalRep(value1Ptr, &tclUniCharStringType) + && TclHasInternalRep(value2Ptr, &tclUniCharStringType)) { /* * Do a unicode-specific comparison if both of the args are of * String type. If the char length == byte length, we can do a @@ -4556,7 +4563,7 @@ DupStringInternalRep( copyStringPtr->allocated = copyPtr->bytes ? copyPtr->length : 0; SET_UNICHAR_STRING(copyPtr, copyStringPtr); - copyPtr->typePtr = &uniCharStringType; + copyPtr->typePtr = &tclUniCharStringType; } /* @@ -4581,7 +4588,7 @@ SetStringFromAny( TCL_UNUSED(Tcl_Interp *), Tcl_Obj *objPtr) /* The object to convert. */ { - if (!TclHasInternalRep(objPtr, &uniCharStringType)) { + if (!TclHasInternalRep(objPtr, &tclUniCharStringType)) { UniCharString *stringPtr = uniCharStringAlloc(0); /* @@ -4601,7 +4608,7 @@ SetStringFromAny( stringPtr->maxChars = 0; stringPtr->hasUnicode = 0; SET_UNICHAR_STRING(objPtr, stringPtr); - objPtr->typePtr = &uniCharStringType; + objPtr->typePtr = &tclUniCharStringType; } return TCL_OK; } diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 9757ad2..7c8c219 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -87,6 +87,15 @@ #define TclUtfNext UtfNext #define TclUtfPrev UtfPrev +#if TCL_UTF_MAX > 3 && defined(TCL_NO_DEPRECATED) +#define Tcl_GetUnicodeFromObj 0 +#define Tcl_AppendUnicodeToObj 0 +#define Tcl_NewUnicodeObj 0 +#define Tcl_SetUnicodeObj 0 +#define Tcl_UtfAtIndex 0 +#define Tcl_GetCharLength 0 +#endif + static int TclUtfCharComplete(const char *src, int length) { if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { return length < 3; diff --git a/generic/tclTestObj.c b/generic/tclTestObj.c index 9884a9a..223eb98 100644 --- a/generic/tclTestObj.c +++ b/generic/tclTestObj.c @@ -1264,10 +1264,14 @@ TeststringobjCmd( goto wrongNumArgs; } if (varPtr[varIndex] != NULL) { - Tcl_ConvertToType(NULL, varPtr[varIndex], - Tcl_GetObjType("string")); - strPtr = (String *)varPtr[varIndex]->internalRep.twoPtrValue.ptr1; - length = (int) strPtr->allocated; + const Tcl_ObjType *objType = Tcl_GetObjType("string"); + if (objType != NULL) { + Tcl_ConvertToType(NULL, varPtr[varIndex], objType); + strPtr = (String *)varPtr[varIndex]->internalRep.twoPtrValue.ptr1; + length = (int) strPtr->allocated; + } else { + length = -1; + } } else { length = -1; } @@ -1318,10 +1322,14 @@ TeststringobjCmd( goto wrongNumArgs; } if (varPtr[varIndex] != NULL) { - Tcl_ConvertToType(NULL, varPtr[varIndex], - Tcl_GetObjType("string")); - strPtr = (String *)varPtr[varIndex]->internalRep.twoPtrValue.ptr1; - length = strPtr->maxChars; + const Tcl_ObjType *objType = Tcl_GetObjType("string"); + if (objType != NULL) { + Tcl_ConvertToType(NULL, varPtr[varIndex],objType); + strPtr = (String *)varPtr[varIndex]->internalRep.twoPtrValue.ptr1; + length = strPtr->maxChars; + } else { + length = -1; + } } else { length = -1; } diff --git a/generic/tclUtf.c b/generic/tclUtf.c index cfd9915..2a335d7 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -850,7 +850,7 @@ TclNumUtfChars( return i; } -#if TCL_UTF_MAX > 3 +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) #undef Tcl_NumUtfChars int Tcl_NumUtfChars( @@ -1245,7 +1245,7 @@ TclUtfAtIndex( return src; } -#if TCL_UTF_MAX > 3 +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) #undef Tcl_UtfAtIndex const char * Tcl_UtfAtIndex( diff --git a/generic/tclUtil.c b/generic/tclUtil.c index 9cc82cb..3537ecc 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -2591,7 +2591,7 @@ TclStringMatchObj( trivial = nocase ? 0 : TclMatchIsTrivial(TclGetString(ptnObj)); */ - if (TclHasInternalRep(strObj, &tclStringType) || (strObj->typePtr == NULL)) { + if (TclHasInternalRep(strObj, &tclUniCharStringType) || (strObj->typePtr == NULL)) { Tcl_UniChar *udata, *uptn; udata = TclGetUnicodeFromObj_(strObj, &length); diff --git a/tests/stringObj.test b/tests/stringObj.test index c11bf7f..0aa9a47 100644 --- a/tests/stringObj.test +++ b/tests/stringObj.test @@ -27,7 +27,7 @@ testConstraint testdstring [llength [info commands testdstring]] testConstraint tip389 [expr {[string length \U010000] == 2}] testConstraint utf32 [expr {[string length [format %c 0x10000]] == 1}] -test stringObj-1.1 {string type registration} testobj { +test stringObj-1.1 {string type registration} {testobj deprecated} { set t [testobj types] set first [string first "string" $t] set result [expr {$first >= 0}] @@ -58,27 +58,27 @@ test stringObj-3.2 {Tcl_SetStringObj, existing non-"empty string" object} testob lappend result [testobj refcount 1] } {{} 512 foo string 2} -test stringObj-4.1 {Tcl_SetObjLength procedure, string gets shorter} {testobj utf32} { +test stringObj-4.1 {Tcl_SetObjLength procedure, string gets shorter} {testobj utf32 deprecated} { testobj freeallvars teststringobj set 1 test teststringobj setlength 1 3 list [teststringobj length 1] [teststringobj length2 1] \ [teststringobj get 1] } {3 3 tes} -test stringObj-4.2 {Tcl_SetObjLength procedure, string gets longer} testobj { +test stringObj-4.2 {Tcl_SetObjLength procedure, string gets longer} {testobj deprecated} { testobj freeallvars teststringobj set 1 abcdef teststringobj setlength 1 10 list [teststringobj length 1] [teststringobj length2 1] } {10 10} -test stringObj-4.3 {Tcl_SetObjLength procedure, string gets longer} {testobj utf32} { +test stringObj-4.3 {Tcl_SetObjLength procedure, string gets longer} {testobj utf32 deprecated} { testobj freeallvars teststringobj set 1 abcdef teststringobj append 1 xyzq -1 list [teststringobj length 1] [teststringobj length2 1] \ [teststringobj get 1] } {10 10 abcdefxyzq} -test stringObj-4.4 {Tcl_SetObjLength procedure, "expty string", length 0} testobj { +test stringObj-4.4 {Tcl_SetObjLength procedure, "expty string", length 0} {testobj deprecated} { testobj freeallvars testobj newobj 1 teststringobj setlength 1 0 @@ -98,7 +98,7 @@ test stringObj-5.2 {Tcl_AppendToObj procedure, length calculation} testobj { teststringobj append 1 123 -1 teststringobj get 1 } {x y bbCC123} -test stringObj-5.3 {Tcl_AppendToObj procedure, reallocating space} {testobj utf32} { +test stringObj-5.3 {Tcl_AppendToObj procedure, reallocating space} {testobj utf32 deprecated} { testobj freeallvars teststringobj set 1 xyz teststringobj setlength 1 15 @@ -136,13 +136,13 @@ test stringObj-6.4 {Tcl_AppendStringsToObj procedure, counting space} testobj { teststringobj appendstrings 1 { 123 } abcdefg list [teststringobj length 1] [teststringobj get 1] } {15 {abc 123 abcdefg}} -test stringObj-6.5 {Tcl_AppendStringsToObj procedure, don't double space if initial string empty} {testobj utf32} { +test stringObj-6.5 {Tcl_AppendStringsToObj procedure, don't double space if initial string empty} {testobj utf32 deprecated} { testobj freeallvars testobj newobj 1 teststringobj appendstrings 1 123 abcdefg list [teststringobj length 1] [teststringobj length2 1] [teststringobj get 1] } {10 10 123abcdefg} -test stringObj-6.6 {Tcl_AppendStringsToObj procedure, space reallocation} testobj { +test stringObj-6.6 {Tcl_AppendStringsToObj procedure, space reallocation} {testobj deprecated} { testobj freeallvars teststringobj set 1 abc teststringobj setlength 1 10 @@ -151,7 +151,7 @@ test stringObj-6.6 {Tcl_AppendStringsToObj procedure, space reallocation} testob list [teststringobj length 1] [teststringobj length2 1] \ [teststringobj get 1] } {10 10 ab34567890} -test stringObj-6.7 {Tcl_AppendStringsToObj procedure, space reallocation} {testobj utf32} { +test stringObj-6.7 {Tcl_AppendStringsToObj procedure, space reallocation} {testobj utf32 deprecated} { testobj freeallvars teststringobj set 1 abc teststringobj setlength 1 10 @@ -160,7 +160,7 @@ test stringObj-6.7 {Tcl_AppendStringsToObj procedure, space reallocation} {testo list [teststringobj length 1] [teststringobj length2 1] \ [teststringobj get 1] } {11 11 ab34567890x} -test stringObj-6.8 {Tcl_AppendStringsToObj procedure, object totally empty} testobj { +test stringObj-6.8 {Tcl_AppendStringsToObj procedure, object totally empty} {testobj deprecated} { testobj freeallvars testobj newobj 1 teststringobj appendstrings 1 {} @@ -173,14 +173,14 @@ test stringObj-6.9 {Tcl_AppendStringToObj, pure unicode} testobj { teststringobj get 1 } adcfoobarsoom -test stringObj-7.1 {SetStringFromAny procedure} {testobj utf32} { +test stringObj-7.1 {SetStringFromAny procedure} {testobj utf32 deprecated} { testobj freeallvars teststringobj set2 1 [list a b] teststringobj append 1 x -1 list [teststringobj length 1] [teststringobj length2 1] \ [teststringobj get 1] } {4 4 {a bx}} -test stringObj-7.2 {SetStringFromAny procedure, null object} testobj { +test stringObj-7.2 {SetStringFromAny procedure, null object} {testobj deprecated} { testobj freeallvars testobj newobj 1 teststringobj appendstrings 1 {} @@ -198,7 +198,7 @@ test stringObj-7.4 {SetStringFromAny called with string obj} testobj { [string length $x] [testobj objtype $x] } {6 string 6 string} -test stringObj-8.1 {DupStringInternalRep procedure} {testobj utf32} { +test stringObj-8.1 {DupStringInternalRep procedure} {testobj utf32 deprecated} { testobj freeallvars teststringobj set 1 {} teststringobj append 1 abcde -1 -- cgit v0.12 From 0febd13af2d2275d3b63ef191462fc4fea18db99 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 24 Mar 2022 11:34:39 +0000 Subject: Implement PANIC when the UTF16 compatibility layer is used in combination with -DTCL_NO_DEPRECATED --- generic/tclStubInit.c | 28 ++++++++++++++++++---------- generic/tclUtf.c | 6 +++--- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 7c8c219..131053a 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -83,19 +83,27 @@ #define TclWinConvertError (void (*)(DWORD))(void *)Tcl_WinConvertError #endif -#define TclUtfCharComplete UtfCharComplete -#define TclUtfNext UtfNext -#define TclUtfPrev UtfPrev #if TCL_UTF_MAX > 3 && defined(TCL_NO_DEPRECATED) -#define Tcl_GetUnicodeFromObj 0 -#define Tcl_AppendUnicodeToObj 0 -#define Tcl_NewUnicodeObj 0 -#define Tcl_SetUnicodeObj 0 -#define Tcl_UtfAtIndex 0 -#define Tcl_GetCharLength 0 +static void uniCodePanic(void) { + Tcl_Panic("Tcl is compiled without the the UTF16 compatibility layer (-DTCL_NO_DEPRECATED)"); +} +# define Tcl_GetUnicode (unsigned short *(*)(Tcl_Obj *))(void *)uniCodePanic +# define Tcl_GetUnicodeFromObj (unsigned short *(*)(Tcl_Obj *, int *))(void *)uniCodePanic +# define Tcl_NewUnicodeObj (Tcl_Obj *(*)(const unsigned short *, int))(void *)uniCodePanic +# define Tcl_SetUnicodeObj (void(*)(Tcl_Obj *, const unsigned short *, int))(void *)uniCodePanic +# define Tcl_AppendUnicodeToObj (void(*)(Tcl_Obj *, const unsigned short *, int))(void *)uniCodePanic +# define Tcl_UtfAtIndex (const char *(*)(const char *, int))(void *)uniCodePanic +# define Tcl_GetCharLength (int(*)(Tcl_Obj *))(void *)uniCodePanic +# define Tcl_UniCharNcmp (int(*)(const unsigned short *, const unsigned short *, unsigned long))(void *)uniCodePanic +# define Tcl_UniCharNcasecmp (int(*)(const unsigned short *, const unsigned short *, unsigned long))(void *)uniCodePanic +# define Tcl_UniCharCaseMatch (int(*)(const unsigned short *, const unsigned short *, int))(void *)uniCodePanic #endif +#define TclUtfCharComplete UtfCharComplete +#define TclUtfNext UtfNext +#define TclUtfPrev UtfPrev + static int TclUtfCharComplete(const char *src, int length) { if ((unsigned)((unsigned char)*(src) - 0xF0) < 5) { return length < 3; @@ -679,8 +687,8 @@ static int utfNcasecmp(const char *s1, const char *s2, unsigned int n){ # define Tcl_SetExitProc 0 # define Tcl_SetPanicProc 0 # define Tcl_FindExecutable 0 -# define Tcl_GetUnicode 0 #if TCL_UTF_MAX < 4 +# define Tcl_GetUnicode 0 # define Tcl_AppendUnicodeToObj 0 # define Tcl_UniCharCaseMatch 0 # define Tcl_UniCharNcasecmp 0 diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 2a335d7..82adf65 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -1957,7 +1957,7 @@ TclUniCharNcmp( #endif /* WORDS_BIGENDIAN */ } -#if TCL_UTF_MAX > 3 +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) int Tcl_UniCharNcmp( const unsigned short *ucs, /* Unicode string to compare to uct. */ @@ -2028,7 +2028,7 @@ TclUniCharNcasecmp( return 0; } -#if TCL_UTF_MAX > 3 +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) int Tcl_UniCharNcasecmp( const unsigned short *ucs, /* Unicode string to compare to uct. */ @@ -2584,7 +2584,7 @@ TclUniCharCaseMatch( } } -#if TCL_UTF_MAX > 3 +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) int Tcl_UniCharCaseMatch( const unsigned short *uniStr, /* Unicode String. */ -- cgit v0.12 From bab10ccf5c152dec04a0eed4fac248b749af4fd9 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 24 Mar 2022 12:43:05 +0000 Subject: Add TclGetRange() to the compatibility set --- generic/tcl.decls | 3 +++ generic/tclDecls.h | 5 ++++ generic/tclInt.h | 6 +++-- generic/tclStringObj.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++- generic/tclStubInit.c | 3 +++ 5 files changed, 77 insertions(+), 3 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index cb6e282..f6a4c05 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2463,6 +2463,9 @@ declare 670 { declare 671 { const char *TclUtfAtIndex(const char *src, int index) } +declare 672 { + Tcl_Obj *TclGetRange(Tcl_Obj *objPtr, int first, int last) +} # ----- BASELINE -- FOR -- 8.7.0 ----- # diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 06ee797..fdf8673 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1969,6 +1969,8 @@ EXTERN int TclNumUtfChars(const char *src, int length); EXTERN int TclGetCharLength(Tcl_Obj *objPtr); /* 671 */ EXTERN const char * TclUtfAtIndex(const char *src, int index); +/* 672 */ +EXTERN Tcl_Obj * TclGetRange(Tcl_Obj *objPtr, int first, int last); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2676,6 +2678,7 @@ typedef struct TclStubs { int (*tclNumUtfChars) (const char *src, int length); /* 669 */ int (*tclGetCharLength) (Tcl_Obj *objPtr); /* 670 */ const char * (*tclUtfAtIndex) (const char *src, int index); /* 671 */ + Tcl_Obj * (*tclGetRange) (Tcl_Obj *objPtr, int first, int last); /* 672 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -4043,6 +4046,8 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tclGetCharLength) /* 670 */ #define TclUtfAtIndex \ (tclStubsPtr->tclUtfAtIndex) /* 671 */ +#define TclGetRange \ + (tclStubsPtr->tclGetRange) /* 672 */ #endif /* defined(USE_TCL_STUBS) */ diff --git a/generic/tclInt.h b/generic/tclInt.h index 6804996..7486d60 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3333,6 +3333,8 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); # define Tcl_GetCharLength TclGetCharLength # undef Tcl_UtfAtIndex # define Tcl_UtfAtIndex TclUtfAtIndex +# undef Tcl_GetRange +# define Tcl_GetRange TclGetRange #else # define tclUniCharStringType tclStringType # define TclGetUnicodeFromObj_ Tcl_GetUnicodeFromObj @@ -3345,8 +3347,8 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); # define TclNumUtfChars Tcl_NumUtfChars # undef TclGetCharLength # define TclGetCharLength Tcl_GetCharLength -# undef TclUtfAtIndex -# define TclUtfAtIndex Tcl_UtfAtIndex +# undef TclGetRange +# define TclGetRange Tcl_GetRange #endif diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 627fadc..332d1d2 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -110,7 +110,6 @@ const Tcl_ObjType tclStringType = { #else -#define tclStringType xxx #ifndef TCL_NO_DEPRECATED const Tcl_ObjType tclStringType = { "string", /* name */ @@ -952,6 +951,8 @@ TclGetUnicodeFromObj( *---------------------------------------------------------------------- */ +#if TCL_UTF_MAX > 3 && !defined(TCL_NO_DEPRECATED) +#undef Tcl_GetRange Tcl_Obj * Tcl_GetRange( Tcl_Obj *objPtr, /* The Tcl object to find the range of. */ @@ -959,6 +960,66 @@ Tcl_GetRange( int last) /* Last index of the range. */ { Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */ + String *stringPtr; + int length; + + if (first < 0) { + first = 0; + } + + /* + * Optimize the case where we're really dealing with a bytearray object + * we don't need to convert to a string to perform the substring operation. + */ + + if (TclIsPureByteArray(objPtr)) { + unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length); + + if (last < 0 || last >= length) { + last = length - 1; + } + if (last < first) { + TclNewObj(newObjPtr); + return newObjPtr; + } + return Tcl_NewByteArrayObj(bytes + first, last - first + 1); + } + + /* + * OK, need to work with the object as a utf16 string. + */ + + SetUTF16StringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + + if (last < 0 || last >= stringPtr->numChars) { + last = stringPtr->numChars - 1; + } + if (last < first) { + TclNewObj(newObjPtr); + return newObjPtr; + } + /* See: bug [11ae2be95dac9417] */ + if ((first > 0) && ((stringPtr->unicode[first] & 0xFC00) == 0xDC00) + && ((stringPtr->unicode[first-1] & 0xFC00) == 0xD800)) { + ++first; + } + if ((last + 1 < stringPtr->numChars) + && ((stringPtr->unicode[last+1] & 0xFC00) == 0xDC00) + && ((stringPtr->unicode[last] & 0xFC00) == 0xD800)) { + ++last; + } + return Tcl_NewUnicodeObj(stringPtr->unicode + first, last - first + 1); +} +#endif + +Tcl_Obj * +TclGetRange( + Tcl_Obj *objPtr, /* The Tcl object to find the range of. */ + int first, /* First index of the range. */ + int last) /* Last index of the range. */ +{ + Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */ UniCharString *stringPtr; int length; diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 131053a..2046938 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -48,6 +48,7 @@ #undef Tcl_UniCharCaseMatch #undef Tcl_UniCharLen #undef Tcl_UniCharNcmp +#undef Tcl_GetRange #undef Tcl_DumpActiveMemory #undef Tcl_ValidateAllMemory #undef Tcl_FindHashEntry @@ -98,6 +99,7 @@ static void uniCodePanic(void) { # define Tcl_UniCharNcmp (int(*)(const unsigned short *, const unsigned short *, unsigned long))(void *)uniCodePanic # define Tcl_UniCharNcasecmp (int(*)(const unsigned short *, const unsigned short *, unsigned long))(void *)uniCodePanic # define Tcl_UniCharCaseMatch (int(*)(const unsigned short *, const unsigned short *, int))(void *)uniCodePanic +# define Tcl_GetRange (Tcl_Obj *(*)(Tcl_Obj *, int, int))(void *)uniCodePanic #endif #define TclUtfCharComplete UtfCharComplete @@ -1957,6 +1959,7 @@ const TclStubs tclStubs = { TclNumUtfChars, /* 669 */ TclGetCharLength, /* 670 */ TclUtfAtIndex, /* 671 */ + TclGetRange, /* 672 */ }; /* !END!: Do not edit above this line. */ -- cgit v0.12 From 53f15cc67318ebe942c270c60268b0396688befc Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 24 Mar 2022 13:18:33 +0000 Subject: Fix internal usage of Tcl_GetRange/Tcl_UtfAtIndex --- generic/tclCmdMZ.c | 6 +++--- generic/tclExecute.c | 4 ++-- generic/tclInt.h | 2 ++ generic/tclStringObj.c | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index bd5745b..1beb732 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -395,7 +395,7 @@ Tcl_RegexpObjCmd( newPtr = Tcl_NewListObj(2, objs); } else { if ((i <= info.nsubs) && (info.matches[i].end > 0)) { - newPtr = Tcl_GetRange(objPtr, + newPtr = TclGetRange(objPtr, offset + info.matches[i].start, offset + info.matches[i].end - 1); } else { @@ -2301,7 +2301,7 @@ StringRangeCmd( } if (last >= 0) { - Tcl_SetObjResult(interp, Tcl_GetRange(objv[1], first, last)); + Tcl_SetObjResult(interp, TclGetRange(objv[1], first, last)); } return TCL_OK; } @@ -3790,7 +3790,7 @@ TclNRSwitchObjCmd( if (matchVarObj != NULL) { Tcl_Obj *substringObj; - substringObj = Tcl_GetRange(stringObj, + substringObj = TclGetRange(stringObj, info.matches[j].start, info.matches[j].end-1); /* diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 0456146..965d821 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5373,7 +5373,7 @@ TEBCresume( if (toIdx < 0) { TclNewObj(objResultPtr); } else { - objResultPtr = Tcl_GetRange(OBJ_AT_DEPTH(2), fromIdx, toIdx); + objResultPtr = TclGetRange(OBJ_AT_DEPTH(2), fromIdx, toIdx); } TRACE_APPEND(("\"%.30s\"\n", O2S(objResultPtr))); NEXT_INST_V(1, 3, 1); @@ -5414,7 +5414,7 @@ TEBCresume( if (toIdx < 0) { TclNewObj(objResultPtr); } else { - objResultPtr = Tcl_GetRange(valuePtr, fromIdx, toIdx); + objResultPtr = TclGetRange(valuePtr, fromIdx, toIdx); } } TRACE_APPEND(("%.30s\n", O2S(objResultPtr))); diff --git a/generic/tclInt.h b/generic/tclInt.h index 7486d60..b6cf3b4 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3347,6 +3347,8 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); # define TclNumUtfChars Tcl_NumUtfChars # undef TclGetCharLength # define TclGetCharLength Tcl_GetCharLength +# undef TclUtfAtIndex +# define TclUtfAtIndex Tcl_UtfAtIndex # undef TclGetRange # define TclGetRange Tcl_GetRange #endif diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 332d1d2..6032dbb 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -2449,7 +2449,7 @@ Tcl_AppendFormatToObj( if (precision < 1) { TclNewObj(segment); } else { - segment = Tcl_GetRange(segment, 0, precision - 1); + segment = TclGetRange(segment, 0, precision - 1); } numChars = precision; Tcl_IncrRefCount(segment); -- cgit v0.12 From 4576277c9931b6267da5083fb250652d077bb819 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 24 Mar 2022 13:58:06 +0000 Subject: Add TclGetUniChar() to the compatibility set --- generic/tcl.decls | 3 +++ generic/tclCmdMZ.c | 4 ++-- generic/tclDecls.h | 5 +++++ generic/tclExecute.c | 2 +- generic/tclInt.h | 4 ++++ generic/tclStringObj.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ generic/tclStubInit.c | 7 ++++++ 7 files changed, 80 insertions(+), 3 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index f6a4c05..3b5c8a9 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -2466,6 +2466,9 @@ declare 671 { declare 672 { Tcl_Obj *TclGetRange(Tcl_Obj *objPtr, int first, int last) } +declare 673 { + int TclGetUniChar(Tcl_Obj *objPtr, int index) +} # ----- BASELINE -- FOR -- 8.7.0 ----- # diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 1beb732..29a73cf 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -310,7 +310,7 @@ Tcl_RegexpObjCmd( eflags = 0; } else if (offset > stringLength) { eflags = TCL_REG_NOTBOL; - } else if (Tcl_GetUniChar(objPtr, offset-1) == '\n') { + } else if (TclGetUniChar(objPtr, offset-1) == '\n') { eflags = 0; } else { eflags = TCL_REG_NOTBOL; @@ -1412,7 +1412,7 @@ StringIndexCmd( } if ((index >= 0) && (index < length)) { - int ch = Tcl_GetUniChar(objv[1], index); + int ch = TclGetUniChar(objv[1], index); if (ch == -1) { return TCL_OK; diff --git a/generic/tclDecls.h b/generic/tclDecls.h index fdf8673..ac90006 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1971,6 +1971,8 @@ EXTERN int TclGetCharLength(Tcl_Obj *objPtr); EXTERN const char * TclUtfAtIndex(const char *src, int index); /* 672 */ EXTERN Tcl_Obj * TclGetRange(Tcl_Obj *objPtr, int first, int last); +/* 673 */ +EXTERN int TclGetUniChar(Tcl_Obj *objPtr, int index); typedef struct { const struct TclPlatStubs *tclPlatStubs; @@ -2679,6 +2681,7 @@ typedef struct TclStubs { int (*tclGetCharLength) (Tcl_Obj *objPtr); /* 670 */ const char * (*tclUtfAtIndex) (const char *src, int index); /* 671 */ Tcl_Obj * (*tclGetRange) (Tcl_Obj *objPtr, int first, int last); /* 672 */ + int (*tclGetUniChar) (Tcl_Obj *objPtr, int index); /* 673 */ } TclStubs; extern const TclStubs *tclStubsPtr; @@ -4048,6 +4051,8 @@ extern const TclStubs *tclStubsPtr; (tclStubsPtr->tclUtfAtIndex) /* 671 */ #define TclGetRange \ (tclStubsPtr->tclGetRange) /* 672 */ +#define TclGetUniChar \ + (tclStubsPtr->tclGetUniChar) /* 673 */ #endif /* defined(USE_TCL_STUBS) */ diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 965d821..1f72d63 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5329,7 +5329,7 @@ TEBCresume( valuePtr->bytes+index, 1); } else { char buf[4] = ""; - int ch = Tcl_GetUniChar(valuePtr, index); + int ch = TclGetUniChar(valuePtr, index); /* * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1) diff --git a/generic/tclInt.h b/generic/tclInt.h index b6cf3b4..cc13c61 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3335,6 +3335,8 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); # define Tcl_UtfAtIndex TclUtfAtIndex # undef Tcl_GetRange # define Tcl_GetRange TclGetRange +# undef Tcl_GetUniChar +# define Tcl_GetUniChar TclGetUniChar #else # define tclUniCharStringType tclStringType # define TclGetUnicodeFromObj_ Tcl_GetUnicodeFromObj @@ -3351,6 +3353,8 @@ MODULE_SCOPE int TclZipfs_Init(Tcl_Interp *interp); # define TclUtfAtIndex Tcl_UtfAtIndex # undef TclGetRange # define TclGetRange Tcl_GetRange +# undef TclGetUniChar +# define TclGetUniChar Tcl_GetUniChar #endif diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 6032dbb..e8777cd 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -747,12 +747,70 @@ TclCheckEmptyString( *---------------------------------------------------------------------- */ +#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +#undef Tcl_GetUniChar int Tcl_GetUniChar( Tcl_Obj *objPtr, /* The object to get the Unicode charater * from. */ int index) /* Get the index'th Unicode character. */ { + String *stringPtr; + int ch, length; + + if (index < 0) { + return -1; + } + + /* + * Optimize the case where we're really dealing with a bytearray object + * we don't need to convert to a string to perform the indexing operation. + */ + + if (TclIsPureByteArray(objPtr)) { + unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length); + if (index >= length) { + return -1; + } + + return (int) bytes[index]; + } + + /* + * OK, need to work with the object as a string. + */ + + SetUTF16StringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + + if (index >= stringPtr->numChars) { + return -1; + } + ch = stringPtr->unicode[index]; + /* See: bug [11ae2be95dac9417] */ + if ((ch & 0xF800) == 0xD800) { + if (ch & 0x400) { + if ((index > 0) + && ((stringPtr->unicode[index-1] & 0xFC00) == 0xD800)) { + ch = -1; /* low surrogate preceded by high surrogate */ + } + } else if ((++index < stringPtr->numChars) + && ((stringPtr->unicode[index] & 0xFC00) == 0xDC00)) { + /* high surrogate followed by low surrogate */ + ch = (((ch & 0x3FF) << 10) | + (stringPtr->unicode[index] & 0x3FF)) + 0x10000; + } + } + return ch; +} +#endif + +int +TclGetUniChar( + Tcl_Obj *objPtr, /* The object to get the Unicode charater + * from. */ + int index) /* Get the index'th Unicode character. */ +{ UniCharString *stringPtr; int ch, length; diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index 2046938..fc78f74 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -49,6 +49,7 @@ #undef Tcl_UniCharLen #undef Tcl_UniCharNcmp #undef Tcl_GetRange +#undef Tcl_GetUniChar #undef Tcl_DumpActiveMemory #undef Tcl_ValidateAllMemory #undef Tcl_FindHashEntry @@ -79,6 +80,10 @@ #undef TclWinConvertError #undef Tcl_GetCharLength #undef Tcl_UtfAtIndex +#undef TclNumUtfChars +#undef TclGetCharLength +#undef TclUtfAtIndex +#undef TclGetUniChar #if defined(_WIN32) || defined(__CYGWIN__) #define TclWinConvertWSAError (void (*)(DWORD))(void *)Tcl_WinConvertError #define TclWinConvertError (void (*)(DWORD))(void *)Tcl_WinConvertError @@ -100,6 +105,7 @@ static void uniCodePanic(void) { # define Tcl_UniCharNcasecmp (int(*)(const unsigned short *, const unsigned short *, unsigned long))(void *)uniCodePanic # define Tcl_UniCharCaseMatch (int(*)(const unsigned short *, const unsigned short *, int))(void *)uniCodePanic # define Tcl_GetRange (Tcl_Obj *(*)(Tcl_Obj *, int, int))(void *)uniCodePanic +# define Tcl_GetUniChar (int(*)(Tcl_Obj *, int))(void *)uniCodePanic #endif #define TclUtfCharComplete UtfCharComplete @@ -1960,6 +1966,7 @@ const TclStubs tclStubs = { TclGetCharLength, /* 670 */ TclUtfAtIndex, /* 671 */ TclGetRange, /* 672 */ + TclGetUniChar, /* 673 */ }; /* !END!: Do not edit above this line. */ -- cgit v0.12 From 5438d045182b2bc669f663a22728d7306c459112 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 24 Mar 2022 23:29:50 +0000 Subject: bugfix: Handle NULL characters in Tcl_GetCharLength() --- generic/tclStringObj.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index e8777cd..988c6e5 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -681,7 +681,8 @@ Tcl_GetCharLength( (void) Tcl_GetByteArrayFromObj(objPtr, &numChars); } else { - numChars = Tcl_NumUtfChars(Tcl_GetString(objPtr), -1); + Tcl_GetString(objPtr); + numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); } return numChars; } -- cgit v0.12 From 29b5c15de58bfb8f522ebfcde30d4ded5712d4a2 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 25 Mar 2022 12:14:41 +0000 Subject: Unbreak build with-DTCL_UTF_MAX=3 --- generic/tclStubInit.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/generic/tclStubInit.c b/generic/tclStubInit.c index fc78f74..7d04481 100644 --- a/generic/tclStubInit.c +++ b/generic/tclStubInit.c @@ -80,10 +80,7 @@ #undef TclWinConvertError #undef Tcl_GetCharLength #undef Tcl_UtfAtIndex -#undef TclNumUtfChars -#undef TclGetCharLength -#undef TclUtfAtIndex -#undef TclGetUniChar + #if defined(_WIN32) || defined(__CYGWIN__) #define TclWinConvertWSAError (void (*)(DWORD))(void *)Tcl_WinConvertError #define TclWinConvertError (void (*)(DWORD))(void *)Tcl_WinConvertError -- cgit v0.12 From d4140ec31b18515b482006a2d9cac7e2d281f8db Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 26 Mar 2022 23:02:29 +0000 Subject: Don't run obj-1.1 test with -DTCL_NO_DEPRECATED --- tests/obj.test | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/obj.test b/tests/obj.test index 4fa8d3a..7563422 100644 --- a/tests/obj.test +++ b/tests/obj.test @@ -19,11 +19,13 @@ if {"::tcltest" ni [namespace children]} { ::tcltest::loadTestedCommands catch [list package require -exact tcl::test [info patchlevel]] +package require tcltests + testConstraint testobj [llength [info commands testobj]] testConstraint longIs32bit [expr {$tcl_platform(wordSize) == 4}] testConstraint wideIs64bit [expr {wide(0x8000000000000000) < 0}] -test obj-1.1 {Tcl_AppendAllObjTypes, and InitTypeTable, Tcl_RegisterObjType} testobj { +test obj-1.1 {Tcl_AppendAllObjTypes, and InitTypeTable, Tcl_RegisterObjType} {testobj deprecated} { set r 1 foreach {t} { bytearray -- cgit v0.12 From ec52ceb63b1563cfdeb057731661d0ae92332765 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 29 Mar 2022 21:14:54 +0000 Subject: Undo deprecation for Tcl_AppendUnicodeToObj (as described in the TIP) --- generic/tcl.decls | 2 +- generic/tclDecls.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/generic/tcl.decls b/generic/tcl.decls index 3b5c8a9..f0718c1 100644 --- a/generic/tcl.decls +++ b/generic/tcl.decls @@ -1356,7 +1356,7 @@ declare 382 {deprecated {No longer in use, changed to macro}} { declare 383 { Tcl_Obj *Tcl_GetRange(Tcl_Obj *objPtr, int first, int last) } -declare 384 {deprecated {Use Tcl_AppendStringsToObj}} { +declare 384 { void Tcl_AppendUnicodeToObj(Tcl_Obj *objPtr, const unsigned short *unicode, int length) } diff --git a/generic/tclDecls.h b/generic/tclDecls.h index ac90006..020c4bb 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -1157,8 +1157,7 @@ unsigned short * Tcl_GetUnicode(Tcl_Obj *objPtr); /* 383 */ EXTERN Tcl_Obj * Tcl_GetRange(Tcl_Obj *objPtr, int first, int last); /* 384 */ -TCL_DEPRECATED("Use Tcl_AppendStringsToObj") -void Tcl_AppendUnicodeToObj(Tcl_Obj *objPtr, +EXTERN void Tcl_AppendUnicodeToObj(Tcl_Obj *objPtr, const unsigned short *unicode, int length); /* 385 */ EXTERN int Tcl_RegExpMatchObj(Tcl_Interp *interp, @@ -2392,7 +2391,7 @@ typedef struct TclStubs { int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ TCL_DEPRECATED_API("No longer in use, changed to macro") unsigned short * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */ Tcl_Obj * (*tcl_GetRange) (Tcl_Obj *objPtr, int first, int last); /* 383 */ - TCL_DEPRECATED_API("Use Tcl_AppendStringsToObj") void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const unsigned short *unicode, int length); /* 384 */ + void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const unsigned short *unicode, int length); /* 384 */ int (*tcl_RegExpMatchObj) (Tcl_Interp *interp, Tcl_Obj *textObj, Tcl_Obj *patternObj); /* 385 */ void (*tcl_SetNotifier) (const Tcl_NotifierProcs *notifierProcPtr); /* 386 */ Tcl_Mutex * (*tcl_GetAllocMutex) (void); /* 387 */ -- cgit v0.12 From 572557152a0233310ad8df9c1f877cecd65e3406 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 30 Mar 2022 11:56:16 +0000 Subject: Change UTF-16 version of Tcl_GetRange() not to use the tclStringObj type any more --- generic/tclStringObj.c | 55 +++++++++++++++++++------------------------------- generic/tclTest.c | 3 ++- 2 files changed, 23 insertions(+), 35 deletions(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 988c6e5..63b833f 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -180,12 +180,12 @@ DupUTF16StringInternalRep( Tcl_Obj *copyPtr) /* Object with internal rep to set. Must not * currently have an internal rep.*/ { - String *srcStringPtr = ((String *) (srcPtr)->internalRep.twoPtrValue.ptr1); + String *srcStringPtr = GET_STRING(srcPtr); size_t size = offsetof(String, unicode) + (((srcStringPtr->allocated) + 1U) * sizeof(unsigned short)); String *copyStringPtr = (String *)ckalloc(size); memcpy(copyStringPtr, srcStringPtr, size); - copyPtr->internalRep.twoPtrValue.ptr1 = copyStringPtr; + SET_STRING(copyPtr, copyStringPtr); copyPtr->typePtr = &tclStringType; } @@ -223,7 +223,7 @@ SetUTF16StringFromAny( stringPtr->allocated = size; stringPtr->maxChars = size; stringPtr->hasUnicode = 1; - objPtr->internalRep.twoPtrValue.ptr1 = stringPtr; + SET_STRING(objPtr, stringPtr); objPtr->typePtr = &tclStringType; } return TCL_OK; @@ -234,7 +234,7 @@ UpdateStringOfUTF16String( Tcl_Obj *objPtr) /* Object with string rep to update. */ { Tcl_DString ds; - String *stringPtr = ((String *) (objPtr)->internalRep.twoPtrValue.ptr1); + String *stringPtr = GET_STRING(objPtr); Tcl_DStringInit(&ds); const char *string = Tcl_Char16ToUtfDString(stringPtr->unicode, stringPtr->numChars, &ds); @@ -560,18 +560,19 @@ Tcl_NewUnicodeObj( Tcl_Obj *objPtr; TclNewObj(objPtr); - TclFreeInternalRep(objPtr); + TclInvalidateStringRep(objPtr); - String *stringPtr = (String *)ckalloc((offsetof(String, unicode) + 2U) + numChars * sizeof(unsigned short)); - memcpy(stringPtr->unicode, unicode, numChars); - stringPtr->unicode[numChars] = 0; + String *stringPtr = (String *)ckalloc((offsetof(String, unicode) + + sizeof(unsigned short)) + numChars * sizeof(unsigned short)); + memcpy(stringPtr->unicode, unicode, numChars); + stringPtr->unicode[numChars] = 0; - stringPtr->numChars = numChars; - stringPtr->allocated = numChars; - stringPtr->maxChars = numChars; - stringPtr->hasUnicode = 1; - objPtr->internalRep.twoPtrValue.ptr1 = stringPtr; - objPtr->typePtr = &tclStringType; + stringPtr->numChars = numChars; + stringPtr->allocated = numChars; + stringPtr->maxChars = numChars; + stringPtr->hasUnicode = 1; + SET_STRING(objPtr, stringPtr); + objPtr->typePtr = &tclStringType; return objPtr; } @@ -1019,7 +1020,6 @@ Tcl_GetRange( int last) /* Last index of the range. */ { Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */ - String *stringPtr; int length; if (first < 0) { @@ -1044,31 +1044,18 @@ Tcl_GetRange( return Tcl_NewByteArrayObj(bytes + first, last - first + 1); } - /* - * OK, need to work with the object as a utf16 string. - */ - - SetUTF16StringFromAny(NULL, objPtr); - stringPtr = GET_STRING(objPtr); + int numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); - if (last < 0 || last >= stringPtr->numChars) { - last = stringPtr->numChars - 1; + if (last >= numChars) { + last = numChars - 1; } if (last < first) { TclNewObj(newObjPtr); return newObjPtr; } - /* See: bug [11ae2be95dac9417] */ - if ((first > 0) && ((stringPtr->unicode[first] & 0xFC00) == 0xDC00) - && ((stringPtr->unicode[first-1] & 0xFC00) == 0xD800)) { - ++first; - } - if ((last + 1 < stringPtr->numChars) - && ((stringPtr->unicode[last+1] & 0xFC00) == 0xDC00) - && ((stringPtr->unicode[last] & 0xFC00) == 0xD800)) { - ++last; - } - return Tcl_NewUnicodeObj(stringPtr->unicode + first, last - first + 1); + const char *begin = Tcl_UtfAtIndex(objPtr->bytes, first); + const char *end = Tcl_UtfAtIndex(objPtr->bytes, last + 1); + return Tcl_NewStringObj(begin, end - begin); } #endif diff --git a/generic/tclTest.c b/generic/tclTest.c index 30681d9..cbd1f70 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -16,6 +16,7 @@ */ #undef STATIC_BUILD +#undef BUILD_tcl #ifndef USE_TCL_STUBS # define USE_TCL_STUBS #endif @@ -6971,7 +6972,7 @@ TestUtfNextCmd( int objc, Tcl_Obj *const objv[]) { - size_t numBytes; + int numBytes; char *bytes; const char *result, *first; char buffer[32]; -- cgit v0.12 From 720797e76cbd290e097b9094da83c44cadfac5bf Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 15 May 2022 20:19:30 +0000 Subject: Change (unsupported) "utfmax" option in rules.vc to it's reverse "utf16". No effect in Tcl 8.6. Not recommended (but working!) in Tcl 8.7 and 9.0 --- .travis.yml | 18 ------------------ win/makefile.vc | 4 ++-- win/rules.vc | 15 +++++++-------- 3 files changed, 9 insertions(+), 28 deletions(-) diff --git a/.travis.yml b/.travis.yml index eb1ec0c..061fe2d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -197,15 +197,6 @@ jobs: script: - cmd.exe //C vcvarsall.bat x64 '&&' nmake '-f' makefile.vc all tcltest - cmd.exe //C vcvarsall.bat x64 '&&' nmake '-f' makefile.vc test - - name: "Windows/MSVC/Shared: UTF_MAX=4" - os: windows - compiler: cl - env: *vcenv - before_install: *vcpreinst - install: [] - script: - - cmd.exe //C vcvarsall.bat x64 '&&' nmake 'OPTS=utfmax' '-f' makefile.vc all tcltest - - cmd.exe //C vcvarsall.bat x64 '&&' nmake 'OPTS=utfmax' '-f' makefile.vc test - name: "Windows/MSVC/Static" os: windows compiler: cl @@ -252,15 +243,6 @@ jobs: script: - cmd.exe //C vcvarsall.bat x86 '&&' nmake '-f' makefile.vc all tcltest - cmd.exe //C vcvarsall.bat x86 '&&' nmake '-f' makefile.vc test - - name: "Windows/MSVC-x86/Shared: UTF_MAX=4" - os: windows - compiler: cl - env: *vcenv - before_install: *vcpreinst - install: [] - script: - - cmd.exe //C vcvarsall.bat x86 '&&' nmake 'OPTS=utfmax' '-f' makefile.vc all tcltest - - cmd.exe //C vcvarsall.bat x86 '&&' nmake 'OPTS=utfmax' '-f' makefile.vc test - name: "Windows/MSVC-x86/Static" os: windows compiler: cl diff --git a/win/makefile.vc b/win/makefile.vc index 395a9ca..011546a 100644 --- a/win/makefile.vc +++ b/win/makefile.vc @@ -52,7 +52,7 @@ # turn on the 64-bit compiler, if your SDK has it. # # Basic macros and options usable on the commandline (see rules.vc for more info): -# OPTS=msvcrt,nothreads,pdbs,profile,static,staticpkg,symbols,thrdalloc,time64bit,unchecked,utfmax,none +# OPTS=msvcrt,nothreads,pdbs,profile,static,staticpkg,symbols,thrdalloc,time64bit,unchecked,utf16,none # Sets special options for the core. The default is for none. # Any combination of the above may be used (comma separated). # 'none' will over-ride everything to nothing. @@ -77,7 +77,7 @@ # unchecked = Allows a symbols build to not use the debug # enabled runtime (msvcrt.dll not msvcrtd.dll # or libcmt.lib not libcmtd.lib). -# utfmax = Forces a build using UTF-32 representation internally. +# utf16 = Forces a build using UTF-16 representation internally. # # STATS=compdbg,memdbg,none # Sets optional memory and bytecode compiler debugging code added diff --git a/win/rules.vc b/win/rules.vc index a571899..3107756 100644 --- a/win/rules.vc +++ b/win/rules.vc @@ -816,8 +816,7 @@ DOTSEPARATED=$(DOTSEPARATED:b=.) # configuration (ignored for Tcl itself) # _USE_64BIT_TIME_T - forces a build using 64-bit time_t for 32-bit build # (CRT library should support this, not needed for Tcl 9.x) -# TCL_UTF_MAX=4 - forces a build allowing 4-byte UTF-8 sequences internally. -# (Not needed for Tcl 9.x) +# TCL_UTF_MAX=3 - forces a build using UTF-16 internally (not recommended). # Further, LINKERFLAGS are modified based on above. # Default values for all the above @@ -884,9 +883,9 @@ USE_THREAD_ALLOC= 0 _USE_64BIT_TIME_T = 1 !endif -!if [nmakehlp -f $(OPTS) "utfmax"] -!message *** Force allowing 4-byte UTF-8 sequences internally -TCL_UTF_MAX = 4 +!if [nmakehlp -f $(OPTS) "utf16"] +!message *** Force UTF-16 internally +TCL_UTF_MAX = 3 !endif !endif @@ -1423,13 +1422,13 @@ OPTDEFINES = $(OPTDEFINES) /DNO_STRTOI64=1 !if "$(_USE_64BIT_TIME_T)" == "1" OPTDEFINES = $(OPTDEFINES) /D_USE_64BIT_TIME_T=1 !endif -!if "$(TCL_UTF_MAX)" == "4" -OPTDEFINES = $(OPTDEFINES) /DTCL_UTF_MAX=4 -!endif # _ATL_XP_TARGETING - Newer SDK's need this to build for XP COMPILERFLAGS = /D_ATL_XP_TARGETING !endif +!if "$(TCL_UTF_MAX)" == "3" +OPTDEFINES = $(OPTDEFINES) /DTCL_UTF_MAX=3 +!endif # Like the TEA system only set this non empty for non-Tk extensions # Note: some extensions use PACKAGE_NAME and others use PACKAGE_TCLNAME -- cgit v0.12