From fbf2a3255bd9e59c1a84415305da87b202330606 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 2 Apr 2020 20:18:52 +0000 Subject: New utility routine TclUtfToUCS4() to contain some complexity. Two callers adapted. --- generic/tclCmdMZ.c | 34 +++++++-------------------------- generic/tclInt.h | 1 + generic/tclUtf.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 29 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d344678..23370a8 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1081,23 +1081,10 @@ Tcl_SplitObjCmd( Tcl_InitHashTable(&charReuseTable, TCL_ONE_WORD_KEYS); for ( ; stringPtr < end; stringPtr += len) { - int fullchar; - len = TclUtfToUniChar(stringPtr, &ch); - fullchar = ch; - -#if TCL_UTF_MAX == 4 - if ((ch >= 0xD800) && (len < 3)) { - len += TclUtfToUniChar(stringPtr + len, &ch); - fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000; - } -#endif + int ucs4; - /* - * Assume Tcl_UniChar is an integral type... - */ - - hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(fullchar), - &isNew); + len = TclUtfToUCS4(stringPtr, &ucs4); + hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(ucs4), &isNew); if (isNew) { TclNewStringObj(objPtr, stringPtr, len); @@ -1466,7 +1453,6 @@ StringIsCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { const char *string1, *end, *stop; - Tcl_UniChar ch = 0; int (*chcomp)(int) = NULL; /* The UniChar comparison function. */ int i, failat = 0, result = 1, strict = 0, index, length1, length2; Tcl_Obj *objPtr, *failVarObj = NULL; @@ -1797,16 +1783,10 @@ StringIsCmd( } end = string1 + length1; for (; string1 < end; string1 += length2, failat++) { - int fullchar; - length2 = TclUtfToUniChar(string1, &ch); - fullchar = ch; -#if TCL_UTF_MAX == 4 - if ((ch >= 0xD800) && (length2 < 3)) { - length2 += TclUtfToUniChar(string1 + length2, &ch); - fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000; - } -#endif - if (!chcomp(fullchar)) { + int ucs4; + + length2 = TclUtfToUCS4(string1, &ucs4); + if (!chcomp(ucs4)) { result = 0; break; } diff --git a/generic/tclInt.h b/generic/tclInt.h index c30a257..74b2cc9 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3186,6 +3186,7 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes, MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); +MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); MODULE_SCOPE int TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr, diff --git a/generic/tclUtf.c b/generic/tclUtf.c index c58f5a9..0db06bd 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -278,8 +278,8 @@ Tcl_UniCharToUtfDString( * If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done: * For any UTF-8 string containing a character outside of the BMP, the * first call to this function will fill *chPtr with the high surrogate - * and generate a return value of 0. Calling Tcl_UtfToUniChar again - * will produce the low surrogate and a return value of 4. Because *chPtr + * and generate a return value of 1. Calling Tcl_UtfToUniChar again + * will produce the low surrogate and a return value of 3. Because *chPtr * is used to remember whether the high surrogate is already produced, it * is recommended to initialize the variable it points to as 0 before * the first call to Tcl_UtfToUniChar is done. @@ -2156,6 +2156,57 @@ TclUniCharMatch( } /* + *--------------------------------------------------------------------------- + * + * TclUtfToUCS4 -- + * + * Extract the 4-byte codepoint from the leading bytes of the + * Modified UTF-8 string "src". This is a utility routine to + * contain the surrogate gymnastics in one place. + * + * The caller must ensure that the source buffer is long enough that this + * routine does not run off the end and dereference non-existent memory + * looking for trail bytes. If the source buffer is known to be '\0' + * terminated, this cannot happen. Otherwise, the caller should call + * Tcl_UtfCharComplete() before calling this routine to ensure that + * enough bytes remain in the string. + * + * Results: + * *usc4Ptr is filled with the UCS4 code point, and the return value is + * the number of bytes from the UTF-8 string that were consumed. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +TclUtfToUCS4( + const char *src, /* The UTF-8 string. */ + int *ucs4Ptr) /* Filled with the UCS4 codepoint represented + * by the UTF-8 string. */ +{ + int len, fullchar; + Tcl_UniChar ch = 0; + + len = TclUtfToUniChar(src, &ch); + fullchar = ch; + +#if TCL_UTF_MAX == 4 + /* 4-byte UTF-8 is supported; decode surrogates */ + + if ((ch >= 0xD800) && len < 3) + len += Tcl_UtfToUniChar(src + len, &ch); + fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000; + } +#endif + + *ucs4Ptr = fullchar; + return len; +} + +/* * Local Variables: * mode: c * c-basic-offset: 4 -- cgit v0.12 From d429ddf40d173212a0fadb4794b91096c58201a1 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 2 Apr 2020 20:36:04 +0000 Subject: More callers. --- generic/tclUtf.c | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 0db06bd..8f02790 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -578,19 +578,11 @@ Tcl_UtfFindFirst( const char *src, /* The UTF-8 string to be searched. */ int ch) /* The Unicode character to search for. */ { - int len, fullchar; - Tcl_UniChar find = 0; - +fprintf(stdout, "COVER\n"); fflush(stdout); while (1) { - len = TclUtfToUniChar(src, &find); - fullchar = find; -#if TCL_UTF_MAX <= 4 - if ((fullchar != ch) && (find >= 0xD800) && (len < 3)) { - len += TclUtfToUniChar(src + len, &find); - fullchar = (((fullchar & 0x3FF) << 10) | (find & 0x3FF)) + 0x10000; - } -#endif - if (fullchar == ch) { + int ucs4, len = TclUtfToUCS4(src, &ucs4); + + if (ucs4 == ch) { return src; } if (*src == '\0') { @@ -624,21 +616,12 @@ Tcl_UtfFindLast( const char *src, /* The UTF-8 string to be searched. */ int ch) /* The Unicode character to search for. */ { - int len, fullchar; - Tcl_UniChar find = 0; - const char *last; + const char *last = NULL; - last = NULL; while (1) { - len = TclUtfToUniChar(src, &find); - fullchar = find; -#if TCL_UTF_MAX <= 4 - if ((fullchar != ch) && (find >= 0xD800) && (len < 3)) { - len += TclUtfToUniChar(src + len, &find); - fullchar = (((fullchar & 0x3FF) << 10) | (find & 0x3FF)) + 0x10000; - } -#endif - if (fullchar == ch) { + int ucs4, len = TclUtfToUCS4(src, &ucs4); + + if (ucs4 == ch) { last = src; } if (*src == '\0') { -- cgit v0.12 From c583e934033ce158b9b2b03320ecd598e908efcc Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 2 Apr 2020 20:50:32 +0000 Subject: Adapt another caller. This one had a bug when (TCL_UTF_MAX == 4) because it was never adapted when TclUtfToUniChar change from returning 0 to returning 1 for a high surrogate. --- generic/tclDisassemble.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/generic/tclDisassemble.c b/generic/tclDisassemble.c index b60edca..94679fe 100644 --- a/generic/tclDisassemble.c +++ b/generic/tclDisassemble.c @@ -855,7 +855,6 @@ PrintSourceToObj( { register const char *p; register int i = 0, len; - Tcl_UniChar ch = 0; if (stringPtr == NULL) { Tcl_AppendToObj(appendObj, "\"\"", -1); @@ -865,9 +864,10 @@ PrintSourceToObj( Tcl_AppendToObj(appendObj, "\"", -1); p = stringPtr; for (; (*p != '\0') && (i < maxChars); p+=len) { + int ucs4; - len = TclUtfToUniChar(p, &ch); - switch (ch) { + len = TclUtfToUCS4(p, &ucs4); + switch (ucs4) { case '"': Tcl_AppendToObj(appendObj, "\\\"", -1); i += 2; @@ -893,28 +893,14 @@ PrintSourceToObj( i += 2; continue; default: -#if TCL_UTF_MAX > 4 - if (ch > 0xFFFF) { - Tcl_AppendPrintfToObj(appendObj, "\\U%08x", ch); + if (ucs4 > 0xFFFF) { + Tcl_AppendPrintfToObj(appendObj, "\\U%08x", ucs4); i += 10; - } else -#elif TCL_UTF_MAX > 3 - /* If len == 0, this means we have a char > 0xFFFF, resulting in - * TclUtfToUniChar producing a surrogate pair. We want to output - * this pair as a single Unicode character. - */ - if (len == 0) { - int upper = ((ch & 0x3FF) + 1) << 10; - len = TclUtfToUniChar(p, &ch); - Tcl_AppendPrintfToObj(appendObj, "\\U%08x", upper + (ch & 0x3FF)); - i += 10; - } else -#endif - if (ch < 0x20 || ch >= 0x7F) { - Tcl_AppendPrintfToObj(appendObj, "\\u%04x", ch); + } else if (ucs4 < 0x20 || ucs4 >= 0x7F) { + Tcl_AppendPrintfToObj(appendObj, "\\u%04x", ucs4); i += 6; } else { - Tcl_AppendPrintfToObj(appendObj, "%c", ch); + Tcl_AppendPrintfToObj(appendObj, "%c", ucs4); i++; } continue; -- cgit v0.12 From 2f92315df0eafedcd9c06b98df46e3ce71701cb2 Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 2 Apr 2020 21:29:26 +0000 Subject: One more caller conversion that simplifies. There may be other callers of TclUtfToUniChar that would be made more correct or consistent with a similar conversion. --- generic/tclScan.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/generic/tclScan.c b/generic/tclScan.c index 4b9298d..c599797 100644 --- a/generic/tclScan.c +++ b/generic/tclScan.c @@ -881,14 +881,7 @@ Tcl_ScanObjCmd( * Scan a single Unicode character. */ - offset = TclUtfToUniChar(string, &sch); - i = (int)sch; -#if TCL_UTF_MAX == 4 - if ((sch >= 0xD800) && (offset < 3)) { - offset += TclUtfToUniChar(string+offset, &sch); - i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF); - } -#endif + offset = TclUtfToUCS4(string, &i); string += offset; if (!(flags & SCAN_SUPPRESS)) { objPtr = Tcl_NewIntObj(i); -- cgit v0.12 From 6d656d18330f1b128b4587401e2bd4be7afbb0df Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 2 Apr 2020 22:05:52 +0000 Subject: typo --- generic/tclUtf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 8f02790..c5dc30a 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2179,7 +2179,7 @@ TclUtfToUCS4( #if TCL_UTF_MAX == 4 /* 4-byte UTF-8 is supported; decode surrogates */ - if ((ch >= 0xD800) && len < 3) + if ((ch >= 0xD800) && len < 3) { len += Tcl_UtfToUniChar(src + len, &ch); fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000; } -- cgit v0.12 From 381c52e801647c8e1af5eb08a6f298a29cdbf72a Mon Sep 17 00:00:00 2001 From: dgp Date: Thu, 2 Apr 2020 22:18:06 +0000 Subject: Remove stray debug --- generic/tclUtf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index c5dc30a..6782df9 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -578,7 +578,6 @@ Tcl_UtfFindFirst( const char *src, /* The UTF-8 string to be searched. */ int ch) /* The Unicode character to search for. */ { -fprintf(stdout, "COVER\n"); fflush(stdout); while (1) { int ucs4, len = TclUtfToUCS4(src, &ucs4); -- cgit v0.12