diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-09-12 10:11:55 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-09-12 10:11:55 (GMT) |
commit | 3b3d524d3ab6cd4d2bb66f30dae043c5c622cf23 (patch) | |
tree | a7d3be3be5996d60c7e526bd007ff12be6c8e575 /generic/tclUtf.c | |
parent | f03ee8f9815a46ca1520e2b40825a81218c1b4d1 (diff) | |
parent | b9bb8fd5b282207069eb191367f3a67d1daf50d5 (diff) | |
download | tcl-3b3d524d3ab6cd4d2bb66f30dae043c5c622cf23.zip tcl-3b3d524d3ab6cd4d2bb66f30dae043c5c622cf23.tar.gz tcl-3b3d524d3ab6cd4d2bb66f30dae043c5c622cf23.tar.bz2 |
Merge 8.7
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 182 |
1 files changed, 11 insertions, 171 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 68112c5..ba5948f 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -215,9 +215,7 @@ Tcl_UniCharToUtf( * character (at most 4 bytes). */ { -#if TCL_UTF_MAX > 3 int flags = ch; -#endif if (ch >= TCL_COMBINE) { ch &= (TCL_COMBINE - 1); @@ -234,9 +232,7 @@ Tcl_UniCharToUtf( } if (ch <= 0xFFFF) { if ( -#if TCL_UTF_MAX > 3 (flags & TCL_COMBINE) && -#endif ((ch & 0xF800) == 0xD800)) { if (ch & 0x0400) { /* Low surrogate */ @@ -349,15 +345,6 @@ Tcl_UniCharToUtfDString( p = string; wEnd = uniStr + uniLength; -#if TCL_UTF_MAX < 4 - /* Initialize the buffer so that some random data doesn't trick - * Tcl_UniCharToUtf() into thinking it should combine surrogate pairs. - * Once TCL_UTF_MAX == 3 is removed and Tcl_UniCharToUtf restored to its - * prior non-stateful nature, this call to memset can also be removed. - */ - memset(p, 0xff, Tcl_DStringLength(dsPtr) - oldLength); -#endif - for (w = uniStr; w < wEnd; ) { p += Tcl_UniCharToUtf(*w, p); w++; @@ -402,15 +389,6 @@ Tcl_Char16ToUtfDString( p = string; wEnd = uniStr + uniLength; -#if TCL_UTF_MAX < 4 - /* Initialize the buffer so that some random data doesn't trick - * Tcl_UniCharToUtf() into thinking it should combine surrogate pairs. - * Because TCL_COMBINE is used here, memset() is required even when - * TCL_UTF_MAX == 4. - */ - memset(p, 0xff, Tcl_DStringLength(dsPtr) - oldLength); -#endif - for (w = uniStr; w < wEnd; ) { if (!len && ((*w & 0xFC00) != 0xDC00)) { /* Special case for handling high surrogates. */ @@ -447,15 +425,6 @@ Tcl_Char16ToUtfDString( * Tcl_UtfCharComplete() before calling this routine to ensure that * enough bytes remain in the string. * - * If TCL_UTF_MAX < 4, special handling of Surrogate pairs is done: - * For any UTF-8 string containing a character outside of the BMP, the - * first call to this function will fill *chPtr with the high surrogate - * and generate a return value of 1. Calling Tcl_UtfToUniChar again - * will produce the low surrogate and a return value of 3. Because *chPtr - * is used to remember whether the high surrogate is already produced, it - * is recommended to initialize the variable it points to as 0 before - * the first call to Tcl_UtfToUniChar is done. - * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the * number of bytes from the UTF-8 string that were consumed. @@ -715,11 +684,11 @@ Tcl_UtfToUniCharDString( endPtr = src + length; optPtr = endPtr - 4; while (p <= optPtr) { - p += TclUtfToUCS4(p, &ch); + p += Tcl_UtfToUniChar(p, &ch); *w++ = ch; } while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { - p += TclUtfToUCS4(p, &ch); + p += Tcl_UtfToUniChar(p, &ch); *w++ = ch; } while (p < endPtr) { @@ -965,7 +934,7 @@ Tcl_UtfFindFirst( int ch) /* The Unicode character to search for. */ { while (1) { - int find, len = TclUtfToUCS4(src, &find); + int find, len = Tcl_UtfToUniChar(src, &find); if (find == ch) { return src; @@ -1004,7 +973,7 @@ Tcl_UtfFindLast( const char *last = NULL; while (1) { - int find, len = TclUtfToUCS4(src, &find); + int find, len = Tcl_UtfToUniChar(src, &find); if (find == ch) { last = src; @@ -1227,13 +1196,7 @@ Tcl_UniCharAtIndex( i = TclUtfToUniChar(src, &ch); src += i; } -#if TCL_UTF_MAX < 4 - if ((ch >= 0xD800) && (i < 3)) { - /* Index points at character following high Surrogate */ - return -1; - } -#endif - TclUtfToUCS4(src, &i); + Tcl_UtfToUniChar(src, &i); return i; } @@ -1243,9 +1206,7 @@ Tcl_UniCharAtIndex( * Tcl_UtfAtIndex -- * * Returns a pointer to the specified character (not byte) position in - * the UTF-8 string. If TCL_UTF_MAX < 4, characters > U+FFFF count as - * 2 positions, but then the pointer should never be placed between - * the two positions. + * the UTF-8 string. * * Results: * As above. @@ -1377,7 +1338,7 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { - len = TclUtfToUCS4(src, &ch); + len = Tcl_UtfToUniChar(src, &ch); upChar = Tcl_UniCharToUpper(ch); /* @@ -1430,7 +1391,7 @@ Tcl_UtfToLower( src = dst = str; while (*src) { - len = TclUtfToUCS4(src, &ch); + len = Tcl_UtfToUniChar(src, &ch); lowChar = Tcl_UniCharToLower(ch); /* @@ -1486,7 +1447,7 @@ Tcl_UtfToTitle( src = dst = str; if (*src) { - len = TclUtfToUCS4(src, &ch); + len = Tcl_UtfToUniChar(src, &ch); titleChar = Tcl_UniCharToTitle(ch); if (len < TclUtfCount(titleChar)) { @@ -1498,7 +1459,7 @@ Tcl_UtfToTitle( src += len; } while (*src) { - len = TclUtfToUCS4(src, &ch); + len = Tcl_UtfToUniChar(src, &ch); lowChar = ch; /* Special exception for Georgian Asomtavruli chars, no titlecase. */ if ((unsigned)(lowChar - 0x1C90) >= 0x30) { @@ -1605,16 +1566,6 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX < 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif return (ch1 - ch2); } } @@ -1656,16 +1607,6 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX < 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1705,16 +1646,6 @@ TclUtfCmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX < 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif return ch1 - ch2; } } @@ -1751,16 +1682,6 @@ TclUtfCasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX < 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1959,7 +1880,7 @@ TclUniCharNcmp( const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ size_t numChars) /* Number of unichars to compare. */ { -#if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) +#if defined(WORDS_BIGENDIAN) /* * We are definitely on a big-endian machine; memcmp() is safe */ @@ -1973,14 +1894,6 @@ TclUniCharNcmp( for ( ; numChars != 0; ucs++, uct++, numChars--) { if (*ucs != *uct) { -#if TCL_UTF_MAX < 4 - /* special case for handling upper surrogates */ - if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) { - return 1; - } else if (((*uct & 0xFC00) == 0xD800)) { - return -1; - } -#endif return (*ucs - *uct); } } @@ -2018,14 +1931,6 @@ TclUniCharNcasecmp( Tcl_UniChar lct = Tcl_UniCharToLower(*uct); if (lcs != lct) { -#if TCL_UTF_MAX < 4 - /* special case for handling upper surrogates */ - if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) { - return 1; - } else if (((lct & 0xFC00) == 0xD800)) { - return -1; - } -#endif return (lcs - lct); } } @@ -2753,71 +2658,6 @@ TclUniCharMatch( } /* - *--------------------------------------------------------------------------- - * - * TclUtfToUCS4 -- - * - * Extracts the 4-byte codepoint from the leading bytes of the - * Modified UTF-8 string "src". This is a utility routine to - * contain the surrogate gymnastics in one place. - * - * The caller must ensure that the source buffer is long enough that this - * routine does not run off the end and dereference non-existent memory - * looking for trail bytes. If the source buffer is known to be '\0' - * terminated, this cannot happen. Otherwise, the caller should call - * Tcl_UtfCharComplete() before calling this routine to ensure that - * enough bytes remain in the string. - * - * Results: - * Fills *usc4Ptr with the UCS4 code point and returns the number of bytes - * consumed from the source string. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -#if TCL_UTF_MAX < 4 -int -TclUtfToUCS4( - const char *src, /* The UTF-8 string. */ - int *ucs4Ptr) /* Filled with the UCS4 codepoint represented - * by the UTF-8 string. */ -{ - /* Make use of the #undef Tcl_UtfToUniChar above, which already handles UCS4. */ - return Tcl_UtfToUniChar(src, ucs4Ptr); -} - -int -TclUniCharToUCS4( - const Tcl_UniChar *src, /* The Tcl_UniChar string. */ - int *ucs4Ptr) /* Filled with the UCS4 codepoint represented - * by the Tcl_UniChar string. */ -{ - if (((src[0] & 0xFC00) == 0xD800) && ((src[1] & 0xFC00) == 0xDC00)) { - *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000; - return 2; - } - *ucs4Ptr = src[0]; - return 1; -} - -const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *src, const Tcl_UniChar *ptr) { - if (src <= ptr + 1) { - return ptr; - } - if (((src[-1] & 0xFC00) == 0xDC00) && ((src[-2] & 0xFC00) == 0xD800)) { - return src - 2; - } - return src - 1; -} - - - -#endif - -/* * Local Variables: * mode: c * c-basic-offset: 4 |