diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-05-01 19:42:53 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-05-01 19:42:53 (GMT) |
commit | 190852002f644f02a6e13c6a7c9017c3f21e1003 (patch) | |
tree | bed7c42f0cd850e442f6cc56d59c3b6b5efe258a /generic/tclUtf.c | |
parent | 6c4b78cfa8c06ea5963591778902da74850d1985 (diff) | |
download | tcl-190852002f644f02a6e13c6a7c9017c3f21e1003.zip tcl-190852002f644f02a6e13c6a7c9017c3f21e1003.tar.gz tcl-190852002f644f02a6e13c6a7c9017c3f21e1003.tar.bz2 |
Remove internal use of TCL_UTF_MAX=3 as much as possible, without compromising existing TIPs
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 154 |
1 files changed, 18 insertions, 136 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 6e14689..e3ee84a 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -408,7 +408,7 @@ Tcl_Char16ToUtfDString( * Tcl_UtfCharComplete() before calling this routine to ensure that * enough bytes remain in the string. * - * If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done: + * Special handling of Surrogate pairs is done: * For any UTF-8 string containing a character outside of the BMP, the * first call to this function will fill *chPtr with the high surrogate * and generate a return value of 1. Calling Tcl_UtfToUniChar again @@ -676,11 +676,11 @@ Tcl_UtfToUniCharDString( endPtr = src + length; optPtr = endPtr - 4; while (p <= optPtr) { - p += TclUtfToUCS4(p, &ch); + p += Tcl_UtfToUniChar(p, &ch); *w++ = ch; } while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { - p += TclUtfToUCS4(p, &ch); + p += Tcl_UtfToUniChar(p, &ch); *w++ = ch; } while (p < endPtr) { @@ -849,7 +849,7 @@ TclNumUtfChars( return i; } -#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +#if !defined(TCL_NO_DEPRECATED) #undef Tcl_NumUtfChars int Tcl_NumUtfChars( @@ -929,7 +929,7 @@ Tcl_UtfFindFirst( int ch) /* The Unicode character to search for. */ { while (1) { - int find, len = TclUtfToUCS4(src, &find); + int find, len = Tcl_UtfToUniChar(src, &find); if (find == ch) { return src; @@ -968,7 +968,7 @@ Tcl_UtfFindLast( const char *last = NULL; while (1) { - int find, len = TclUtfToUCS4(src, &find); + int find, len = Tcl_UtfToUniChar(src, &find); if (find == ch) { last = src; @@ -1195,7 +1195,7 @@ Tcl_UniCharAtIndex( /* Index points at character following high Surrogate */ return -1; } - TclUtfToUCS4(src, &i); + Tcl_UtfToUniChar(src, &i); return i; } @@ -1205,9 +1205,7 @@ Tcl_UniCharAtIndex( * Tcl_UtfAtIndex -- * * Returns a pointer to the specified character (not byte) position in - * the UTF-8 string. 
If TCL_UTF_MAX < 4, characters > U+FFFF count as - * 2 positions, but then the pointer should never be placed between - * the two positions. + * the UTF-8 string. * * Results: * As above. @@ -1218,11 +1216,6 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ -#if TCL_UTF_MAX < 4 -# undef Tcl_UtfToUniChar -# define Tcl_UtfToUniChar Tcl_UtfToChar16 -#endif - const char * TclUtfAtIndex( const char *src, /* The UTF-8 string. */ @@ -1235,16 +1228,10 @@ TclUtfAtIndex( len = (Tcl_UtfToUniChar)(src, &ch); src += len; } -#if TCL_UTF_MAX < 4 - if ((ch >= 0xD800) && (len < 3)) { - /* Index points at character following high Surrogate */ - src += (Tcl_UtfToUniChar)(src, &ch); - } -#endif return src; } -#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +#if !defined(TCL_NO_DEPRECATED) #undef Tcl_UtfAtIndex const char * Tcl_UtfAtIndex( @@ -1353,7 +1340,7 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { - len = TclUtfToUCS4(src, &ch); + len = Tcl_UtfToUniChar(src, &ch); upChar = Tcl_UniCharToUpper(ch); /* @@ -1406,7 +1393,7 @@ Tcl_UtfToLower( src = dst = str; while (*src) { - len = TclUtfToUCS4(src, &ch); + len = Tcl_UtfToUniChar(src, &ch); lowChar = Tcl_UniCharToLower(ch); /* @@ -1462,7 +1449,7 @@ Tcl_UtfToTitle( src = dst = str; if (*src) { - len = TclUtfToUCS4(src, &ch); + len = Tcl_UtfToUniChar(src, &ch); titleChar = Tcl_UniCharToTitle(ch); if ((len < TclUtfCount(titleChar)) || ((titleChar & ~0x7FF) == 0xD800)) { @@ -1474,7 +1461,7 @@ Tcl_UtfToTitle( src += len; } while (*src) { - len = TclUtfToUCS4(src, &ch); + len = Tcl_UtfToUniChar(src, &ch); lowChar = ch; /* Special exception for Georgian Asomtavruli chars, no titlecase. 
*/ if ((unsigned)(lowChar - 0x1C90) >= 0x30) { @@ -1581,16 +1568,6 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX < 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif return (ch1 - ch2); } } @@ -1632,16 +1609,6 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX < 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1681,16 +1648,6 @@ TclUtfCmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX < 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif return ch1 - ch2; } } @@ -1727,16 +1684,6 @@ TclUtfCasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX < 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1935,7 +1882,7 @@ TclUniCharNcmp( const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. 
*/ { -#if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) +#if defined(WORDS_BIGENDIAN) /* * We are definitely on a big-endian machine; memcmp() is safe */ @@ -1956,14 +1903,14 @@ TclUniCharNcmp( #endif /* WORDS_BIGENDIAN */ } -#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +#if !defined(TCL_NO_DEPRECATED) int Tcl_UniCharNcmp( const unsigned short *ucs, /* Unicode string to compare to uct. */ const unsigned short *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { -#if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3) +#if defined(WORDS_BIGENDIAN) /* * We are definitely on a big-endian machine; memcmp() is safe */ @@ -2027,7 +1974,7 @@ TclUniCharNcasecmp( return 0; } -#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +#if !defined(TCL_NO_DEPRECATED) int Tcl_UniCharNcasecmp( const unsigned short *ucs, /* Unicode string to compare to uct. */ @@ -2583,7 +2530,7 @@ TclUniCharCaseMatch( } } -#if (TCL_UTF_MAX > 3) && !defined(TCL_NO_DEPRECATED) +#if !defined(TCL_NO_DEPRECATED) int Tcl_UniCharCaseMatch( const unsigned short *uniStr, /* Unicode String. */ @@ -2945,71 +2892,6 @@ TclUniCharMatch( } /* - *--------------------------------------------------------------------------- - * - * TclUtfToUCS4 -- - * - * Extracts the 4-byte codepoint from the leading bytes of the - * Modified UTF-8 string "src". This is a utility routine to - * contain the surrogate gymnastics in one place. - * - * The caller must ensure that the source buffer is long enough that this - * routine does not run off the end and dereference non-existent memory - * looking for trail bytes. If the source buffer is known to be '\0' - * terminated, this cannot happen. Otherwise, the caller should call - * Tcl_UtfCharComplete() before calling this routine to ensure that - * enough bytes remain in the string. - * - * Results: - * Fills *usc4Ptr with the UCS4 code point and returns the number of bytes - * consumed from the source string. 
- * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -#if TCL_UTF_MAX < 4 -int -TclUtfToUCS4( - const char *src, /* The UTF-8 string. */ - int *ucs4Ptr) /* Filled with the UCS4 codepoint represented - * by the UTF-8 string. */ -{ -# undef Tcl_UtfToUniChar - return Tcl_UtfToUniChar(src, ucs4Ptr); -} - -int -TclUniCharToUCS4( - const Tcl_UniChar *src, /* The Tcl_UniChar string. */ - int *ucs4Ptr) /* Filled with the UCS4 codepoint represented - * by the Tcl_UniChar string. */ -{ - if (((src[0] & 0xFC00) == 0xD800) && ((src[1] & 0xFC00) == 0xDC00)) { - *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000; - return 2; - } - *ucs4Ptr = src[0]; - return 1; -} - -const Tcl_UniChar *TclUCS4Prev(const Tcl_UniChar *src, const Tcl_UniChar *ptr) { - if (src <= ptr + 1) { - return ptr; - } - if (((src[-1] & 0xFC00) == 0xDC00) && ((src[-2] & 0xFC00) == 0xD800)) { - return src - 2; - } - return src - 1; -} - - - -#endif - -/* * Local Variables: * mode: c * c-basic-offset: 4 |