diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 445 |
1 files changed, 43 insertions, 402 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 44a0a75..ca4a166 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -207,13 +207,18 @@ Invalid( Tcl_Size Tcl_UniCharToUtf( int ch, /* The Tcl_UniChar to be stored in the - * buffer. + * buffer. Can be or'ed with flag TCL_COMBINE. */ char *buf) /* Buffer in which the UTF-8 representation of * ch is stored. Must be large enough to hold the UTF-8 * character (at most 4 bytes). */ { + int flags = ch; + + if (ch >= TCL_COMBINE) { + ch &= (TCL_COMBINE - 1); + } if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; return 1; @@ -225,7 +230,8 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { - if ((ch & 0xF800) == 0xD800) { + if ((flags & TCL_COMBINE) && + ((ch & 0xF800) == 0xD800)) { if (ch & 0x0400) { /* Low surrogate */ if ( (0x80 == (0xC0 & buf[0])) @@ -302,7 +308,6 @@ three: *--------------------------------------------------------------------------- */ -#undef Tcl_UniCharToUtfDString char * Tcl_UniCharToUtfDString( const int *uniStr, /* Unicode string to convert to UTF-8. */ @@ -386,7 +391,7 @@ Tcl_Char16ToUtfDString( /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } - len = Tcl_UniCharToUtf(*w, p); + len = Tcl_UniCharToUtf(*w | TCL_COMBINE, p); p += len; if ((*w >= 0xD800) && (len < 3)) { len = 0; /* Indication that high surrogate was found */ @@ -417,15 +422,6 @@ Tcl_Char16ToUtfDString( * Tcl_UtfCharComplete() before calling this routine to ensure that * enough bytes remain in the string. * - * Special handling of Surrogate pairs is done: - * For any UTF-8 string containing a character outside of the BMP, the - * first call to this function will fill *chPtr with the high surrogate - * and generate a return value of 1. Calling Tcl_UtfToUniChar again - * will produce the low surrogate and a return value of 3. Because *chPtr - * is used to remember whether the high surrogate is already produced, it - * is recommended to initialize the variable it points to as 0 before - * the first call to Tcl_UtfToUniChar is done. - * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the * number of bytes from the UTF-8 string that were consumed. @@ -443,7 +439,6 @@ static const unsigned short cp1252[32] = { 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 }; -#undef Tcl_UtfToUniChar Tcl_Size Tcl_UtfToUniChar( const char *src, /* The UTF-8 string. */ @@ -644,7 +639,6 @@ Tcl_UtfToChar16( *--------------------------------------------------------------------------- */ -#undef Tcl_UtfToUniCharDString int * Tcl_UtfToUniCharDString( const char *src, /* UTF-8 string to convert to Unicode. */ @@ -807,7 +801,7 @@ Tcl_UtfCharComplete( */ Tcl_Size -TclNumUtfChars( +Tcl_NumUtfChars( const char *src, /* The UTF-8 string to measure. */ Tcl_Size length) /* The length of the string in bytes, or * negative value for strlen(src). */ @@ -817,7 +811,7 @@ TclNumUtfChars( if (length < 0) { /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ - while ((*src != '\0') && (i < INT_MAX)) { + while (*src != '\0') { src += TclUtfToUniChar(src, &ch); i++; } @@ -858,9 +852,8 @@ TclNumUtfChars( return i; } -#if !defined(TCL_NO_DEPRECATED) Tcl_Size -Tcl_NumUtfChars( +TclNumUtfChars( const char *src, /* The UTF-8 string to measure. */ Tcl_Size length) /* The length of the string in bytes, or * negative for strlen(src). */ @@ -870,7 +863,7 @@ Tcl_NumUtfChars( if (length < 0) { /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ - while ((*src != '\0') && (i < INT_MAX)) { + while (*src != '\0') { src += Tcl_UtfToChar16(src, &ch); i++; } @@ -910,7 +903,6 @@ Tcl_NumUtfChars( } return i; } -#endif /* *--------------------------------------------------------------------------- @@ -1189,20 +1181,16 @@ Tcl_UniCharAtIndex( const char *src, /* The UTF-8 string to dereference. */ Tcl_Size index) /* The position of the desired character. */ { - unsigned short ch = 0; + Tcl_UniChar ch = 0; int i = 0; if (index < 0) { return -1; } - while (index-- > 0) { - i = Tcl_UtfToChar16(src, &ch); + while (index--) { + i = TclUtfToUniChar(src, &ch); src += i; } - if ((ch >= 0xD800) && (i < 3)) { - /* Index points at character following high Surrogate */ - return -1; - } Tcl_UtfToUniChar(src, &i); return i; } @@ -1225,21 +1213,20 @@ Tcl_UniCharAtIndex( */ const char * -TclUtfAtIndex( +Tcl_UtfAtIndex( const char *src, /* The UTF-8 string. */ Tcl_Size index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; while (index-- > 0) { - src += TclUtfToUniChar(src, &ch); + src += Tcl_UtfToUniChar(src, &ch); } return src; } -#if !defined(TCL_NO_DEPRECATED) const char * -Tcl_UtfAtIndex( +TclUtfAtIndex( const char *src, /* The UTF-8 string. */ Tcl_Size index) /* The position of the desired character. */ { @@ -1257,7 +1244,6 @@ Tcl_UtfAtIndex( } return src; } -#endif /* *--------------------------------------------------------------------------- @@ -1353,7 +1339,7 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if ((len < TclUtfCount(upChar)) || ((upChar & ~0x7FF) == 0xD800)) { + if (len < TclUtfCount(upChar)) { memmove(dst, src, len); dst += len; } else { @@ -1406,7 +1392,7 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) { + if (len < TclUtfCount(lowChar)) { memmove(dst, src, len); dst += len; } else { @@ -1456,7 +1442,7 @@ Tcl_UtfToTitle( len = Tcl_UtfToUniChar(src, &ch); titleChar = Tcl_UniCharToTitle(ch); - if ((len < TclUtfCount(titleChar)) || ((titleChar & ~0x7FF) == 0xD800)) { + if (len < TclUtfCount(titleChar)) { memmove(dst, src, len); dst += len; } else { @@ -1472,7 +1458,7 @@ Tcl_UtfToTitle( lowChar = Tcl_UniCharToLower(lowChar); } - if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) { + if (len < TclUtfCount(lowChar)) { memmove(dst, src, len); dst += len; } else { @@ -1503,10 +1489,12 @@ Tcl_UtfToTitle( int TclpUtfNcmp2( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - unsigned long numBytes) /* Number of *bytes* to compare. */ + const void *csPtr, /* UTF string to compare to ct. */ + const void *ctPtr, /* UTF string cs is compared to. */ + size_t numBytes) /* Number of *bytes* to compare. */ { + const char *cs = (const char *)csPtr; + const char *ct = (const char *)ctPtr; /* * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes @@ -1536,8 +1524,8 @@ TclpUtfNcmp2( * * Tcl_UtfNcmp -- * - * Compare at most numChars UTF-16 chars of string cs to string ct. Both cs - * and ct are assumed to be at least numChars UTF-16 chars long. + * Compare at most numChars chars (not bytes) of string cs to string ct. Both cs + * and ct are assumed to be at least numChars chars long. * * Results: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. @@ -1548,12 +1536,11 @@ TclpUtfNcmp2( *---------------------------------------------------------------------- */ -#if !defined(TCL_NO_DEPRECATED) int -Tcl_UtfNcmp( +TclUtfNcmp( const char *cs, /* UTF string to compare to ct. */ const char *ct, /* UTF string cs is compared to. */ - unsigned long numChars) /* Number of UTF-16 chars to compare. */ + size_t numChars) /* Number of UTF-16 chars to compare. */ { unsigned short ch1 = 0, ch2 = 0; @@ -1565,7 +1552,7 @@ Tcl_UtfNcmp( while (numChars-- > 0) { /* - * n must be interpreted as UTF-16 chars, not bytes. This should be called + * n must be interpreted as chars, not bytes. This should be called * only when both strings are of at least n UTF-16 chars long (no need for \0 * check) */ @@ -1586,10 +1573,9 @@ Tcl_UtfNcmp( } return 0; } -#endif /* TCL_NO_DEPRECATED */ int -TclUtfNcmp( +Tcl_UtfNcmp( const char *cs, /* UTF string to compare to ct. */ const char *ct, /* UTF string cs is compared to. */ size_t numChars) /* Number of chars to compare. */ @@ -1617,46 +1603,14 @@ TclUtfNcmp( } return 0; } - -int -TclUtfNmemcmp( - const void *csPtr, /* UTF string to compare to ct. */ - const void *ctPtr, /* UTF string cs is compared to. */ - size_t numChars) /* Number of chars to compare. */ -{ - Tcl_UniChar ch1 = 0, ch2 = 0; - const char *cs = (const char *)csPtr; - const char *ct = (const char *)ctPtr; - - /* - * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the - * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001 - * (the byte 0x01.) - */ - - while (numChars-- > 0) { - /* - * n must be interpreted as chars, not bytes. This should be called - * only when both strings are of at least n chars long (no need for \0 - * check) - */ - - cs += TclUtfToUniChar(cs, &ch1); - ct += TclUtfToUniChar(ct, &ch2); - if (ch1 != ch2) { - return (ch1 - ch2); - } - } - return 0; -} /* *---------------------------------------------------------------------- * * Tcl_UtfNcasecmp -- * - * Compare at most numChars UTF-16 chars of string cs to string ct case - * insensitive. Both cs and ct are assumed to be at least numChars UTF-16 + * Compare at most numChars chars (not bytes) of string cs to string ct case + * insensitive. Both cs and ct are assumed to be at least numChars UTF * chars long. * * Results: @@ -1668,12 +1622,11 @@ TclUtfNmemcmp( *---------------------------------------------------------------------- */ -#if !defined(TCL_NO_DEPRECATED) int -Tcl_UtfNcasecmp( +TclUtfNcasecmp( const char *cs, /* UTF string to compare to ct. */ const char *ct, /* UTF string cs is compared to. */ - unsigned long numChars) /* Number of UTF-16 chars to compare. */ + size_t numChars) /* Number of UTF-16 chars to compare. */ { unsigned short ch1 = 0, ch2 = 0; @@ -1703,11 +1656,9 @@ Tcl_UtfNcasecmp( } return 0; } -#endif /* TCL_NO_DEPRECATED */ - int -TclUtfNcasecmp( +Tcl_UtfNcasecmp( const char *cs, /* UTF string to compare to ct. */ const char *ct, /* UTF string cs is compared to. */ size_t numChars) /* Number of chars to compare. */ @@ -1733,35 +1684,6 @@ TclUtfNcasecmp( return 0; } -int -TclUtfNcasememcmp( - const void *csPtr, /* UTF string to compare to ct. */ - const void *ctPtr, /* UTF string cs is compared to. */ - size_t numChars) /* Number of chars to compare. */ -{ - const char *cs = (const char *)csPtr; - const char *ct = (const char *)ctPtr; - Tcl_UniChar ch1 = 0, ch2 = 0; - - while (numChars-- > 0) { - /* - * n must be interpreted as chars, not bytes. - * This should be called only when both strings are of - * at least n chars long (no need for \0 check) - */ - cs += TclUtfToUniChar(cs, &ch1); - ct += TclUtfToUniChar(ct, &ch2); - if (ch1 != ch2) { - ch1 = Tcl_UniCharToLower(ch1); - ch2 = Tcl_UniCharToLower(ch2); - if (ch1 != ch2) { - return (ch1 - ch2); - } - } - } - return 0; -} - /* *---------------------------------------------------------------------- * @@ -1988,7 +1910,6 @@ Tcl_Char16Len( *---------------------------------------------------------------------- */ -#undef Tcl_UniCharLen Tcl_Size Tcl_UniCharLen( const int *uniStr) /* Unicode string to find length of. */ @@ -2005,7 +1926,7 @@ Tcl_UniCharLen( /* *---------------------------------------------------------------------- * - * Tcl_UniCharNcmp -- + * TclUniCharNcmp -- * * Compare at most numChars chars (not bytes) of string ucs to string uct. * Both ucs and uct are assumed to be at least numChars chars long. @@ -2046,73 +1967,10 @@ TclUniCharNcmp( #endif /* WORDS_BIGENDIAN */ } -int -TclUniCharNmemcmp( - const void *ucsPtr, /* Unicode string to compare to uct. */ - const void *uctPtr, /* Unicode string ucs is compared to. */ - size_t numChars) /* Number of chars (not bytes) to compare. */ -{ - const Tcl_UniChar *ucs = (const Tcl_UniChar *)ucsPtr; - const Tcl_UniChar *uct = (const Tcl_UniChar *)uctPtr; -#if defined(WORDS_BIGENDIAN) - /* - * We are definitely on a big-endian machine; memcmp() is safe - */ - - return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); - -#else /* !WORDS_BIGENDIAN */ - /* - * We can't simply call memcmp() because that is not lexically correct. - */ - - for ( ; numChars != 0; ucs++, uct++, numChars--) { - if (*ucs != *uct) { - return (*ucs - *uct); - } - } - return 0; -#endif /* WORDS_BIGENDIAN */ -} - -#if !defined(TCL_NO_DEPRECATED) -int -Tcl_UniCharNcmp( - const unsigned short *ucs, /* Unicode string to compare to uct. */ - const unsigned short *uct, /* Unicode string ucs is compared to. */ - unsigned long numChars) /* Number of chars (not bytes) to compare. */ -{ -#if defined(WORDS_BIGENDIAN) - /* - * We are definitely on a big-endian machine; memcmp() is safe - */ - - return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); - -#else /* !WORDS_BIGENDIAN */ - /* - * We can't simply call memcmp() because that is not lexically correct. - */ - - for ( ; numChars != 0; ucs++, uct++, numChars--) { - if (*ucs != *uct) { - /* special case for handling upper surrogates */ - if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) { - return 1; - } else if (((*uct & 0xFC00) == 0xD800)) { - return -1; - } - return (*ucs - *uct); - } - } - return 0; -#endif /* WORDS_BIGENDIAN */ -} -#endif /* *---------------------------------------------------------------------- * - * Tcl_UniCharNcasecmp -- + * TclUniCharNcasecmp -- * * Compare at most numChars chars (not bytes) of string ucs to string uct case * insensitive. Both ucs and uct are assumed to be at least numChars @@ -2145,54 +2003,6 @@ TclUniCharNcasecmp( } return 0; } - -int -TclUniCharNcasememcmp( - const void *ucsPtr, /* Unicode string to compare to uct. */ - const void *uctPtr, /* Unicode string ucs is compared to. */ - size_t numChars) /* Number of chars (not bytes) to compare. */ -{ - const Tcl_UniChar *ucs = (const Tcl_UniChar *)ucsPtr; - const Tcl_UniChar *uct = (const Tcl_UniChar *)uctPtr; - for ( ; numChars != 0; numChars--, ucs++, uct++) { - if (*ucs != *uct) { - Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); - Tcl_UniChar lct = Tcl_UniCharToLower(*uct); - - if (lcs != lct) { - return (lcs - lct); - } - } - } - return 0; -} - -#if !defined(TCL_NO_DEPRECATED) -int -Tcl_UniCharNcasecmp( - const unsigned short *ucs, /* Unicode string to compare to uct. */ - const unsigned short *uct, /* Unicode string ucs is compared to. */ - unsigned long numChars) /* Number of chars (not bytes) to compare. */ -{ - for ( ; numChars != 0; numChars--, ucs++, uct++) { - if (*ucs != *uct) { - unsigned short lcs = Tcl_UniCharToLower(*ucs); - unsigned short lct = Tcl_UniCharToLower(*uct); - - if (lcs != lct) { - /* special case for handling upper surrogates */ - if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) { - return 1; - } else if (((lct & 0xFC00) == 0xD800)) { - return -1; - } - return (lcs - lct); - } - } - } - return 0; -} -#endif /* *---------------------------------------------------------------------- @@ -2529,7 +2339,7 @@ Tcl_UniCharIsWordChar( /* *---------------------------------------------------------------------- * - * Tcl_UniCharCaseMatch -- + * TclUniCharCaseMatch -- * * See if a particular Unicode string matches a particular pattern. * Allows case insensitivity. This is the Unicode equivalent of the char* @@ -2715,175 +2525,6 @@ TclUniCharCaseMatch( uniPattern++; } } - -#if !defined(TCL_NO_DEPRECATED) -int -Tcl_UniCharCaseMatch( - const unsigned short *uniStr, /* Unicode String. */ - const unsigned short *uniPattern, - /* Pattern, which may contain special - * characters. */ - int nocase) /* 0 for case sensitive, 1 for insensitive */ -{ - unsigned short ch1 = 0, p; - - while (1) { - p = *uniPattern; - - /* - * See if we're at the end of both the pattern and the string. If so, - * we succeeded. If we're at the end of the pattern but not at the end - * of the string, we failed. - */ - - if (p == 0) { - return (*uniStr == 0); - } - if ((*uniStr == 0) && (p != '*')) { - return 0; - } - - /* - * Check for a "*" as the next pattern character. It matches any - * substring. We handle this by skipping all the characters up to the - * next matching one in the pattern, and then calling ourselves - * recursively for each postfix of string, until either we match or we - * reach the end of the string. - */ - - if (p == '*') { - /* - * Skip all successive *'s in the pattern - */ - - while (*(++uniPattern) == '*') { - /* empty body */ - } - p = *uniPattern; - if (p == 0) { - return 1; - } - if (nocase) { - p = Tcl_UniCharToLower(p); - } - while (1) { - /* - * Optimization for matching - cruise through the string - * quickly if the next char in the pattern isn't a special - * character - */ - - if ((p != '[') && (p != '?') && (p != '\\')) { - if (nocase) { - while (*uniStr && (p != *uniStr) - && (p != Tcl_UniCharToLower(*uniStr))) { - uniStr++; - } - } else { - while (*uniStr && (p != *uniStr)) { - uniStr++; - } - } - } - if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) { - return 1; - } - if (*uniStr == 0) { - return 0; - } - uniStr++; - } - } - - /* - * Check for a "?" as the next pattern character. It matches any - * single character. - */ - - if (p == '?') { - uniPattern++; - uniStr++; - continue; - } - - /* - * Check for a "[" as the next pattern character. It is followed by a - * list of characters that are acceptable, or by a range (two - * characters separated by "-"). - */ - - if (p == '[') { - unsigned short startChar, endChar; - - uniPattern++; - ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); - uniStr++; - while (1) { - if ((*uniPattern == ']') || (*uniPattern == 0)) { - return 0; - } - startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) - : *uniPattern); - uniPattern++; - if (*uniPattern == '-') { - uniPattern++; - if (*uniPattern == 0) { - return 0; - } - endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) - : *uniPattern); - uniPattern++; - if (((startChar <= ch1) && (ch1 <= endChar)) - || ((endChar <= ch1) && (ch1 <= startChar))) { - /* - * Matches ranges of form [a-z] or [z-a]. - */ - break; - } - } else if (startChar == ch1) { - break; - } - } - while (*uniPattern != ']') { - if (*uniPattern == 0) { - uniPattern--; - break; - } - uniPattern++; - } - uniPattern++; - continue; - } - - /* - * If the next pattern character is '\', just strip off the '\' so we - * do exact matching on the character that follows. - */ - - if (p == '\\') { - if (*(++uniPattern) == '\0') { - return 0; - } - } - - /* - * There's no special character. Just make sure that the next bytes of - * each string match. - */ - - if (nocase) { - if (Tcl_UniCharToLower(*uniStr) != - Tcl_UniCharToLower(*uniPattern)) { - return 0; - } - } else if (*uniStr != *uniPattern) { - return 0; - } - uniStr++; - uniPattern++; - } -} -#endif /* *---------------------------------------------------------------------- @@ -2892,7 +2533,7 @@ Tcl_UniCharCaseMatch( * * See if a particular Unicode string matches a particular pattern. * Allows case insensitivity. This is the Unicode equivalent of the char* - * Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted + * Tcl_StringCaseMatch. This variant of TclUniCharCaseMatch uses counted * Strings, so embedded NULLs are allowed. * * Results: |