diff options
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 135 |
1 files changed, 112 insertions, 23 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 25cc2d1..43636b4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -94,7 +94,7 @@ static const unsigned char totalBytes[256] = { int TclUtfCount( - int ch) /* The Tcl_UniChar whose size is returned. */ + int ch) /* The Unicode character whose size is returned. */ { if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { return 1; @@ -398,7 +398,7 @@ Tcl_UtfToUniCharDString( * appended to this previously initialized * DString. */ { - Tcl_UniChar ch, *w, *wString; + Tcl_UniChar ch = 0, *w, *wString; const char *p, *end; int oldLength; @@ -522,13 +522,13 @@ Tcl_NumUtfChars( * * Tcl_UtfFindFirst -- * - * Returns a pointer to the first occurance of the given Tcl_UniChar in - * the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the first occurance of the given Unicode character + * in the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, the - * return value is NULL. + * As above. If the Unicode character does not exist in the given string, + * the return value is NULL. * * Side effects: * None. @@ -539,14 +539,21 @@ Tcl_NumUtfChars( const char * Tcl_UtfFindFirst( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The Unicode character to search for. */ { - int len; + int len, fullchar; Tcl_UniChar find = 0; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX == 4 + if (!len) { + len += TclUtfToUniChar(src, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; + } +#endif + if (fullchar == ch) { return src; } if (*src == '\0') { @@ -561,12 +568,12 @@ Tcl_UtfFindFirst( * * Tcl_UtfFindLast -- * - * Returns a pointer to the last occurance of the given Tcl_UniChar in - * the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the last occurance of the given Unicode character + * in the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, the + * As above. If the Unicode character does not exist in the given string, the * return value is NULL. * * Side effects: @@ -578,16 +585,23 @@ Tcl_UtfFindFirst( const char * Tcl_UtfFindLast( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The Unicode character to search for. */ { - int len; + int len, fullchar; Tcl_UniChar find = 0; const char *last; last = NULL; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX == 4 + if (!len) { + len += TclUtfToUniChar(src, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; + } +#endif + if (fullchar == ch) { last = src; } if (*src == '\0') { @@ -687,7 +701,7 @@ Tcl_UtfPrev( * * Tcl_UniCharAtIndex -- * - * Returns the Unicode character represented at the specified character + * Returns the Tcl_UniChar represented at the specified character * (not byte) position in the UTF-8 string. * * Results: @@ -706,8 +720,7 @@ Tcl_UniCharAtIndex( { Tcl_UniChar ch = 0; - while (index >= 0) { - index--; + while (index-- >= 0) { src += TclUtfToUniChar(src, &ch); } return ch; @@ -737,8 +750,7 @@ Tcl_UtfAtIndex( { Tcl_UniChar ch = 0; - while (index > 0) { - index--; + while (index-- > 0) { src += TclUtfToUniChar(src, &ch); } return src; @@ -1053,6 +1065,16 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif return (ch1 - ch2); } } @@ -1084,6 +1106,7 @@ Tcl_UtfNcasecmp( unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1 = 0, ch2 = 0; + while (numChars-- > 0) { /* * n must be interpreted as chars, not bytes. @@ -1093,6 +1116,16 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1102,11 +1135,57 @@ Tcl_UtfNcasecmp( } return 0; } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfCmp -- + * + * Compare UTF chars of string cs to string ct case sensitively. + * Replacement for strcmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ +{ + Tcl_UniChar ch1 = 0, ch2 = 0; + + while (*cs && *ct) { + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif + return ch1 - ch2; + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + /* *---------------------------------------------------------------------- * - * Tcl_UtfNcasecmp -- + * TclUtfCasecmp -- * * Compare UTF chars of string cs to string ct case insensitively. * Replacement for strcasecmp in Tcl core, in places where UTF-8 should @@ -1126,12 +1205,22 @@ TclUtfCasecmp( const char *cs, /* UTF string to compare to ct. */ const char *ct) /* UTF string cs is compared to. */ { - while (*cs && *ct) { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; + while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { |
