diff options
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 130 |
1 files changed, 111 insertions, 19 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 5f87ffa..dbe5e45 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -94,7 +94,7 @@ static const unsigned char totalBytes[256] = { int TclUtfCount( - int ch) /* The Tcl_UniChar whose size is returned. */ + int ch) /* The Unicode character whose size is returned. */ { if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { return 1; @@ -398,7 +398,7 @@ Tcl_UtfToUniCharDString( * appended to this previously initialized * DString. */ { - Tcl_UniChar ch, *w, *wString; + Tcl_UniChar ch = 0, *w, *wString; const char *p, *end; size_t oldLength; @@ -521,13 +521,13 @@ Tcl_NumUtfChars( * * Tcl_UtfFindFirst -- * - * Returns a pointer to the first occurance of the given Tcl_UniChar in - * the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the first occurance of the given Unicode character + * in the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, the - * return value is NULL. + * As above. If the Unicode character does not exist in the given string, + * the return value is NULL. * * Side effects: * None. @@ -538,14 +538,21 @@ Tcl_NumUtfChars( const char * Tcl_UtfFindFirst( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The Unicode character to search for. */ { - int len; + int len, fullchar; Tcl_UniChar find = 0; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX == 4 + if (!len) { + len += TclUtfToUniChar(src, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; + } +#endif + if (fullchar == ch) { return src; } if (*src == '\0') { @@ -560,12 +567,12 @@ Tcl_UtfFindFirst( * * Tcl_UtfFindLast -- * - * Returns a pointer to the last occurance of the given Tcl_UniChar in - * the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the last occurance of the given Unicode character + * in the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, the + * As above. If the Unicode character does not exist in the given string, the * return value is NULL. * * Side effects: @@ -577,16 +584,23 @@ Tcl_UtfFindFirst( const char * Tcl_UtfFindLast( const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Tcl_UniChar to search for. */ + int ch) /* The Unicode character to search for. */ { - int len; + int len, fullchar; Tcl_UniChar find = 0; const char *last; last = NULL; while (1) { len = TclUtfToUniChar(src, &find); - if (find == ch) { + fullchar = find; +#if TCL_UTF_MAX == 4 + if (!len) { + len += TclUtfToUniChar(src, &find); + fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; + } +#endif + if (fullchar == ch) { last = src; } if (*src == '\0') { @@ -686,7 +700,7 @@ Tcl_UtfPrev( * * Tcl_UniCharAtIndex -- * - * Returns the Unicode character represented at the specified character + * Returns the Tcl_UniChar represented at the specified character * (not byte) position in the UTF-8 string. * * Results: @@ -705,6 +719,7 @@ Tcl_UniCharAtIndex( { Tcl_UniChar ch = 0; + src += TclUtfToUniChar(src, &ch); while (index--) { src += TclUtfToUniChar(src, &ch); } @@ -1052,6 +1067,16 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif return (ch1 - ch2); } } @@ -1083,6 +1108,7 @@ Tcl_UtfNcasecmp( size_t numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1 = 0, ch2 = 0; + while (numChars-- > 0) { /* * n must be interpreted as chars, not bytes. @@ -1092,6 +1118,16 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1101,11 +1137,57 @@ Tcl_UtfNcasecmp( } return 0; } + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfCmp -- + * + * Compare UTF chars of string cs to string ct case sensitively. + * Replacement for strcmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ +{ + Tcl_UniChar ch1 = 0, ch2 = 0; + + while (*cs && *ct) { + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif + return ch1 - ch2; + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + /* *---------------------------------------------------------------------- * - * Tcl_UtfNcasecmp -- + * TclUtfCasecmp -- * * Compare UTF chars of string cs to string ct case insensitively. * Replacement for strcasecmp in Tcl core, in places where UTF-8 should @@ -1125,12 +1207,22 @@ TclUtfCasecmp( const char *cs, /* UTF string to compare to ct. */ const char *ct) /* UTF string cs is compared to. */ { - while (*cs && *ct) { - Tcl_UniChar ch1, ch2; + Tcl_UniChar ch1 = 0, ch2 = 0; + while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { +#if TCL_UTF_MAX == 4 + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } +#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { |
