diff options
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 1748 |
1 files changed, 346 insertions, 1402 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 2fa0e80..e5497a4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -3,7 +3,7 @@ * * Routines for manipulating UTF-8 strings. * - * Copyright © 1997-1998 Sun Microsystems, Inc. + * Copyright (c) 1997-1998 Sun Microsystems, Inc. * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. @@ -26,7 +26,7 @@ #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) -#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT)) +#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE)) #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) @@ -55,44 +55,46 @@ #define UNICODE_SELF 0x80 /* - * The following structures are used when mapping between Unicode and + * The following structures are used when mapping between Unicode (UCS-2) and * UTF-8. */ -static const unsigned char totalBytes[256] = { +static CONST unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#if TCL_UTF_MAX > 3 + 4,4,4,4,4,4,4,4, +#else + 1,1,1,1,1,1,1,1, +#endif +#if TCL_UTF_MAX > 4 + 5,5,5,5, +#else + 1,1,1,1, +#endif +#if TCL_UTF_MAX > 5 + 6,6,6,6 +#else + 1,1,1,1 +#endif }; -static const unsigned char complete[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -/* End of "continuation byte section" */ - 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 -}; - /* * Functions used only in this module. */ -static int Invalid(const char *src); +static int UtfCount(int ch); /* *--------------------------------------------------------------------------- * - * TclUtfCount -- + * UtfCount -- * * Find the number of bytes in the Utf character "ch". * @@ -105,98 +107,44 @@ static int Invalid(const char *src); *--------------------------------------------------------------------------- */ -int -TclUtfCount( - int ch) /* The Unicode character whose size is returned. */ +INLINE static int +UtfCount( + int ch) /* The Tcl_UniChar whose size is returned. */ { - if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { + if ((ch > 0) && (ch < UNICODE_SELF)) { return 1; } if (ch <= 0x7FF) { return 2; } - if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) { + if (ch <= 0xFFFF) { + return 3; + } +#if TCL_UTF_MAX > 3 + if (ch <= 0x1FFFFF) { return 4; } + if (ch <= 0x3FFFFFF) { + return 5; + } + if (ch <= 0x7FFFFFFF) { + return 6; + } +#endif return 3; } - -/* - *--------------------------------------------------------------------------- - * - * Invalid -- - * - * Given a pointer to a two-byte prefix of a well-formed UTF-8 byte - * sequence (a lead byte followed by a trail byte) this routine - * examines those two bytes to determine whether the sequence is - * invalid in UTF-8. This might be because it is an overlong - * encoding, or because it encodes something out of the proper range. - * - * Given a pointer to the bytes \xF8 or \xFC , this routine will - * try to read beyond the end of the "bounds" table. Callers must - * prevent this. - * - * Given a pointer to something else (an ASCII byte, a trail byte, - * or another byte that can never begin a valid byte sequence such - * as \xF5) this routine returns false. That makes the routine poorly - * named, as it does not detect and report all invalid sequences. - * - * Callers have to take care that this routine does something useful - * for their needs. - * - * Results: - * A boolean. - *--------------------------------------------------------------------------- - */ -static const unsigned char bounds[28] = { - 0x80, 0x80, /* \xC0 accepts \x80 only */ - 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, - 0x80, 0xBF, /* (\xC4 - \xDC) -- all sequences valid */ - 0xA0, 0xBF, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ - 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */ - 0x90, 0xBF, /* \xF0\x80 through \xF0\x8F are invalid prefixes */ - 0x80, 0x8F /* \xF4\x90 and higher are invalid prefixes */ -}; - -static int -Invalid( - const char *src) /* Points to lead byte of a UTF-8 byte sequence */ -{ - unsigned char byte = UCHAR(*src); - int index; - - if ((byte & 0xC3) == 0xC0) { - /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */ - index = (byte - 0xC0) >> 1; - if (UCHAR(src[1]) < bounds[index] || UCHAR(src[1]) > bounds[index+1]) { - /* Out of bounds - report invalid. */ - return 1; - } - } - return 0; -} - /* *--------------------------------------------------------------------------- * * Tcl_UniCharToUtf -- * - * Stores the given Tcl_UniChar as a sequence of UTF-8 bytes in the provided - * buffer. Equivalent to Plan 9 runetochar(). - * - * Surrogate pairs are handled as follows: When ch is a high surrogate, - * the first byte of the 4-byte UTF-8 sequence is stored in the buffer and - * the function returns 1. If the function is called again with a low - * surrogate and the same buffer, the remaining 3 bytes of the 4-byte - * UTF-8 sequence are produced. - * - * If no low surrogate follows the high surrogate (which is actually illegal), - * calling Tcl_UniCharToUtf again with ch being -1 produces a 3-byte UTF-8 - * sequence representing the high surrogate. + * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the + * provided buffer. Equivalent to Plan 9 runetochar(). * * Results: - * Returns the number of bytes stored into the buffer. + * The return values is the number of bytes in the buffer that were + * consumed. * * Side effects: * None. @@ -204,84 +152,63 @@ Invalid( *--------------------------------------------------------------------------- */ -Tcl_Size +INLINE int Tcl_UniCharToUtf( - int ch, /* The Tcl_UniChar to be stored in the - * buffer. - */ - char *buf) /* Buffer in which the UTF-8 representation of - * ch is stored. Must be large enough to hold the UTF-8 - * character (at most 4 bytes). - */ + int ch, /* The Tcl_UniChar to be stored in the + * buffer. */ + char *buf) /* Buffer in which the UTF-8 representation of + * the Tcl_UniChar is stored. Buffer must be + * large enough to hold the UTF-8 character + * (at most TCL_UTF_MAX bytes). */ { - if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { + if ((ch > 0) && (ch < UNICODE_SELF)) { buf[0] = (char) ch; return 1; } if (ch >= 0) { if (ch <= 0x7FF) { - buf[1] = (char) (0x80 | (0x3F & ch)); - buf[0] = (char) (0xC0 | (ch >> 6)); + buf[1] = (char) ((ch | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 6) | 0xC0); return 2; } if (ch <= 0xFFFF) { - if ((ch & 0xF800) == 0xD800) { - if (ch & 0x0400) { - /* Low surrogate */ - if ( (0x80 == (0xC0 & buf[0])) - && (0 == (0xCF & buf[1]))) { - /* Previous Tcl_UniChar was a high surrogate, so combine */ - buf[2] = (char) (0x80 | (0x3F & ch)); - buf[1] |= (char) (0x80 | (0x0F & (ch >> 6))); - return 3; - } - /* Previous Tcl_UniChar was not a high surrogate, so just output */ - } else { - /* High surrogate */ - - /* Add 0x10000 to the raw number encoded in the surrogate - * pair in order to get the code point. - */ - ch += 0x40; - - /* Fill buffer with specific 3-byte (invalid) byte combination, - so following low surrogate can recognize it and combine */ - buf[2] = (char) ((ch << 4) & 0x30); - buf[1] = (char) (0x80 | (0x3F & (ch >> 2))); - buf[0] = (char) (0xF0 | (0x07 & (ch >> 8))); - return 1; - } - } - goto three; + three: + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; } - if (ch <= 0x10FFFF) { - buf[3] = (char) (0x80 | (0x3F & ch)); - buf[2] = (char) (0x80 | (0x3F & (ch >> 6))); - buf[1] = (char) (0x80 | (0x3F & (ch >> 12))); - buf[0] = (char) (0xF0 | (ch >> 18)); + +#if TCL_UTF_MAX > 3 + if (ch <= 0x1FFFFF) { + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } - } else if (ch == -1) { - if ( (0x80 == (0xC0 & buf[0])) - && (0 == (0xCF & buf[1])) - && (0xF0 == (0xF8 & buf[-1]))) { - ch = 0xD7C0 - + ((0x07 & buf[-1]) << 8) - + ((0x3F & buf[0]) << 2) - + ((0x30 & buf[1]) >> 4); - buf[1] = (char) (0x80 | (0x3F & ch)); - buf[0] = (char) (0x80 | (0x3F & (ch >> 6))); - buf[-1] = (char) (0xE0 | (ch >> 12)); - return 2; + if (ch <= 0x3FFFFFF) { + buf[4] = (char) ((ch | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 24) | 0xF8); + return 5; + } + if (ch <= 0x7FFFFFFF) { + buf[5] = (char) ((ch | 0x80) & 0xBF); + buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 30) | 0xFC); + return 6; } +#endif } ch = 0xFFFD; -three: - buf[2] = (char) (0x80 | (0x3F & ch)); - buf[1] = (char) (0x80 | (0x3F & (ch >> 6))); - buf[0] = (char) (0xE0 | (ch >> 12)); - return 3; + goto three; } /* @@ -302,41 +229,29 @@ three: *--------------------------------------------------------------------------- */ -#undef Tcl_UniCharToUtfDString char * Tcl_UniCharToUtfDString( - const int *uniStr, /* Unicode string to convert to UTF-8. */ - Tcl_Size uniLength, /* Length of Unicode string. Negative for nul - * terminated string */ + CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + int uniLength, /* Length of Unicode string in Tcl_UniChars + * (must be >= 0). */ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - const int *w, *wEnd; + CONST Tcl_UniChar *w, *wEnd; char *p, *string; - Tcl_Size oldLength; + int oldLength; /* - * UTF-8 string length in bytes will be <= Unicode string length * 4. + * UTF-8 string length in bytes will be <= Unicode string length * + * TCL_UTF_MAX. */ - if (uniStr == NULL) { - return NULL; - } - if (uniLength < 0) { - uniLength = 0; - w = uniStr; - while (*w != '\0') { - uniLength++; - w++; - } - } oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4); + Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); string = Tcl_DStringValue(dsPtr) + oldLength; p = string; wEnd = uniStr + uniLength; - for (w = uniStr; w < wEnd; ) { p += Tcl_UniCharToUtf(*w, p); w++; @@ -346,61 +261,6 @@ Tcl_UniCharToUtfDString( return string; } -char * -Tcl_Char16ToUtfDString( - const unsigned short *uniStr,/* Utf-16 string to convert to UTF-8. */ - Tcl_Size uniLength, /* Length of Utf-16 string. */ - Tcl_DString *dsPtr) /* UTF-8 representation of string is appended - * to this previously initialized DString. */ -{ - const unsigned short *w, *wEnd; - char *p, *string; - Tcl_Size oldLength; - int len = 1; - - /* - * UTF-8 string length in bytes will be <= Utf16 string length * 3. - */ - - if (uniStr == NULL) { - return NULL; - } - if (uniLength < 0) { - - uniLength = 0; - w = uniStr; - while (*w != '\0') { - uniLength++; - w++; - } - } - oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 3); - string = Tcl_DStringValue(dsPtr) + oldLength; - - p = string; - wEnd = uniStr + uniLength; - - for (w = uniStr; w < wEnd; ) { - if (!len && ((*w & 0xFC00) != 0xDC00)) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } - len = Tcl_UniCharToUtf(*w, p); - p += len; - if ((*w >= 0xD800) && (len < 3)) { - len = 0; /* Indication that high surrogate was found */ - } - w++; - } - if (!len) { - /* Special case for handling high surrogates. */ - p += Tcl_UniCharToUtf(-1, p); - } - Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); - - return string; -} /* *--------------------------------------------------------------------------- * @@ -417,15 +277,6 @@ Tcl_Char16ToUtfDString( * Tcl_UtfCharComplete() before calling this routine to ensure that * enough bytes remain in the string. * - * Special handling of Surrogate pairs is done: - * For any UTF-8 string containing a character outside of the BMP, the - * first call to this function will fill *chPtr with the high surrogate - * and generate a return value of 1. Calling Tcl_UtfToUniChar again - * will produce the low surrogate and a return value of 3. Because *chPtr - * is used to remember whether the high surrogate is already produced, it - * is recommended to initialize the variable it points to as 0 before - * the first call to Tcl_UtfToUniChar is done. - * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the * number of bytes from the UTF-8 string that were consumed. @@ -436,192 +287,89 @@ Tcl_Char16ToUtfDString( *--------------------------------------------------------------------------- */ -static const unsigned short cp1252[32] = { - 0x20AC, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, - 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, - 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, - 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 -}; - -Tcl_Size +int Tcl_UtfToUniChar( - const char *src, /* The UTF-8 string. */ - int *chPtr)/* Filled with the Unicode character represented by + register CONST char *src, /* The UTF-8 string. */ + register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { - int byte; + register int byte; /* - * Unroll 1 to 4 byte UTF-8 sequences. + * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. */ byte = *((unsigned char *) src); if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Treats naked trail bytes 0x80 to 0x9F as valid characters from - * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8> - * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid + * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid * characters representing themselves. */ - if ((unsigned)(byte-0x80) < (unsigned)0x20) { - *chPtr = cp1252[byte-0x80]; - } else { - *chPtr = byte; - } + *chPtr = (Tcl_UniChar) byte; return 1; } else if (byte < 0xE0) { - if ((byte != 0xC1) && ((src[1] & 0xC0) == 0x80)) { + if ((src[1] & 0xC0) == 0x80) { /* * Two-byte-character lead-byte followed by a trail-byte. */ - *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F)); - if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { - return 2; - } + *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + return 2; } /* * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ + + *chPtr = (Tcl_UniChar) byte; + return 1; } else if (byte < 0xF0) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* * Three-byte-character lead byte followed by two trail bytes. */ - *chPtr = (((byte & 0x0F) << 12) + *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); - if (*chPtr > 0x7FF) { - return 3; - } + return 3; } /* * A three-byte-character lead-byte not followed by two trail-bytes * represents itself. */ - } else if (byte < 0xF5) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { - /* - * Four-byte-character lead byte followed by three trail bytes. - */ - *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); - if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { - return 4; - } - } - - /* - * A four-byte-character lead-byte not followed by three trail-bytes - * represents itself. - */ - } - *chPtr = byte; - return 1; -} - -Tcl_Size -Tcl_UtfToChar16( - const char *src, /* The UTF-8 string. */ - unsigned short *chPtr)/* Filled with the Tcl_UniChar represented by - * the UTF-8 string. This could be a surrogate too. */ -{ - unsigned short byte; - - /* - * Unroll 1 to 4 byte UTF-8 sequences. - */ - - byte = UCHAR(*src); - if (byte < 0xC0) { - /* - * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Treats naked trail bytes 0x80 to 0x9F as valid characters from - * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8> - * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid - * characters representing themselves. - */ - - /* If *chPtr contains a high surrogate (produced by a previous - * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation - * bytes, then we must produce a follow-up low surrogate. We only - * do that if the high surrogate matches the bits we encounter. - */ - if (((byte & 0xC0) == 0x80) - && ((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) - && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) - && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))) { - *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; - return 3; - } - if ((unsigned)(byte-0x80) < (unsigned)0x20) { - *chPtr = cp1252[byte-0x80]; - } else { - *chPtr = byte; - } + *chPtr = (Tcl_UniChar) byte; return 1; - } else if (byte < 0xE0) { - if ((byte != 0xC1) && ((src[1] & 0xC0) == 0x80)) { - /* - * Two-byte-character lead-byte followed by a trail-byte. - */ - - *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F)); - if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { - return 2; - } - } - - /* - * A two-byte-character lead-byte not followed by trail-byte - * represents itself. - */ - } else if (byte < 0xF0) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { - /* - * Three-byte-character lead byte followed by two trail bytes. - */ - - *chPtr = (((byte & 0x0F) << 12) - | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); - if (*chPtr > 0x7FF) { - return 3; - } - } + } +#if TCL_UTF_MAX > 3 + { + int ch, total, trail; - /* - * A three-byte-character lead-byte not followed by two trail-bytes - * represents itself. - */ - } else if (byte < 0xF5) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { - /* - * Four-byte-character lead byte followed by at least two trail bytes. - * We don't test the validity of 3th trail byte, see [ed29806ba] - */ - Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) - | ((src[2] & 0x3F) >> 4)) - 0x40; - if (high < 0x400) { - /* produce high surrogate, advance source pointer */ - *chPtr = 0xD800 + high; - return 1; - } - /* out of range, < 0x10000 or > 0x10FFFF */ + total = totalBytes[byte]; + trail = total - 1; + if (trail > 0) { + ch = byte & (0x3F >> trail); + do { + src++; + if ((*src & 0xC0) != 0x80) { + *chPtr = byte; + return 1; + } + ch <<= 6; + ch |= (*src & 0x3F); + trail--; + } while (trail > 0); + *chPtr = ch; + return total; } - - /* - * A four-byte-character lead-byte not followed by three trail-bytes - * represents itself. - */ } +#endif - *chPtr = byte; + *chPtr = (Tcl_UniChar) byte; return 1; } @@ -643,118 +391,42 @@ Tcl_UtfToChar16( *--------------------------------------------------------------------------- */ -#undef Tcl_UtfToUniCharDString -int * +Tcl_UniChar * Tcl_UtfToUniCharDString( - const char *src, /* UTF-8 string to convert to Unicode. */ - Tcl_Size length, /* Length of UTF-8 string in bytes, or -1 for + CONST char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for * strlen(). */ Tcl_DString *dsPtr) /* Unicode representation of string is * appended to this previously initialized * DString. */ { - int ch = 0, *w, *wString; - const char *p; - Tcl_Size oldLength; - /* Pointer to the end of string. Never read endPtr[0] */ - const char *endPtr = src + length; - /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - TCL_UTF_MAX; - - if (src == NULL) { - return NULL; - } - if (length < 0) { - length = strlen(src); - } - - /* - * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in - * bytes. - */ - - oldLength = Tcl_DStringLength(dsPtr); - - Tcl_DStringSetLength(dsPtr, - oldLength + ((length + 1) * sizeof(int))); - wString = (int *) (Tcl_DStringValue(dsPtr) + oldLength); - - w = wString; - p = src; - endPtr = src + length; - optPtr = endPtr - 4; - while (p <= optPtr) { - p += TclUtfToUniChar(p, &ch); - *w++ = ch; - } - while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) { - p += TclUtfToUniChar(p, &ch); - *w++ = ch; - } - while (p < endPtr) { - *w++ = UCHAR(*p++); - } - *w = '\0'; - Tcl_DStringSetLength(dsPtr, - oldLength + ((char *) w - (char *) wString)); - - return wString; -} + Tcl_UniChar *w, *wString; + CONST char *p, *end; + int oldLength; -unsigned short * -Tcl_UtfToChar16DString( - const char *src, /* UTF-8 string to convert to Unicode. */ - Tcl_Size length, /* Length of UTF-8 string in bytes, or -1 for - * strlen(). */ - Tcl_DString *dsPtr) /* Unicode representation of string is - * appended to this previously initialized - * DString. */ -{ - unsigned short ch = 0, *w, *wString; - const char *p; - Tcl_Size oldLength; - /* Pointer to the end of string. Never read endPtr[0] */ - const char *endPtr = src + length; - /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - TCL_UTF_MAX; - - if (src == NULL) { - return NULL; - } if (length < 0) { length = strlen(src); } /* - * Unicode string length in WCHARs will be <= UTF-8 string length in + * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in * bytes. */ oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, - oldLength + ((length + 1) * sizeof(unsigned short))); - wString = (unsigned short *) (Tcl_DStringValue(dsPtr) + oldLength); + (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); + wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); w = wString; - p = src; - endPtr = src + length; - optPtr = endPtr - 3; - while (p <= optPtr) { - p += Tcl_UtfToChar16(p, &ch); - *w++ = ch; - } - while (p < endPtr) { - if (Tcl_UtfCharComplete(p, endPtr-p)) { - p += Tcl_UtfToChar16(p, &ch); - *w++ = ch; - } else { - *w++ = UCHAR(*p++); - } + end = src + length; + for (p = src; p < end; ) { + p += TclUtfToUniChar(p, w); + w++; } *w = '\0'; Tcl_DStringSetLength(dsPtr, - oldLength + ((char *) w - (char *) wString)); + (oldLength + ((char *) w - (char *) wString))); return wString; } @@ -780,11 +452,14 @@ Tcl_UtfToChar16DString( int Tcl_UtfCharComplete( - const char *src, /* String to check if first few bytes contain + CONST char *src, /* String to check if first few bytes contain * a complete UTF-8 character. */ - Tcl_Size length) /* Length of above string in bytes. */ + int length) /* Length of above string in bytes. */ { - return length >= complete[UCHAR(*src)]; + int ch; + + ch = *((unsigned char *) src); + return length >= totalBytes[ch]; } /* @@ -805,124 +480,59 @@ Tcl_UtfCharComplete( *--------------------------------------------------------------------------- */ -Tcl_Size -TclNumUtfChars( - const char *src, /* The UTF-8 string to measure. */ - Tcl_Size length) /* The length of the string in bytes, or - * negative value for strlen(src). */ -{ - Tcl_UniChar ch = 0; - Tcl_Size i = 0; - - if (length < 0) { - /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ - while ((*src != '\0') && (i < INT_MAX)) { - src += TclUtfToUniChar(src, &ch); - i++; - } - } else { - /* Will return value between 0 and length. No overflow checks. */ - - /* Pointer to the end of string. Never read endPtr[0] */ - const char *endPtr = src + length; - /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - 4; - - /* - * Optimize away the call in this loop. Justified because... - * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr) - * By initialization above (endPtr - optPtr) = TCL_UTF_MAX - * So (endPtr - src) >= TCL_UTF_MAX, and passing that to - * Tcl_UtfCharComplete we know will cause return of 1. - */ - while (src <= optPtr - /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { - src += TclUtfToUniChar(src, &ch); - i++; - } - /* Loop over the remaining string where call must happen */ - while (src < endPtr) { - if (Tcl_UtfCharComplete(src, endPtr - src)) { - src += TclUtfToUniChar(src, &ch); - } else { - /* - * src points to incomplete UTF-8 sequence - * Treat first byte as character and count it - */ - src++; - } - i++; - } - } - return i; -} - -#if !defined(TCL_NO_DEPRECATED) -Tcl_Size +int Tcl_NumUtfChars( - const char *src, /* The UTF-8 string to measure. */ - Tcl_Size length) /* The length of the string in bytes, or - * negative for strlen(src). */ + register CONST char *src, /* The UTF-8 string to measure. */ + int length) /* The length of the string in bytes, or -1 + * for strlen(string). */ { - unsigned short ch = 0; - Tcl_Size i = 0; + Tcl_UniChar ch; + register Tcl_UniChar *chPtr = &ch; + register int i; + + /* + * The separate implementations are faster. + * + * Since this is a time-sensitive function, we also do the check for the + * single-byte char case specially. + */ + i = 0; if (length < 0) { - /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ - while ((*src != '\0') && (i < INT_MAX)) { - src += Tcl_UtfToChar16(src, &ch); + while (*src != '\0') { + src += TclUtfToUniChar(src, chPtr); i++; } } else { - /* Will return value between 0 and length. No overflow checks. */ + register int n; - /* Pointer to the end of string. Never read endPtr[0] */ - const char *endPtr = src + length; - /* Pointer to last byte where optimization still can be used */ - const char *optPtr = endPtr - 4; - - /* - * Optimize away the call in this loop. Justified because... - * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr) - * By initialization above (endPtr - optPtr) = TCL_UTF_MAX - * So (endPtr - src) >= TCL_UTF_MAX, and passing that to - * Tcl_UtfCharComplete we know will cause return of 1. - */ - while (src <= optPtr - /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { - src += Tcl_UtfToChar16(src, &ch); - i++; - } - /* Loop over the remaining string where call must happen */ - while (src < endPtr) { - if (Tcl_UtfCharComplete(src, endPtr - src)) { - src += Tcl_UtfToChar16(src, &ch); - } else { - /* - * src points to incomplete UTF-8 sequence - * Treat first byte as character and count it - */ + while (length > 0) { + if (UCHAR(*src) < 0xC0) { + length--; src++; + } else { + n = Tcl_UtfToUniChar(src, chPtr); + length -= n; + src += n; } i++; } } return i; } -#endif - + /* *--------------------------------------------------------------------------- * * Tcl_UtfFindFirst -- * - * Returns a pointer to the first occurrence of the given Unicode character - * in the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the first occurance of the given Tcl_UniChar in + * the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). * * Results: - * As above. If the Unicode character does not exist in the given string, - * the return value is NULL. + * As above. If the Tcl_UniChar does not exist in the given string, the + * return value is NULL. * * Side effects: * None. @@ -930,14 +540,16 @@ Tcl_NumUtfChars( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfFindFirst( - const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Unicode character to search for. */ + CONST char *src, /* The UTF-8 string to be searched. */ + int ch) /* The Tcl_UniChar to search for. */ { - while (1) { - int find, len = TclUtfToUniChar(src, &find); + int len; + Tcl_UniChar find; + while (1) { + len = TclUtfToUniChar(src, &find); if (find == ch) { return src; } @@ -953,12 +565,12 @@ Tcl_UtfFindFirst( * * Tcl_UtfFindLast -- * - * Returns a pointer to the last occurrence of the given Unicode character - * in the NULL-terminated UTF-8 string. The NULL terminator is considered + * Returns a pointer to the last occurance of the given Tcl_UniChar in + * the NULL-terminated UTF-8 string. The NULL terminator is considered * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). * * Results: - * As above. If the Unicode character does not exist in the given string, the + * As above. If the Tcl_UniChar does not exist in the given string, the * return value is NULL. * * Side effects: @@ -967,16 +579,18 @@ Tcl_UtfFindFirst( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfFindLast( - const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Unicode character to search for. */ + CONST char *src, /* The UTF-8 string to be searched. */ + int ch) /* The Tcl_UniChar to search for. */ { - const char *last = NULL; + int len; + Tcl_UniChar find; + CONST char *last; + last = NULL; while (1) { - int find, len = TclUtfToUniChar(src, &find); - + len = TclUtfToUniChar(src, &find); if (find == ch) { last = src; } @@ -993,11 +607,9 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext - * returns a pointer to the next UTF-8 character in the string. - * The caller must not ask for the next character after the last - * character in the string if the string is not terminated by a null - * character. + * Given a pointer to some current location in a UTF-8 string, move + * forward one character. The caller must ensure that they are not asking + * for the next character after the last character in the string. * * Results: * The return value is the pointer to the next character in the UTF-8 @@ -1009,48 +621,13 @@ Tcl_UtfFindLast( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfNext( - const char *src) /* The current location in the string. */ + CONST char *src) /* The current location in the string. */ { - int left; - const char *next; - - if (((*src) & 0xC0) == 0x80) { - /* Continuation byte, so we start 'inside' a (possible valid) UTF-8 - * sequence. Since we are not allowed to access src[-1], we cannot - * check if the sequence is actually valid, the best we can do is - * just assume it is valid and locate the end. */ - if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) { - ++src; - } - return src; - } + Tcl_UniChar ch; - left = totalBytes[UCHAR(*src)]; - next = src + 1; - while (--left) { - if ((*next & 0xC0) != 0x80) { - /* - * src points to non-trail byte; We ran out of trail bytes - * before the needs of the lead byte were satisfied. - * Let the (malformed) lead byte alone be a character - */ - return src + 1; - } - next++; - } - /* - * Call Invalid() here only if required conditions are met: - * src[0] is known a lead byte. - * src[1] is known a trail byte. - * Especially important to prevent calls when src[0] == '\xF8' or '\xFC' - * See tests utf-6.37 through utf-6.43 through valgrind or similar tool. - */ - if ((next == src + 1) || Invalid(src)) { - return src + 1; - } - return next; + return src + TclUtfToUniChar(src, &ch); } /* @@ -1074,96 +651,34 @@ Tcl_UtfNext( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfPrev( - const char *src, /* A location in a UTF-8 string. */ - const char *start) /* Pointer to the beginning of the string */ + CONST char *src, /* The current location in the string. */ + CONST char *start) /* Pointer to the beginning of the string, to + * avoid going backwards too far. */ { - int trailBytesSeen = 0; /* How many trail bytes have been verified? */ - const char *fallback = src - 1; - /* If we cannot find a lead byte that might - * start a prefix of a valid UTF byte sequence, - * we will fallback to a one-byte back step */ - const char *look = fallback; - /* Start search at the fallback position */ - - /* Quick boundary case exit. */ - if (fallback <= start) { - return start; - } - - do { - unsigned char byte = UCHAR(look[0]); - + CONST char *look; + int i, byte; + + src--; + look = src; + for (i = 0; i < TCL_UTF_MAX; i++) { + if (look < start) { + if (src < start) { + src = start; + } + break; + } + byte = *((unsigned char *) look); if (byte < 0x80) { - /* - * Single byte character. Either this is a correct previous - * character, or it is followed by at least one trail byte - * which indicates a malformed sequence. In either case the - * correct result is to return the fallback. - */ - return fallback; + break; } if (byte >= 0xC0) { - /* Non-trail byte; May be multibyte lead. */ - - if ((trailBytesSeen == 0) - /* - * We've seen no trailing context to use to check - * anything. From what we know, this non-trail byte - * is a prefix of a previous character, and accepting - * it (the fallback) is correct. - */ - - || (trailBytesSeen >= totalBytes[byte])) { - /* - * That is, (1 + trailBytesSeen > needed). - * We've examined more bytes than needed to complete - * this lead byte. No matter about well-formedness or - * validity, the sequence starting with this lead byte - * will never include the fallback location, so we must - * return the fallback location. See test utf-7.17 - */ - return fallback; - } - - /* - * trailBytesSeen > 0, so we can examine look[1] safely. - * Use that capability to screen out invalid sequences. - */ - - if (Invalid(look)) { - /* Reject */ - return fallback; - } - return (const char *)look; + return look; } - - /* We saw a trail byte. */ - trailBytesSeen++; - - if ((const char *)look == start) { - /* - * Do not read before the start of the string - * - * If we get here, we've examined bytes at every location - * >= start and < src and all of them are trail bytes, - * including (*start). We need to return our fallback - * and exit this loop before we run past the start of the string. - */ - return fallback; - } - - /* Continue the search backwards... */ look--; - } while (trailBytesSeen < 4); - - /* - * We've seen 4 trail bytes, so we know there will not be a - * properly formed byte sequence to find, and we can stop looking, - * accepting the fallback. - */ - return fallback; + } + return src; } /* @@ -1183,27 +698,18 @@ Tcl_UtfPrev( *--------------------------------------------------------------------------- */ -int +Tcl_UniChar Tcl_UniCharAtIndex( - const char *src, /* The UTF-8 string to dereference. */ - Tcl_Size index) /* The position of the desired character. */ + register CONST char *src, /* The UTF-8 string to dereference. */ + register int index) /* The position of the desired character. */ { - unsigned short ch = 0; - int i = 0; + Tcl_UniChar ch; - if (index < 0) { - return -1; - } - while (index-- > 0) { - i = Tcl_UtfToChar16(src, &ch); - src += i; - } - if ((ch >= 0xD800) && (i < 3)) { - /* Index points at character following high Surrogate */ - return -1; + while (index >= 0) { + index--; + src += TclUtfToUniChar(src, &ch); } - TclUtfToUniChar(src, &i); - return i; + return ch; } /* @@ -1223,41 +729,20 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ -const char * -TclUtfAtIndex( - const char *src, /* The UTF-8 string. */ - Tcl_Size index) /* The position of the desired character. */ +CONST char * +Tcl_UtfAtIndex( + register CONST char *src, /* The UTF-8 string. */ + register int index) /* The position of the desired character. */ { - Tcl_UniChar ch = 0; + Tcl_UniChar ch; - while (index-- > 0) { + while (index > 0) { + index--; src += TclUtfToUniChar(src, &ch); } return src; } -#if !defined(TCL_NO_DEPRECATED) -const char * -Tcl_UtfAtIndex( - const char *src, /* The UTF-8 string. */ - Tcl_Size index) /* The position of the desired character. */ -{ - unsigned short ch = 0; - Tcl_Size len = 0; - - if (index > 0) { - while (index--) { - src += (len = Tcl_UtfToChar16(src, &ch)); - } - if ((ch >= 0xD800) && (len < 3)) { - /* Index points at character following high Surrogate */ - src += Tcl_UtfToChar16(src, &ch); - } - } - return src; -} -#endif - /* *--------------------------------------------------------------------------- * @@ -1267,7 +752,7 @@ Tcl_UtfAtIndex( * * Results: * Stores the bytes represented by the backslash sequence in dst and - * returns the number of bytes written to dst. At most 4 bytes + * returns the number of bytes written to dst. At most TCL_UTF_MAX bytes * are written to dst; dst must have been large enough to accept those * bytes. If readPtr isn't NULL then it is filled in with a count of the * number of bytes in the backslash sequence. @@ -1284,9 +769,9 @@ Tcl_UtfAtIndex( *--------------------------------------------------------------------------- */ -Tcl_Size +int Tcl_UtfBackslash( - const char *src, /* Points to the backslash character of a + CONST char *src, /* Points to the backslash character of a * backslash sequence. */ int *readPtr, /* Fill in with number of characters read from * src, unless NULL. */ @@ -1294,7 +779,7 @@ Tcl_UtfBackslash( * backslash sequence. */ { #define LINE_LENGTH 128 - Tcl_Size numRead; + int numRead; int result; result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); @@ -1303,7 +788,7 @@ Tcl_UtfBackslash( * We ate a whole line. Pay the price of a strlen() */ - result = TclParseBackslash(src, strlen(src), &numRead, dst); + result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); } if (readPtr != NULL) { *readPtr = numRead; @@ -1329,13 +814,13 @@ Tcl_UtfBackslash( *---------------------------------------------------------------------- */ -Tcl_Size +int Tcl_UtfToUpper( char *str) /* String to convert in place. */ { - int ch, upChar; + Tcl_UniChar ch, upChar; char *src, *dst; - Tcl_Size len; + int bytes; /* * Iterate over the string until we hit the terminating null. @@ -1343,7 +828,7 @@ Tcl_UtfToUpper( src = dst = str; while (*src) { - len = TclUtfToUniChar(src, &ch); + bytes = TclUtfToUniChar(src, &ch); upChar = Tcl_UniCharToUpper(ch); /* @@ -1352,13 +837,13 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if ((len < TclUtfCount(upChar)) || ((upChar & ~0x7FF) == 0xD800)) { - memmove(dst, src, len); - dst += len; + if (bytes < UtfCount(upChar)) { + memcpy(dst, src, (size_t) bytes); + dst += bytes; } else { dst += Tcl_UniCharToUtf(upChar, dst); } - src += len; + src += bytes; } *dst = '\0'; return (dst - str); @@ -1382,13 +867,13 @@ Tcl_UtfToUpper( *---------------------------------------------------------------------- */ -Tcl_Size +int Tcl_UtfToLower( char *str) /* String to convert in place. */ { - int ch, lowChar; + Tcl_UniChar ch, lowChar; char *src, *dst; - Tcl_Size len; + int bytes; /* * Iterate over the string until we hit the terminating null. @@ -1396,7 +881,7 @@ Tcl_UtfToLower( src = dst = str; while (*src) { - len = TclUtfToUniChar(src, &ch); + bytes = TclUtfToUniChar(src, &ch); lowChar = Tcl_UniCharToLower(ch); /* @@ -1405,13 +890,13 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) { - memmove(dst, src, len); - dst += len; + if (bytes < UtfCount(lowChar)) { + memcpy(dst, src, (size_t) bytes); + dst += bytes; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } - src += len; + src += bytes; } *dst = '\0'; return (dst - str); @@ -1436,13 +921,13 @@ Tcl_UtfToLower( *---------------------------------------------------------------------- */ -Tcl_Size +int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { - int ch, titleChar, lowChar; + Tcl_UniChar ch, titleChar, lowChar; char *src, *dst; - Tcl_Size len; + int bytes; /* * Capitalize the first character and then lowercase the rest of the @@ -1452,32 +937,28 @@ Tcl_UtfToTitle( src = dst = str; if (*src) { - len = TclUtfToUniChar(src, &ch); + bytes = TclUtfToUniChar(src, &ch); titleChar = Tcl_UniCharToTitle(ch); - if ((len < TclUtfCount(titleChar)) || ((titleChar & ~0x7FF) == 0xD800)) { - memmove(dst, src, len); - dst += len; + if (bytes < UtfCount(titleChar)) { + memcpy(dst, src, (size_t) bytes); + dst += bytes; } else { dst += Tcl_UniCharToUtf(titleChar, dst); } - src += len; + src += bytes; } while (*src) { - len = TclUtfToUniChar(src, &ch); - lowChar = ch; - /* Special exception for Georgian Asomtavruli chars, no titlecase. */ - if ((unsigned)(lowChar - 0x1C90) >= 0x30) { - lowChar = Tcl_UniCharToLower(lowChar); - } + bytes = TclUtfToUniChar(src, &ch); + lowChar = Tcl_UniCharToLower(ch); - if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) { - memmove(dst, src, len); - dst += len; + if (bytes < UtfCount(lowChar)) { + memcpy(dst, src, (size_t) bytes); + dst += bytes; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } - src += len; + src += bytes; } *dst = '\0'; return (dst - str); @@ -1502,8 +983,8 @@ Tcl_UtfToTitle( int TclpUtfNcmp2( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ unsigned long numBytes) /* Number of *bytes* to compare. */ { /* @@ -1512,7 +993,7 @@ TclpUtfNcmp2( * fine in the strcmp manner. */ - int result = 0; + register int result = 0; for ( ; numBytes != 0; numBytes--, cs++, ct++) { if (*cs != *ct) { @@ -1535,8 +1016,8 @@ TclpUtfNcmp2( * * Tcl_UtfNcmp -- * - * Compare at most numChars UTF-16 chars of string cs to string ct. Both cs - * and ct are assumed to be at least numChars UTF-16 chars long. + * Compare at most numChars UTF chars of string cs to string ct. Both cs + * and ct are assumed to be at least numChars UTF chars long. * * Results: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. @@ -1547,89 +1028,17 @@ TclpUtfNcmp2( *---------------------------------------------------------------------- */ -#if !defined(TCL_NO_DEPRECATED) int Tcl_UtfNcmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - unsigned long numChars) /* Number of UTF-16 chars to compare. */ -{ - unsigned short ch1 = 0, ch2 = 0; - - /* - * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the - * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001 - * (the byte 0x01.) - */ - - while (numChars-- > 0) { - /* - * n must be interpreted as UTF-16 chars, not bytes. This should be called - * only when both strings are of at least n UTF-16 chars long (no need for \0 - * check) - */ - - cs += Tcl_UtfToChar16(cs, &ch1); - ct += Tcl_UtfToChar16(ct, &ch2); - if (ch1 != ch2) { - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } - return (ch1 - ch2); - } - } - return 0; -} -#endif /* TCL_NO_DEPRECATED */ - -int -TclUtfNcmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - size_t numChars) /* Number of chars to compare. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. */ { - Tcl_UniChar ch1 = 0, ch2 = 0; + Tcl_UniChar ch1, ch2; /* * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the - * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001 - * (the byte 0x01.) - */ - - while (numChars-- > 0) { - /* - * n must be interpreted as chars, not bytes. This should be called - * only when both strings are of at least n chars long (no need for \0 - * check) - */ - - cs += TclUtfToUniChar(cs, &ch1); - ct += TclUtfToUniChar(ct, &ch2); - if (ch1 != ch2) { - return (ch1 - ch2); - } - } - return 0; -} - -int -TclUtfNmemcmp( - const void *csPtr, /* UTF string to compare to ct. */ - const void *ctPtr, /* UTF string cs is compared to. */ - size_t numChars) /* Number of chars to compare. */ -{ - Tcl_UniChar ch1 = 0, ch2 = 0; - const char *cs = (const char *)csPtr; - const char *ct = (const char *)ctPtr; - - /* - * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the - * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001 + * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001 * (the byte 0x01.) */ @@ -1654,8 +1063,8 @@ TclUtfNmemcmp( * * Tcl_UtfNcasecmp -- * - * Compare at most numChars UTF-16 chars of string cs to string ct case - * insensitive. Both cs and ct are assumed to be at least numChars UTF-16 + * Compare at most numChars UTF chars of string cs to string ct case + * insensitive. Both cs and ct are assumed to be at least numChars UTF * chars long. * * Results: @@ -1667,81 +1076,13 @@ TclUtfNmemcmp( *---------------------------------------------------------------------- */ -#if !defined(TCL_NO_DEPRECATED) int Tcl_UtfNcasecmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - unsigned long numChars) /* Number of UTF-16 chars to compare. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. */ { - unsigned short ch1 = 0, ch2 = 0; - - while (numChars-- > 0) { - /* - * n must be interpreted as UTF-16 chars, not bytes. - * This should be called only when both strings are of - * at least n UTF-16 chars long (no need for \0 check) - */ - cs += Tcl_UtfToChar16(cs, &ch1); - ct += Tcl_UtfToChar16(ct, &ch2); - if (ch1 != ch2) { - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } - ch1 = Tcl_UniCharToLower(ch1); - ch2 = Tcl_UniCharToLower(ch2); - if (ch1 != ch2) { - return (ch1 - ch2); - } - } - } - return 0; -} -#endif /* TCL_NO_DEPRECATED */ - - -int -TclUtfNcasecmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - size_t numChars) /* Number of chars to compare. */ -{ - Tcl_UniChar ch1 = 0, ch2 = 0; - - while (numChars-- > 0) { - /* - * n must be interpreted as chars, not bytes. - * This should be called only when both strings are of - * at least n chars long (no need for \0 check) - */ - cs += TclUtfToUniChar(cs, &ch1); - ct += TclUtfToUniChar(ct, &ch2); - if (ch1 != ch2) { - ch1 = Tcl_UniCharToLower(ch1); - ch2 = Tcl_UniCharToLower(ch2); - if (ch1 != ch2) { - return (ch1 - ch2); - } - } - } - return 0; -} - -int -TclUtfNcasememcmp( - const void *csPtr, /* UTF string to compare to ct. */ - const void *ctPtr, /* UTF string cs is compared to. */ - size_t numChars) /* Number of chars to compare. */ -{ - const char *cs = (const char *)csPtr; - const char *ct = (const char *)ctPtr; - Tcl_UniChar ch1 = 0, ch2 = 0; - + Tcl_UniChar ch1, ch2; while (numChars-- > 0) { /* * n must be interpreted as chars, not bytes. @@ -1760,47 +1101,11 @@ TclUtfNcasememcmp( } return 0; } - -/* - *---------------------------------------------------------------------- - * - * Tcl_UtfCmp -- - * - * Compare UTF chars of string cs to string ct case sensitively. - * Replacement for strcmp in Tcl core, in places where UTF-8 should - * be handled. - * - * Results: - * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -TclUtfCmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct) /* UTF string cs is compared to. */ -{ - Tcl_UniChar ch1 = 0, ch2 = 0; - - while (*cs && *ct) { - cs += TclUtfToUniChar(cs, &ch1); - ct += TclUtfToUniChar(ct, &ch2); - if (ch1 != ch2) { - return ch1 - ch2; - } - } - return UCHAR(*cs) - UCHAR(*ct); -} - /* *---------------------------------------------------------------------- * - * TclUtfCasecmp -- + * Tcl_UtfNcasecmp -- * * Compare UTF chars of string cs to string ct case insensitively. * Replacement for strcasecmp in Tcl core, in places where UTF-8 should @@ -1817,12 +1122,12 @@ TclUtfCmp( int TclUtfCasecmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct) /* UTF string cs is compared to. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct) /* UTF string cs is compared to. */ { - Tcl_UniChar ch1 = 0, ch2 = 0; - while (*cs && *ct) { + Tcl_UniChar ch1, ch2; + cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { @@ -1853,19 +1158,16 @@ TclUtfCasecmp( *---------------------------------------------------------------------- */ -int +Tcl_UniChar Tcl_UniCharToUpper( int ch) /* Unicode character to convert. */ { - if (!UNICODE_OUT_OF_RANGE(ch)) { - int info = GetUniCharInfo(ch); + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x04) { - ch -= GetDelta(info); - } + if (GetCaseType(info) & 0x04) { + ch -= GetDelta(info); } - /* Clear away extension bits, if any */ - return ch & 0x1FFFFF; + return (Tcl_UniChar) ch; } /* @@ -1884,22 +1186,18 @@ Tcl_UniCharToUpper( *---------------------------------------------------------------------- */ -int +Tcl_UniChar Tcl_UniCharToLower( int ch) /* Unicode character to convert. */ { - if (!UNICODE_OUT_OF_RANGE(ch)) { - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); + int info = GetUniCharInfo(ch); - if ((mode & 0x02) && (mode != 0x7)) { - ch += GetDelta(info); - } + if (GetCaseType(info) & 0x02) { + ch += GetDelta(info); } - /* Clear away extension bits, if any */ - return ch & 0x1FFFFF; + return (Tcl_UniChar) ch; } - + /* *---------------------------------------------------------------------- * @@ -1916,63 +1214,28 @@ Tcl_UniCharToLower( *---------------------------------------------------------------------- */ -int +Tcl_UniChar Tcl_UniCharToTitle( int ch) /* Unicode character to convert. */ { - if (!UNICODE_OUT_OF_RANGE(ch)) { - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); - if (mode & 0x1) { - /* - * Subtract or add one depending on the original case. - */ + if (mode & 0x1) { + /* + * Subtract or add one depending on the original case. + */ - if (mode != 0x7) { - ch += ((mode & 0x4) ? -1 : 1); - } - } else if (mode == 0x4) { - ch -= GetDelta(info); - } + ch += ((mode & 0x4) ? -1 : 1); + } else if (mode == 0x4) { + ch -= GetDelta(info); } - /* Clear away extension bits, if any */ - return ch & 0x1FFFFF; + return (Tcl_UniChar) ch; } /* *---------------------------------------------------------------------- * - * Tcl_Char16Len -- - * - * Find the length of a UniChar string. The str input must be null - * terminated. - * - * Results: - * Returns the length of str in UniChars (not bytes). - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -Tcl_Size -Tcl_Char16Len( - const unsigned short *uniStr) /* Unicode string to find length of. */ -{ - Tcl_Size len = 0; - - while (*uniStr != '\0') { - len++; - uniStr++; - } - return len; -} - -/* - *---------------------------------------------------------------------- - * * Tcl_UniCharLen -- * * Find the length of a UniChar string. The str input must be null @@ -1987,12 +1250,11 @@ Tcl_Char16Len( *---------------------------------------------------------------------- */ -#undef Tcl_UniCharLen -Tcl_Size +int Tcl_UniCharLen( - const int *uniStr) /* Unicode string to find length of. */ + CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */ { - Tcl_Size len = 0; + int len = 0; while (*uniStr != '\0') { len++; @@ -2006,8 +1268,8 @@ Tcl_UniCharLen( * * Tcl_UniCharNcmp -- * - * Compare at most numChars chars (not bytes) of string ucs to string uct. - * Both ucs and uct are assumed to be at least numChars chars long. + * Compare at most numChars unichars of string ucs to string uct. + * Both ucs and uct are assumed to be at least numChars unichars long. * * Results: * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. @@ -2019,69 +1281,12 @@ Tcl_UniCharLen( */ int -TclUniCharNcmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ - size_t numChars) /* Number of chars to compare. */ -{ -#if defined(WORDS_BIGENDIAN) - /* - * We are definitely on a big-endian machine; memcmp() is safe - */ - - return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); - -#else /* !WORDS_BIGENDIAN */ - /* - * We can't simply call memcmp() because that is not lexically correct. - */ - - for ( ; numChars != 0; ucs++, uct++, numChars--) { - if (*ucs != *uct) { - return (*ucs - *uct); - } - } - return 0; -#endif /* WORDS_BIGENDIAN */ -} - -int -TclUniCharNmemcmp( - const void *ucsPtr, /* Unicode string to compare to uct. */ - const void *uctPtr, /* Unicode string ucs is compared to. */ - size_t numChars) /* Number of chars (not bytes) to compare. */ -{ - const Tcl_UniChar *ucs = (const Tcl_UniChar *)ucsPtr; - const Tcl_UniChar *uct = (const Tcl_UniChar *)uctPtr; -#if defined(WORDS_BIGENDIAN) - /* - * We are definitely on a big-endian machine; memcmp() is safe - */ - - return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); - -#else /* !WORDS_BIGENDIAN */ - /* - * We can't simply call memcmp() because that is not lexically correct. - */ - - for ( ; numChars != 0; ucs++, uct++, numChars--) { - if (*ucs != *uct) { - return (*ucs - *uct); - } - } - return 0; -#endif /* WORDS_BIGENDIAN */ -} - -#if !defined(TCL_NO_DEPRECATED) -int Tcl_UniCharNcmp( - const unsigned short *ucs, /* Unicode string to compare to uct. */ - const unsigned short *uct, /* Unicode string ucs is compared to. */ - unsigned long numChars) /* Number of chars (not bytes) to compare. */ + CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ { -#if defined(WORDS_BIGENDIAN) +#ifdef WORDS_BIGENDIAN /* * We are definitely on a big-endian machine; memcmp() is safe */ @@ -2095,27 +1300,21 @@ Tcl_UniCharNcmp( for ( ; numChars != 0; ucs++, uct++, numChars--) { if (*ucs != *uct) { - /* special case for handling upper surrogates */ - if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) { - return 1; - } else if (((*uct & 0xFC00) == 0xD800)) { - return -1; - } return (*ucs - *uct); } } return 0; #endif /* WORDS_BIGENDIAN */ } -#endif + /* *---------------------------------------------------------------------- * * Tcl_UniCharNcasecmp -- * - * Compare at most numChars chars (not bytes) of string ucs to string uct case + * Compare at most numChars unichars of string ucs to string uct case * insensitive. Both ucs and uct are assumed to be at least numChars - * chars long. + * unichars long. * * Results: * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. @@ -2127,32 +1326,11 @@ Tcl_UniCharNcmp( */ int -TclUniCharNcasecmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ - size_t numChars) /* Number of chars to compare. */ -{ - for ( ; numChars != 0; numChars--, ucs++, uct++) { - if (*ucs != *uct) { - Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); - Tcl_UniChar lct = Tcl_UniCharToLower(*uct); - - if (lcs != lct) { - return (lcs - lct); - } - } - } - return 0; -} - -int -TclUniCharNcasememcmp( - const void *ucsPtr, /* Unicode string to compare to uct. */ - const void *uctPtr, /* Unicode string ucs is compared to. */ - size_t numChars) /* Number of chars (not bytes) to compare. */ +Tcl_UniCharNcasecmp( + CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ { - const Tcl_UniChar *ucs = (const Tcl_UniChar *)ucsPtr; - const Tcl_UniChar *uct = (const Tcl_UniChar *)uctPtr; for ( ; numChars != 0; numChars--, ucs++, uct++) { if (*ucs != *uct) { Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); @@ -2165,33 +1343,6 @@ TclUniCharNcasememcmp( } return 0; } - -#if !defined(TCL_NO_DEPRECATED) -int -Tcl_UniCharNcasecmp( - const unsigned short *ucs, /* Unicode string to compare to uct. */ - const unsigned short *uct, /* Unicode string ucs is compared to. */ - unsigned long numChars) /* Number of chars (not bytes) to compare. */ -{ - for ( ; numChars != 0; numChars--, ucs++, uct++) { - if (*ucs != *uct) { - unsigned short lcs = Tcl_UniCharToLower(*ucs); - unsigned short lct = Tcl_UniCharToLower(*uct); - - if (lcs != lct) { - /* special case for handling upper surrogates */ - if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) { - return 1; - } else if (((lct & 0xFC00) == 0xD800)) { - return -1; - } - return (lcs - lct); - } - } - } - return 0; -} -#endif /* *---------------------------------------------------------------------- @@ -2213,9 +1364,6 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } @@ -2239,9 +1387,6 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } return ((ALPHA_BITS >> GetCategory(ch)) & 1); } @@ -2265,11 +1410,6 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - /* Clear away extension bits, if any */ - ch &= 0x1FFFFF; - return ((ch == 0xE0001) || ((unsigned)(ch - 0xE0020) <= 0x5F)); - } return ((CONTROL_BITS >> GetCategory(ch)) & 1); } @@ -2293,9 +1433,6 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } @@ -2319,9 +1456,6 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF); - } return ((GRAPH_BITS >> GetCategory(ch)) & 1); } @@ -2345,9 +1479,6 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } return (GetCategory(ch) == LOWERCASE_LETTER); } @@ -2371,9 +1502,6 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF); - } return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } @@ -2397,9 +1525,6 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } return ((PUNCT_BITS >> GetCategory(ch)) & 1); } @@ -2423,20 +1548,14 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { - /* Ignore upper 11 bits. */ - ch &= 0x1FFFFF; - /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (ch < 0x80) { - return TclIsSpaceProcM((char) ch); - } else if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B - || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) { + if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { + return TclIsSpaceProc((char) ch); + } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); @@ -2463,9 +1582,6 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } return (GetCategory(ch) == UPPERCASE_LETTER); } @@ -2489,9 +1605,6 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } return ((WORD_BITS >> GetCategory(ch)) & 1); } @@ -2519,182 +1632,14 @@ Tcl_UniCharIsWordChar( */ int -TclUniCharCaseMatch( - const Tcl_UniChar *uniStr, /* Unicode String. */ - const Tcl_UniChar *uniPattern, - /* Pattern, which may contain special - * characters. */ - int nocase) /* 0 for case sensitive, 1 for insensitive */ -{ - Tcl_UniChar ch1 = 0, p; - - while (1) { - p = *uniPattern; - - /* - * See if we're at the end of both the pattern and the string. If so, - * we succeeded. If we're at the end of the pattern but not at the end - * of the string, we failed. - */ - - if (p == 0) { - return (*uniStr == 0); - } - if ((*uniStr == 0) && (p != '*')) { - return 0; - } - - /* - * Check for a "*" as the next pattern character. It matches any - * substring. We handle this by skipping all the characters up to the - * next matching one in the pattern, and then calling ourselves - * recursively for each postfix of string, until either we match or we - * reach the end of the string. - */ - - if (p == '*') { - /* - * Skip all successive *'s in the pattern - */ - - while (*(++uniPattern) == '*') { - /* empty body */ - } - p = *uniPattern; - if (p == 0) { - return 1; - } - if (nocase) { - p = Tcl_UniCharToLower(p); - } - while (1) { - /* - * Optimization for matching - cruise through the string - * quickly if the next char in the pattern isn't a special - * character - */ - - if ((p != '[') && (p != '?') && (p != '\\')) { - if (nocase) { - while (*uniStr && (p != *uniStr) - && (p != Tcl_UniCharToLower(*uniStr))) { - uniStr++; - } - } else { - while (*uniStr && (p != *uniStr)) { - uniStr++; - } - } - } - if (TclUniCharCaseMatch(uniStr, uniPattern, nocase)) { - return 1; - } - if (*uniStr == 0) { - return 0; - } - uniStr++; - } - } - - /* - * Check for a "?" as the next pattern character. It matches any - * single character. - */ - - if (p == '?') { - uniPattern++; - uniStr++; - continue; - } - - /* - * Check for a "[" as the next pattern character. It is followed by a - * list of characters that are acceptable, or by a range (two - * characters separated by "-"). - */ - - if (p == '[') { - Tcl_UniChar startChar, endChar; - - uniPattern++; - ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); - uniStr++; - while (1) { - if ((*uniPattern == ']') || (*uniPattern == 0)) { - return 0; - } - startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) - : *uniPattern); - uniPattern++; - if (*uniPattern == '-') { - uniPattern++; - if (*uniPattern == 0) { - return 0; - } - endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) - : *uniPattern); - uniPattern++; - if (((startChar <= ch1) && (ch1 <= endChar)) - || ((endChar <= ch1) && (ch1 <= startChar))) { - /* - * Matches ranges of form [a-z] or [z-a]. - */ - break; - } - } else if (startChar == ch1) { - break; - } - } - while (*uniPattern != ']') { - if (*uniPattern == 0) { - uniPattern--; - break; - } - uniPattern++; - } - uniPattern++; - continue; - } - - /* - * If the next pattern character is '\', just strip off the '\' so we - * do exact matching on the character that follows. - */ - - if (p == '\\') { - if (*(++uniPattern) == '\0') { - return 0; - } - } - - /* - * There's no special character. Just make sure that the next bytes of - * each string match. - */ - - if (nocase) { - if (Tcl_UniCharToLower(*uniStr) != - Tcl_UniCharToLower(*uniPattern)) { - return 0; - } - } else if (*uniStr != *uniPattern) { - return 0; - } - uniStr++; - uniPattern++; - } -} - -#if !defined(TCL_NO_DEPRECATED) -int Tcl_UniCharCaseMatch( - const unsigned short *uniStr, /* Unicode String. */ - const unsigned short *uniPattern, + CONST Tcl_UniChar *uniStr, /* Unicode String. */ + CONST Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - unsigned short ch1 = 0, p; + Tcl_UniChar ch1, p; while (1) { p = *uniPattern; @@ -2782,7 +1727,7 @@ Tcl_UniCharCaseMatch( */ if (p == '[') { - unsigned short startChar, endChar; + Tcl_UniChar startChar, endChar; uniPattern++; ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); @@ -2852,7 +1797,6 @@ Tcl_UniCharCaseMatch( uniPattern++; } } -#endif /* *---------------------------------------------------------------------- @@ -2877,14 +1821,14 @@ Tcl_UniCharCaseMatch( int TclUniCharMatch( - const Tcl_UniChar *string, /* Unicode String. */ - Tcl_Size strLen, /* Length of String */ - const Tcl_UniChar *pattern, /* Pattern, which may contain special + CONST Tcl_UniChar *string, /* Unicode String. */ + int strLen, /* Length of String */ + CONST Tcl_UniChar *pattern, /* Pattern, which may contain special * characters. */ - Tcl_Size ptnLen, /* Length of Pattern */ + int ptnLen, /* Length of Pattern */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - const Tcl_UniChar *stringEnd, *patternEnd; + CONST Tcl_UniChar *stringEnd, *patternEnd; Tcl_UniChar p; stringEnd = string + strLen; |
