diff options
author | William Joye <wjoye@cfa.harvard.edu> | 2018-11-26 20:06:09 (GMT) |
---|---|---|
committer | William Joye <wjoye@cfa.harvard.edu> | 2018-11-26 20:06:09 (GMT) |
commit | 3dcee315fb784599a02aaafe3a83cfea0c1d1fe9 (patch) | |
tree | 894dd8bc12dda8554293a6c5f4fb7eae06151f08 /tcl8.6/generic/tclUtf.c | |
parent | 36aac058541aebc86ae5f19a7e59fa892ac81cf1 (diff) | |
download | blt-3dcee315fb784599a02aaafe3a83cfea0c1d1fe9.zip blt-3dcee315fb784599a02aaafe3a83cfea0c1d1fe9.tar.gz blt-3dcee315fb784599a02aaafe3a83cfea0c1d1fe9.tar.bz2 |
update tcl/tk
Diffstat (limited to 'tcl8.6/generic/tclUtf.c')
-rw-r--r-- | tcl8.6/generic/tclUtf.c | 2119 |
1 files changed, 0 insertions, 2119 deletions
diff --git a/tcl8.6/generic/tclUtf.c b/tcl8.6/generic/tclUtf.c deleted file mode 100644 index c60e99e..0000000 --- a/tcl8.6/generic/tclUtf.c +++ /dev/null @@ -1,2119 +0,0 @@ -/* - * tclUtf.c -- - * - * Routines for manipulating UTF-8 strings. - * - * Copyright (c) 1997-1998 Sun Microsystems, Inc. - * - * See the file "license.terms" for information on usage and redistribution of - * this file, and for a DISCLAIMER OF ALL WARRANTIES. - */ - -#include "tclInt.h" - -/* - * Include the static character classification tables and macros. - */ - -#include "tclUniData.c" - -/* - * The following macros are used for fast character category tests. The x_BITS - * values are shifted right by the category value to determine whether the - * given category is included in the set. - */ - -#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ - | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) - -#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE)) - -#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) - -#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ - | (1 << PARAGRAPH_SEPARATOR)) - -#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION)) - -#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ - (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ - (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ - (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) - -#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \ - (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ - (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ - (1 << OTHER_NUMBER) | \ - (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ - (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) - -/* - * Unicode characters less than this value are represented by themselves in - * UTF-8 strings. - */ - -#define UNICODE_SELF 0x80 - -/* - * The following structures are used when mapping between Unicode (UCS-2) and - * UTF-8. - */ - -static const unsigned char totalBytes[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4,4,4,4, -#else - 1,1,1,1,1,1,1,1, -#endif - 1,1,1,1,1,1,1,1 -}; - -/* - * Functions used only in this module. - */ - -static int UtfCount(int ch); - -/* - *--------------------------------------------------------------------------- - * - * UtfCount -- - * - * Find the number of bytes in the Utf character "ch". - * - * Results: - * The return values is the number of bytes in the Utf character "ch". - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -static inline int -UtfCount( - int ch) /* The Unicode character whose size is returned. */ -{ - if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { - return 1; - } - if (ch <= 0x7FF) { - return 2; - } -#if TCL_UTF_MAX > 3 - if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) { - return 4; - } -#endif - return 3; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UniCharToUtf -- - * - * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the - * provided buffer. Equivalent to Plan 9 runetochar(). - * - * Results: - * The return values is the number of bytes in the buffer that were - * consumed. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -int -Tcl_UniCharToUtf( - int ch, /* The Tcl_UniChar to be stored in the - * buffer. */ - char *buf) /* Buffer in which the UTF-8 representation of - * the Tcl_UniChar is stored. Buffer must be - * large enough to hold the UTF-8 character - * (at most TCL_UTF_MAX bytes). */ -{ - if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { - buf[0] = (char) ch; - return 1; - } - if (ch >= 0) { - if (ch <= 0x7FF) { - buf[1] = (char) ((ch | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 6) | 0xC0); - return 2; - } - if (ch <= 0xFFFF) { -#if TCL_UTF_MAX == 4 - if ((ch & 0xF800) == 0xD800) { - if (ch & 0x0400) { - /* Low surrogate */ - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); - return 4; - } else { - /* High surrogate */ - ch += 0x40; - buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); - buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); - return 0; - } - } -#endif - goto three; - } - -#if TCL_UTF_MAX > 3 - if (ch <= 0x10FFFF) { - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 18) | 0xF0); - return 4; - } -#endif - } - - ch = 0xFFFD; -three: - buf[2] = (char) ((ch | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 12) | 0xE0); - return 3; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UniCharToUtfDString -- - * - * Convert the given Unicode string to UTF-8. - * - * Results: - * The return value is a pointer to the UTF-8 representation of the - * Unicode string. Storage for the return value is appended to the end of - * dsPtr. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -char * -Tcl_UniCharToUtfDString( - const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ - int uniLength, /* Length of Unicode string in Tcl_UniChars - * (must be >= 0). */ - Tcl_DString *dsPtr) /* UTF-8 representation of string is appended - * to this previously initialized DString. */ -{ - const Tcl_UniChar *w, *wEnd; - char *p, *string; - int oldLength; - - /* - * UTF-8 string length in bytes will be <= Unicode string length * - * TCL_UTF_MAX. - */ - - oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); - string = Tcl_DStringValue(dsPtr) + oldLength; - - p = string; - wEnd = uniStr + uniLength; - for (w = uniStr; w < wEnd; ) { - p += Tcl_UniCharToUtf(*w, p); - w++; - } - Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); - - return string; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfToUniChar -- - * - * Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8 - * sequences are converted to valid Tcl_UniChars and processing - * continues. Equivalent to Plan 9 chartorune(). - * - * The caller must ensure that the source buffer is long enough that this - * routine does not run off the end and dereference non-existent memory - * looking for trail bytes. If the source buffer is known to be '\0' - * terminated, this cannot happen. Otherwise, the caller should call - * Tcl_UtfCharComplete() before calling this routine to ensure that - * enough bytes remain in the string. - * - * If TCL_UTF_MAX == 4, special handling of Surrogate pairs is done: - * For any UTF-8 string containing a character outside of the BMP, the - * first call to this function will fill *chPtr with the high surrogate - * and generate a return value of 0. Calling Tcl_UtfToUniChar again - * will produce the low surrogate and a return value of 4. Because *chPtr - * is used to remember whether the high surrogate is already produced, it - * is recommended to initialize the variable it points to as 0 before - * the first call to Tcl_UtfToUniChar is done. - * - * Results: - * *chPtr is filled with the Tcl_UniChar, and the return value is the - * number of bytes from the UTF-8 string that were consumed. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -int -Tcl_UtfToUniChar( - register const char *src, /* The UTF-8 string. */ - register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by - * the UTF-8 string. */ -{ - register int byte; - - /* - * Unroll 1 to 3 (or 4) byte UTF-8 sequences. - */ - - byte = *((unsigned char *) src); - if (byte < 0xC0) { - /* - * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid - * characters representing themselves. - */ - - *chPtr = (Tcl_UniChar) byte; - return 1; - } else if (byte < 0xE0) { - if ((src[1] & 0xC0) == 0x80) { - /* - * Two-byte-character lead-byte followed by a trail-byte. - */ - - *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); - if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { - return 2; - } - } - - /* - * A two-byte-character lead-byte not followed by trail-byte - * represents itself. - */ - } else if (byte < 0xF0) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { - /* - * Three-byte-character lead byte followed by two trail bytes. - */ - - *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) - | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); - if (*chPtr > 0x7FF) { - return 3; - } - } - - /* - * A three-byte-character lead-byte not followed by two trail-bytes - * represents itself. - */ - } -#if TCL_UTF_MAX > 3 - else if (byte < 0xF8) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { - /* - * Four-byte-character lead byte followed by three trail bytes. - */ -#if TCL_UTF_MAX == 4 - Tcl_UniChar surrogate; - - byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; - surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10)); - if (byte & 0x100000) { - /* out of range, < 0x10000 or > 0x10ffff */ - } else if (*chPtr != surrogate) { - /* produce high surrogate, but don't advance source pointer */ - *chPtr = surrogate; - return 0; - } else { - /* produce low surrogate, and advance source pointer */ - *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF)); - return 4; - } -#else - *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); - if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { - return 4; - } -#endif - } - - /* - * A four-byte-character lead-byte not followed by two trail-bytes - * represents itself. - */ - } -#endif - - *chPtr = (Tcl_UniChar) byte; - return 1; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfToUniCharDString -- - * - * Convert the UTF-8 string to Unicode. - * - * Results: - * The return value is a pointer to the Unicode representation of the - * UTF-8 string. Storage for the return value is appended to the end of - * dsPtr. The Unicode string is terminated with a Unicode NULL character. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -Tcl_UniChar * -Tcl_UtfToUniCharDString( - const char *src, /* UTF-8 string to convert to Unicode. */ - int length, /* Length of UTF-8 string in bytes, or -1 for - * strlen(). */ - Tcl_DString *dsPtr) /* Unicode representation of string is - * appended to this previously initialized - * DString. */ -{ - Tcl_UniChar ch = 0, *w, *wString; - const char *p, *end; - int oldLength; - - if (length < 0) { - length = strlen(src); - } - - /* - * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in - * bytes. - */ - - oldLength = Tcl_DStringLength(dsPtr); -/* TODO: fix overreach! */ - Tcl_DStringSetLength(dsPtr, - (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); - wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); - - w = wString; - end = src + length; - for (p = src; p < end; ) { - p += TclUtfToUniChar(p, &ch); - *w++ = ch; - } - *w = '\0'; - Tcl_DStringSetLength(dsPtr, - (oldLength + ((char *) w - (char *) wString))); - - return wString; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfCharComplete -- - * - * Determine if the UTF-8 string of the given length is long enough to be - * decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8 - * string is properly formed. Equivalent to Plan 9 fullrune(). - * - * Results: - * The return value is 0 if the string is not long enough, non-zero - * otherwise. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -int -Tcl_UtfCharComplete( - const char *src, /* String to check if first few bytes contain - * a complete UTF-8 character. */ - int length) /* Length of above string in bytes. */ -{ - return length >= totalBytes[(unsigned char)*src]; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_NumUtfChars -- - * - * Returns the number of characters (not bytes) in the UTF-8 string, not - * including the terminating NULL byte. This is equivalent to Plan 9 - * utflen() and utfnlen(). - * - * Results: - * As above. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -int -Tcl_NumUtfChars( - register const char *src, /* The UTF-8 string to measure. */ - int length) /* The length of the string in bytes, or -1 - * for strlen(string). */ -{ - Tcl_UniChar ch = 0; - register int i = 0; - - /* - * The separate implementations are faster. - * - * Since this is a time-sensitive function, we also do the check for the - * single-byte char case specially. - */ - - if (length < 0) { - while (*src != '\0') { - src += TclUtfToUniChar(src, &ch); - i++; - } - if (i < 0) i = INT_MAX; /* Bug [2738427] */ - } else { - register const char *endPtr = src + length - TCL_UTF_MAX; - - while (src < endPtr) { - src += TclUtfToUniChar(src, &ch); - i++; - } - endPtr += TCL_UTF_MAX; - while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - src += TclUtfToUniChar(src, &ch); - i++; - } - if (src < endPtr) { - i += endPtr - src; - } - } - return i; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfFindFirst -- - * - * Returns a pointer to the first occurance of the given Unicode character - * in the NULL-terminated UTF-8 string. The NULL terminator is considered - * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). - * - * Results: - * As above. If the Unicode character does not exist in the given string, - * the return value is NULL. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -const char * -Tcl_UtfFindFirst( - const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Unicode character to search for. */ -{ - int len, fullchar; - Tcl_UniChar find = 0; - - while (1) { - len = TclUtfToUniChar(src, &find); - fullchar = find; -#if TCL_UTF_MAX == 4 - if (!len) { - len += TclUtfToUniChar(src, &find); - fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; - } -#endif - if (fullchar == ch) { - return src; - } - if (*src == '\0') { - return NULL; - } - src += len; - } -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfFindLast -- - * - * Returns a pointer to the last occurance of the given Unicode character - * in the NULL-terminated UTF-8 string. The NULL terminator is considered - * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). - * - * Results: - * As above. If the Unicode character does not exist in the given string, the - * return value is NULL. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -const char * -Tcl_UtfFindLast( - const char *src, /* The UTF-8 string to be searched. */ - int ch) /* The Unicode character to search for. */ -{ - int len, fullchar; - Tcl_UniChar find = 0; - const char *last; - - last = NULL; - while (1) { - len = TclUtfToUniChar(src, &find); - fullchar = find; -#if TCL_UTF_MAX == 4 - if (!len) { - len += TclUtfToUniChar(src, &find); - fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; - } -#endif - if (fullchar == ch) { - last = src; - } - if (*src == '\0') { - break; - } - src += len; - } - return last; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfNext -- - * - * Given a pointer to some current location in a UTF-8 string, move - * forward one character. The caller must ensure that they are not asking - * for the next character after the last character in the string. - * - * Results: - * The return value is the pointer to the next character in the UTF-8 - * string. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -const char * -Tcl_UtfNext( - const char *src) /* The current location in the string. */ -{ - Tcl_UniChar ch = 0; - int len = TclUtfToUniChar(src, &ch); - -#if TCL_UTF_MAX == 4 - if (len == 0) { - len = TclUtfToUniChar(src, &ch); - } -#endif - return src + len; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfPrev -- - * - * Given a pointer to some current location in a UTF-8 string, move - * backwards one character. This works correctly when the pointer is in - * the middle of a UTF-8 character. - * - * Results: - * The return value is a pointer to the previous character in the UTF-8 - * string. If the current location was already at the beginning of the - * string, the return value will also be a pointer to the beginning of - * the string. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -const char * -Tcl_UtfPrev( - const char *src, /* The current location in the string. */ - const char *start) /* Pointer to the beginning of the string, to - * avoid going backwards too far. */ -{ - const char *look; - int i, byte; - - look = --src; - for (i = 0; i < TCL_UTF_MAX; i++) { - if (look < start) { - if (src < start) { - src = start; - } - break; - } - byte = *((unsigned char *) look); - if (byte < 0x80) { - break; - } - if (byte >= 0xC0) { - return look; - } - look--; - } - return src; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UniCharAtIndex -- - * - * Returns the Tcl_UniChar represented at the specified character - * (not byte) position in the UTF-8 string. - * - * Results: - * As above. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -Tcl_UniChar -Tcl_UniCharAtIndex( - register const char *src, /* The UTF-8 string to dereference. */ - register int index) /* The position of the desired character. */ -{ - Tcl_UniChar ch = 0; - - while (index >= 0) { - index--; - src += TclUtfToUniChar(src, &ch); - } - return ch; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfAtIndex -- - * - * Returns a pointer to the specified character (not byte) position in - * the UTF-8 string. - * - * Results: - * As above. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -const char * -Tcl_UtfAtIndex( - register const char *src, /* The UTF-8 string. */ - register int index) /* The position of the desired character. */ -{ - Tcl_UniChar ch = 0; - - while (index > 0) { - index--; - src += TclUtfToUniChar(src, &ch); - } - return src; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfBackslash -- - * - * Figure out how to handle a backslash sequence. - * - * Results: - * Stores the bytes represented by the backslash sequence in dst and - * returns the number of bytes written to dst. At most TCL_UTF_MAX bytes - * are written to dst; dst must have been large enough to accept those - * bytes. If readPtr isn't NULL then it is filled in with a count of the - * number of bytes in the backslash sequence. - * - * Side effects: - * The maximum number of bytes it takes to represent a Unicode character - * in UTF-8 is guaranteed to be less than the number of bytes used to - * express the backslash sequence that represents that Unicode character. - * If the target buffer into which the caller is going to store the bytes - * that represent the Unicode character is at least as large as the - * source buffer from which the backslashed sequence was extracted, no - * buffer overruns should occur. - * - *--------------------------------------------------------------------------- - */ - -int -Tcl_UtfBackslash( - const char *src, /* Points to the backslash character of a - * backslash sequence. */ - int *readPtr, /* Fill in with number of characters read from - * src, unless NULL. */ - char *dst) /* Filled with the bytes represented by the - * backslash sequence. */ -{ -#define LINE_LENGTH 128 - int numRead; - int result; - - result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); - if (numRead == LINE_LENGTH) { - /* - * We ate a whole line. Pay the price of a strlen() - */ - - result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); - } - if (readPtr != NULL) { - *readPtr = numRead; - } - return result; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UtfToUpper -- - * - * Convert lowercase characters to uppercase characters in a UTF string - * in place. The conversion may shrink the UTF string. - * - * Results: - * Returns the number of bytes in the resulting string excluding the - * trailing null. - * - * Side effects: - * Writes a terminating null after the last converted character. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UtfToUpper( - char *str) /* String to convert in place. */ -{ - Tcl_UniChar ch = 0, upChar; - char *src, *dst; - int bytes; - - /* - * Iterate over the string until we hit the terminating null. - */ - - src = dst = str; - while (*src) { - bytes = TclUtfToUniChar(src, &ch); - upChar = Tcl_UniCharToUpper(ch); - - /* - * To keep badly formed Utf strings from getting inflated by the - * conversion (thereby causing a segfault), only copy the upper case - * char to dst if its size is <= the original char. - */ - - if (bytes < UtfCount(upChar)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; - } else { - dst += Tcl_UniCharToUtf(upChar, dst); - } - src += bytes; - } - *dst = '\0'; - return (dst - str); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UtfToLower -- - * - * Convert uppercase characters to lowercase characters in a UTF string - * in place. The conversion may shrink the UTF string. - * - * Results: - * Returns the number of bytes in the resulting string excluding the - * trailing null. - * - * Side effects: - * Writes a terminating null after the last converted character. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UtfToLower( - char *str) /* String to convert in place. */ -{ - Tcl_UniChar ch = 0, lowChar; - char *src, *dst; - int bytes; - - /* - * Iterate over the string until we hit the terminating null. - */ - - src = dst = str; - while (*src) { - bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); - - /* - * To keep badly formed Utf strings from getting inflated by the - * conversion (thereby causing a segfault), only copy the lower case - * char to dst if its size is <= the original char. - */ - - if (bytes < UtfCount(lowChar)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; - } else { - dst += Tcl_UniCharToUtf(lowChar, dst); - } - src += bytes; - } - *dst = '\0'; - return (dst - str); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UtfToTitle -- - * - * Changes the first character of a UTF string to title case or uppercase - * and the rest of the string to lowercase. The conversion happens in - * place and may shrink the UTF string. - * - * Results: - * Returns the number of bytes in the resulting string excluding the - * trailing null. - * - * Side effects: - * Writes a terminating null after the last converted character. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UtfToTitle( - char *str) /* String to convert in place. */ -{ - Tcl_UniChar ch = 0, titleChar, lowChar; - char *src, *dst; - int bytes; - - /* - * Capitalize the first character and then lowercase the rest of the - * characters until we get to a null. - */ - - src = dst = str; - - if (*src) { - bytes = TclUtfToUniChar(src, &ch); - titleChar = Tcl_UniCharToTitle(ch); - - if (bytes < UtfCount(titleChar)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; - } else { - dst += Tcl_UniCharToUtf(titleChar, dst); - } - src += bytes; - } - while (*src) { - bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); - - if (bytes < UtfCount(lowChar)) { - memcpy(dst, src, (size_t) bytes); - dst += bytes; - } else { - dst += Tcl_UniCharToUtf(lowChar, dst); - } - src += bytes; - } - *dst = '\0'; - return (dst - str); -} - -/* - *---------------------------------------------------------------------- - * - * TclpUtfNcmp2 -- - * - * Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and - * ct are assumed to be at least numBytes bytes long. - * - * Results: - * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -TclpUtfNcmp2( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - unsigned long numBytes) /* Number of *bytes* to compare. */ -{ - /* - * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to - * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes - * fine in the strcmp manner. - */ - - register int result = 0; - - for ( ; numBytes != 0; numBytes--, cs++, ct++) { - if (*cs != *ct) { - result = UCHAR(*cs) - UCHAR(*ct); - break; - } - } - if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { - unsigned char c1, c2; - - c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); - c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct); - result = (c1 - c2); - } - return result; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UtfNcmp -- - * - * Compare at most numChars UTF chars of string cs to string ct. Both cs - * and ct are assumed to be at least numChars UTF chars long. - * - * Results: - * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UtfNcmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - unsigned long numChars) /* Number of UTF chars to compare. */ -{ - Tcl_UniChar ch1 = 0, ch2 = 0; - - /* - * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the - * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001 - * (the byte 0x01.) - */ - - while (numChars-- > 0) { - /* - * n must be interpreted as chars, not bytes. This should be called - * only when both strings are of at least n chars long (no need for \0 - * check) - */ - - cs += TclUtfToUniChar(cs, &ch1); - ct += TclUtfToUniChar(ct, &ch2); -#if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } -#endif - if (ch1 != ch2) { - return (ch1 - ch2); - } - } - return 0; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UtfNcasecmp -- - * - * Compare at most numChars UTF chars of string cs to string ct case - * insensitive. Both cs and ct are assumed to be at least numChars UTF - * chars long. - * - * Results: - * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UtfNcasecmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ - unsigned long numChars) /* Number of UTF chars to compare. */ -{ - Tcl_UniChar ch1 = 0, ch2 = 0; - - while (numChars-- > 0) { - /* - * n must be interpreted as chars, not bytes. - * This should be called only when both strings are of - * at least n chars long (no need for \0 check) - */ - cs += TclUtfToUniChar(cs, &ch1); - ct += TclUtfToUniChar(ct, &ch2); -#if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } -#endif - if (ch1 != ch2) { - ch1 = Tcl_UniCharToLower(ch1); - ch2 = Tcl_UniCharToLower(ch2); - if (ch1 != ch2) { - return (ch1 - ch2); - } - } - } - return 0; -} - -/* - *---------------------------------------------------------------------- - * - * TclUtfCasecmp -- - * - * Compare UTF chars of string cs to string ct case insensitively. - * Replacement for strcasecmp in Tcl core, in places where UTF-8 should - * be handled. - * - * Results: - * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -TclUtfCasecmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct) /* UTF string cs is compared to. */ -{ - Tcl_UniChar ch1 = 0, ch2 = 0; - - while (*cs && *ct) { - cs += TclUtfToUniChar(cs, &ch1); - ct += TclUtfToUniChar(ct, &ch2); -#if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } -#endif - if (ch1 != ch2) { - ch1 = Tcl_UniCharToLower(ch1); - ch2 = Tcl_UniCharToLower(ch2); - if (ch1 != ch2) { - return ch1 - ch2; - } - } - } - return UCHAR(*cs) - UCHAR(*ct); -} - - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharToUpper -- - * - * Compute the uppercase equivalent of the given Unicode character. - * - * Results: - * Returns the uppercase Unicode character. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -Tcl_UniChar -Tcl_UniCharToUpper( - int ch) /* Unicode character to convert. */ -{ - int info = GetUniCharInfo(ch); - - if (GetCaseType(info) & 0x04) { - ch -= GetDelta(info); - } - return (Tcl_UniChar) ch; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharToLower -- - * - * Compute the lowercase equivalent of the given Unicode character. - * - * Results: - * Returns the lowercase Unicode character. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -Tcl_UniChar -Tcl_UniCharToLower( - int ch) /* Unicode character to convert. */ -{ - int info = GetUniCharInfo(ch); - - if (GetCaseType(info) & 0x02) { - ch += GetDelta(info); - } - return (Tcl_UniChar) ch; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharToTitle -- - * - * Compute the titlecase equivalent of the given Unicode character. - * - * Results: - * Returns the titlecase Unicode character. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -Tcl_UniChar -Tcl_UniCharToTitle( - int ch) /* Unicode character to convert. */ -{ - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); - - if (mode & 0x1) { - /* - * Subtract or add one depending on the original case. - */ - - ch += ((mode & 0x4) ? -1 : 1); - } else if (mode == 0x4) { - ch -= GetDelta(info); - } - return (Tcl_UniChar) ch; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharLen -- - * - * Find the length of a UniChar string. The str input must be null - * terminated. - * - * Results: - * Returns the length of str in UniChars (not bytes). - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharLen( - const Tcl_UniChar *uniStr) /* Unicode string to find length of. */ -{ - int len = 0; - - while (*uniStr != '\0') { - len++; - uniStr++; - } - return len; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharNcmp -- - * - * Compare at most numChars unichars of string ucs to string uct. - * Both ucs and uct are assumed to be at least numChars unichars long. - * - * Results: - * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharNcmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ - unsigned long numChars) /* Number of unichars to compare. */ -{ -#ifdef WORDS_BIGENDIAN - /* - * We are definitely on a big-endian machine; memcmp() is safe - */ - - return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); - -#else /* !WORDS_BIGENDIAN */ - /* - * We can't simply call memcmp() because that is not lexically correct. - */ - - for ( ; numChars != 0; ucs++, uct++, numChars--) { - if (*ucs != *uct) { - return (*ucs - *uct); - } - } - return 0; -#endif /* WORDS_BIGENDIAN */ -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharNcasecmp -- - * - * Compare at most numChars unichars of string ucs to string uct case - * insensitive. Both ucs and uct are assumed to be at least numChars - * unichars long. - * - * Results: - * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharNcasecmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ - unsigned long numChars) /* Number of unichars to compare. */ -{ - for ( ; numChars != 0; numChars--, ucs++, uct++) { - if (*ucs != *uct) { - Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); - Tcl_UniChar lct = Tcl_UniCharToLower(*uct); - - if (lcs != lct) { - return (lcs - lct); - } - } - } - return 0; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsAlnum -- - * - * Test if a character is an alphanumeric Unicode character. - * - * Results: - * Returns 1 if character is alphanumeric. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsAlnum( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif - return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsAlpha -- - * - * Test if a character is an alphabetic Unicode character. - * - * Results: - * Returns 1 if character is alphabetic. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsAlpha( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif - return ((ALPHA_BITS >> GetCategory(ch)) & 1); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsControl -- - * - * Test if a character is a Unicode control character. - * - * Results: - * Returns non-zero if character is a control. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsControl( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - ch &= 0x1FFFFF; - if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) { - return 1; - } - if ((ch >= 0xF0000) && ((ch & 0xFFFF) <= 0xFFFD)) { - return 1; - } - return 0; - } -#endif - return ((CONTROL_BITS >> GetCategory(ch)) & 1); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsDigit -- - * - * Test if a character is a numeric Unicode character. - * - * Results: - * Returns non-zero if character is a digit. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsDigit( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif - return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsGraph -- - * - * Test if a character is any Unicode print character except space. - * - * Results: - * Returns non-zero if character is printable, but not space. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsGraph( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - ch &= 0x1FFFFF; - return (ch >= 0xE0100) && (ch <= 0xE01EF); - } -#endif - return ((GRAPH_BITS >> GetCategory(ch)) & 1); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsLower -- - * - * Test if a character is a lowercase Unicode character. - * - * Results: - * Returns non-zero if character is lowercase. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsLower( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif - return (GetCategory(ch) == LOWERCASE_LETTER); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsPrint -- - * - * Test if a character is a Unicode print character. - * - * Results: - * Returns non-zero if character is printable. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsPrint( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - ch &= 0x1FFFFF; - return (ch >= 0xE0100) && (ch <= 0xE01EF); - } -#endif - return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsPunct -- - * - * Test if a character is a Unicode punctuation character. - * - * Results: - * Returns non-zero if character is punct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsPunct( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif - return ((PUNCT_BITS >> GetCategory(ch)) & 1); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsSpace -- - * - * Test if a character is a whitespace Unicode character. - * - * Results: - * Returns non-zero if character is a space. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsSpace( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - /* Ignore upper 11 bits. */ - ch &= 0x1FFFFF; -#else - /* Ignore upper 16 bits. */ - ch &= 0xFFFF; -#endif - - /* - * If the character is within the first 127 characters, just use the - * standard C function, otherwise consult the Unicode table. - */ - - if (ch < 0x80) { - return TclIsSpaceProc((char) ch); -#if TCL_UTF_MAX > 3 - } else if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; -#endif - } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B - || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) { - return 1; - } else { - return ((SPACE_BITS >> GetCategory(ch)) & 1); - } -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsUpper -- - * - * Test if a character is a uppercase Unicode character. - * - * Results: - * Returns non-zero if character is uppercase. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsUpper( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif - return (GetCategory(ch) == UPPERCASE_LETTER); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsWordChar -- - * - * Test if a character is alphanumeric or a connector punctuation mark. - * - * Results: - * Returns 1 if character is a word character. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsWordChar( - int ch) /* Unicode character to test. */ -{ -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif - return ((WORD_BITS >> GetCategory(ch)) & 1); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharCaseMatch -- - * - * See if a particular Unicode string matches a particular pattern. - * Allows case insensitivity. This is the Unicode equivalent of the char* - * Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated. - * This has no provision for counted UniChar strings, thus should not be - * used where NULLs are expected in the UniChar string. Use - * TclUniCharMatch where possible. - * - * Results: - * The return value is 1 if string matches pattern, and 0 otherwise. The - * matching operation permits the following special characters in the - * pattern: *?\[] (see the manual entry for details on what these mean). - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharCaseMatch( - const Tcl_UniChar *uniStr, /* Unicode String. */ - const Tcl_UniChar *uniPattern, - /* Pattern, which may contain special - * characters. */ - int nocase) /* 0 for case sensitive, 1 for insensitive */ -{ - Tcl_UniChar ch1 = 0, p; - - while (1) { - p = *uniPattern; - - /* - * See if we're at the end of both the pattern and the string. If so, - * we succeeded. If we're at the end of the pattern but not at the end - * of the string, we failed. - */ - - if (p == 0) { - return (*uniStr == 0); - } - if ((*uniStr == 0) && (p != '*')) { - return 0; - } - - /* - * Check for a "*" as the next pattern character. It matches any - * substring. We handle this by skipping all the characters up to the - * next matching one in the pattern, and then calling ourselves - * recursively for each postfix of string, until either we match or we - * reach the end of the string. - */ - - if (p == '*') { - /* - * Skip all successive *'s in the pattern - */ - - while (*(++uniPattern) == '*') { - /* empty body */ - } - p = *uniPattern; - if (p == 0) { - return 1; - } - if (nocase) { - p = Tcl_UniCharToLower(p); - } - while (1) { - /* - * Optimization for matching - cruise through the string - * quickly if the next char in the pattern isn't a special - * character - */ - - if ((p != '[') && (p != '?') && (p != '\\')) { - if (nocase) { - while (*uniStr && (p != *uniStr) - && (p != Tcl_UniCharToLower(*uniStr))) { - uniStr++; - } - } else { - while (*uniStr && (p != *uniStr)) { - uniStr++; - } - } - } - if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) { - return 1; - } - if (*uniStr == 0) { - return 0; - } - uniStr++; - } - } - - /* - * Check for a "?" as the next pattern character. It matches any - * single character. - */ - - if (p == '?') { - uniPattern++; - uniStr++; - continue; - } - - /* - * Check for a "[" as the next pattern character. It is followed by a - * list of characters that are acceptable, or by a range (two - * characters separated by "-"). - */ - - if (p == '[') { - Tcl_UniChar startChar, endChar; - - uniPattern++; - ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); - uniStr++; - while (1) { - if ((*uniPattern == ']') || (*uniPattern == 0)) { - return 0; - } - startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) - : *uniPattern); - uniPattern++; - if (*uniPattern == '-') { - uniPattern++; - if (*uniPattern == 0) { - return 0; - } - endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) - : *uniPattern); - uniPattern++; - if (((startChar <= ch1) && (ch1 <= endChar)) - || ((endChar <= ch1) && (ch1 <= startChar))) { - /* - * Matches ranges of form [a-z] or [z-a]. - */ - break; - } - } else if (startChar == ch1) { - break; - } - } - while (*uniPattern != ']') { - if (*uniPattern == 0) { - uniPattern--; - break; - } - uniPattern++; - } - uniPattern++; - continue; - } - - /* - * If the next pattern character is '\', just strip off the '\' so we - * do exact matching on the character that follows. - */ - - if (p == '\\') { - if (*(++uniPattern) == '\0') { - return 0; - } - } - - /* - * There's no special character. Just make sure that the next bytes of - * each string match. - */ - - if (nocase) { - if (Tcl_UniCharToLower(*uniStr) != - Tcl_UniCharToLower(*uniPattern)) { - return 0; - } - } else if (*uniStr != *uniPattern) { - return 0; - } - uniStr++; - uniPattern++; - } -} - -/* - *---------------------------------------------------------------------- - * - * TclUniCharMatch -- - * - * See if a particular Unicode string matches a particular pattern. - * Allows case insensitivity. This is the Unicode equivalent of the char* - * Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted - * Strings, so embedded NULLs are allowed. - * - * Results: - * The return value is 1 if string matches pattern, and 0 otherwise. The - * matching operation permits the following special characters in the - * pattern: *?\[] (see the manual entry for details on what these mean). - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -TclUniCharMatch( - const Tcl_UniChar *string, /* Unicode String. */ - int strLen, /* Length of String */ - const Tcl_UniChar *pattern, /* Pattern, which may contain special - * characters. */ - int ptnLen, /* Length of Pattern */ - int nocase) /* 0 for case sensitive, 1 for insensitive */ -{ - const Tcl_UniChar *stringEnd, *patternEnd; - Tcl_UniChar p; - - stringEnd = string + strLen; - patternEnd = pattern + ptnLen; - - while (1) { - /* - * See if we're at the end of both the pattern and the string. If so, - * we succeeded. If we're at the end of the pattern but not at the end - * of the string, we failed. - */ - - if (pattern == patternEnd) { - return (string == stringEnd); - } - p = *pattern; - if ((string == stringEnd) && (p != '*')) { - return 0; - } - - /* - * Check for a "*" as the next pattern character. It matches any - * substring. We handle this by skipping all the characters up to the - * next matching one in the pattern, and then calling ourselves - * recursively for each postfix of string, until either we match or we - * reach the end of the string. - */ - - if (p == '*') { - /* - * Skip all successive *'s in the pattern. - */ - - while (*(++pattern) == '*') { - /* empty body */ - } - if (pattern == patternEnd) { - return 1; - } - p = *pattern; - if (nocase) { - p = Tcl_UniCharToLower(p); - } - while (1) { - /* - * Optimization for matching - cruise through the string - * quickly if the next char in the pattern isn't a special - * character. - */ - - if ((p != '[') && (p != '?') && (p != '\\')) { - if (nocase) { - while ((string < stringEnd) && (p != *string) - && (p != Tcl_UniCharToLower(*string))) { - string++; - } - } else { - while ((string < stringEnd) && (p != *string)) { - string++; - } - } - } - if (TclUniCharMatch(string, stringEnd - string, - pattern, patternEnd - pattern, nocase)) { - return 1; - } - if (string == stringEnd) { - return 0; - } - string++; - } - } - - /* - * Check for a "?" as the next pattern character. It matches any - * single character. - */ - - if (p == '?') { - pattern++; - string++; - continue; - } - - /* - * Check for a "[" as the next pattern character. It is followed by a - * list of characters that are acceptable, or by a range (two - * characters separated by "-"). - */ - - if (p == '[') { - Tcl_UniChar ch1, startChar, endChar; - - pattern++; - ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); - string++; - while (1) { - if ((*pattern == ']') || (pattern == patternEnd)) { - return 0; - } - startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); - pattern++; - if (*pattern == '-') { - pattern++; - if (pattern == patternEnd) { - return 0; - } - endChar = (nocase ? Tcl_UniCharToLower(*pattern) - : *pattern); - pattern++; - if (((startChar <= ch1) && (ch1 <= endChar)) - || ((endChar <= ch1) && (ch1 <= startChar))) { - /* - * Matches ranges of form [a-z] or [z-a]. - */ - break; - } - } else if (startChar == ch1) { - break; - } - } - while (*pattern != ']') { - if (pattern == patternEnd) { - pattern--; - break; - } - pattern++; - } - pattern++; - continue; - } - - /* - * If the next pattern character is '\', just strip off the '\' so we - * do exact matching on the character that follows. - */ - - if (p == '\\') { - if (++pattern == patternEnd) { - return 0; - } - } - - /* - * There's no special character. Just make sure that the next bytes of - * each string match. - */ - - if (nocase) { - if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { - return 0; - } - } else if (*string != *pattern) { - return 0; - } - string++; - pattern++; - } -} - -/* - * Local Variables: - * mode: c - * c-basic-offset: 4 - * fill-column: 78 - * End: - */ |