diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 1457 |
1 files changed, 936 insertions, 521 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 5fe3c41..15529c7 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -5,10 +5,8 @@ * * Copyright (c) 1997-1998 Sun Microsystems, Inc. * - * See the file "license.terms" for information on usage and redistribution - * of this file, and for a DISCLAIMER OF ALL WARRANTIES. - * - * RCS: @(#) $Id: tclUtf.c,v 1.11 2000/01/11 22:09:00 hobbs Exp $ + * See the file "license.terms" for information on usage and redistribution of + * this file, and for a DISCLAIMER OF ALL WARRANTIES. */ #include "tclInt.h" @@ -20,49 +18,48 @@ #include "tclUniData.c" /* - * The following macros are used for fast character category tests. The - * x_BITS values are shifted right by the category value to determine whether - * the given category is included in the set. - */ + * The following macros are used for fast character category tests. The x_BITS + * values are shifted right by the category value to determine whether the + * given category is included in the set. + */ #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ - | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER)) + | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) + +#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE)) #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ - | (1 << PARAGRAPH_SEPARATOR)) + | (1 << PARAGRAPH_SEPARATOR)) -#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) - -#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ - (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ - (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ - (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ - (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ - (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ - (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ - (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ - (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) +#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION)) #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ - (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ - (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ - (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) + (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ + (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ + (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) + +#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \ + (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ + (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ + (1 << OTHER_NUMBER) | \ + (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ + (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) /* - * Unicode characters less than this value are represented by themselves - * in UTF-8 strings. + * Unicode characters less than this value are represented by themselves in + * UTF-8 strings. */ #define UNICODE_SELF 0x80 /* - * The following structures are used when mapping between Unicode (UCS-2) - * and UTF-8. + * The following structures are used when mapping between Unicode (UCS-2) and + * UTF-8. */ - -CONST unsigned char totalBytes[256] = { + +static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -89,11 +86,10 @@ CONST unsigned char totalBytes[256] = { }; /* - * Procedures used only in this module. + * Functions used only in this module. */ -static int UtfCount _ANSI_ARGS_((int ch)); - +static int UtfCount(int ch); /* *--------------------------------------------------------------------------- @@ -110,10 +106,10 @@ static int UtfCount _ANSI_ARGS_((int ch)); * *--------------------------------------------------------------------------- */ - -static int -UtfCount(ch) - int ch; /* The Tcl_UniChar whose size is returned. */ + +INLINE static int +UtfCount( + int ch) /* The Tcl_UniChar whose size is returned. */ { if ((ch > 0) && (ch < UNICODE_SELF)) { return 1; @@ -144,70 +140,72 @@ UtfCount(ch) * Tcl_UniCharToUtf -- * * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the - * provided buffer. Equivalent to Plan 9 runetochar(). + * provided buffer. Equivalent to Plan 9 runetochar(). * * Results: - * The return values is the number of bytes in the buffer that - * were consumed. + * The return values is the number of bytes in the buffer that were + * consumed. * * Side effects: * None. * *--------------------------------------------------------------------------- */ - + INLINE int -Tcl_UniCharToUtf(ch, str) - int ch; /* The Tcl_UniChar to be stored in the +Tcl_UniCharToUtf( + int ch, /* The Tcl_UniChar to be stored in the * buffer. */ - char *str; /* Buffer in which the UTF-8 representation - * of the Tcl_UniChar is stored. Buffer must - * be large enough to hold the UTF-8 character + char *buf) /* Buffer in which the UTF-8 representation of + * the Tcl_UniChar is stored. Buffer must be + * large enough to hold the UTF-8 character * (at most TCL_UTF_MAX bytes). */ { if ((ch > 0) && (ch < UNICODE_SELF)) { - str[0] = (char) ch; + buf[0] = (char) ch; return 1; } - if (ch <= 0x7FF) { - str[1] = (char) ((ch | 0x80) & 0xBF); - str[0] = (char) ((ch >> 6) | 0xC0); - return 2; - } - if (ch <= 0xFFFF) { + if (ch >= 0) { + if (ch <= 0x7FF) { + buf[1] = (char) ((ch | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 6) | 0xC0); + return 2; + } + if (ch <= 0xFFFF) { three: - str[2] = (char) ((ch | 0x80) & 0xBF); - str[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - str[0] = (char) ((ch >> 12) | 0xE0); - return 3; - } + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; + } #if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { - str[3] = (char) ((ch | 0x80) & 0xBF); - str[2] = (char) (((ch >> 6) | 0x80) & 0xBF); - str[1] = (char) (((ch >> 12) | 0x80) & 0xBF); - str[0] = (char) ((ch >> 18) | 0xF0); - return 4; - } - if (ch <= 0x3FFFFFF) { - str[4] = (char) ((ch | 0x80) & 0xBF); - str[3] = (char) (((ch >> 6) | 0x80) & 0xBF); - str[2] = (char) (((ch >> 12) | 0x80) & 0xBF); - str[1] = (char) (((ch >> 18) | 0x80) & 0xBF); - str[0] = (char) ((ch >> 24) | 0xF8); - return 5; - } - if (ch <= 0x7FFFFFFF) { - str[5] = (char) ((ch | 0x80) & 0xBF); - str[4] = (char) (((ch >> 6) | 0x80) & 0xBF); - str[3] = (char) (((ch >> 12) | 0x80) & 0xBF); - str[2] = (char) (((ch >> 18) | 0x80) & 0xBF); - str[1] = (char) (((ch >> 24) | 0x80) & 0xBF); - str[0] = (char) ((ch >> 30) | 0xFC); - return 6; - } + if (ch <= 0x1FFFFF) { + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 18) | 0xF0); + return 4; + } + if (ch <= 0x3FFFFFF) { + buf[4] = (char) ((ch | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 24) | 0xF8); + return 5; + } + if (ch <= 0x7FFFFFFF) { + buf[5] = (char) ((ch | 0x80) & 0xBF); + buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 30) | 0xFC); + return 6; + } #endif + } ch = 0xFFFD; goto three; @@ -222,25 +220,24 @@ Tcl_UniCharToUtf(ch, str) * * Results: * The return value is a pointer to the UTF-8 representation of the - * Unicode string. Storage for the return value is appended to the - * end of dsPtr. + * Unicode string. Storage for the return value is appended to the end of + * dsPtr. * * Side effects: * None. * *--------------------------------------------------------------------------- */ - + char * -Tcl_UniCharToUtfDString(wString, numChars, dsPtr) - CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */ - int numChars; /* Length of Unicode string in Tcl_UniChars +Tcl_UniCharToUtfDString( + const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + int uniLength, /* Length of Unicode string in Tcl_UniChars * (must be >= 0). */ - Tcl_DString *dsPtr; /* UTF-8 representation of string is - * appended to this previously initialized - * DString. */ + Tcl_DString *dsPtr) /* UTF-8 representation of string is appended + * to this previously initialized DString. */ { - CONST Tcl_UniChar *w, *wEnd; + const Tcl_UniChar *w, *wEnd; char *p, *string; int oldLength; @@ -250,12 +247,12 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr) */ oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX); + Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); string = Tcl_DStringValue(dsPtr) + oldLength; p = string; - wEnd = wString + numChars; - for (w = wString; w < wEnd; ) { + wEnd = uniStr + uniLength; + for (w = uniStr; w < wEnd; ) { p += Tcl_UniCharToUtf(*w, p); w++; } @@ -269,16 +266,16 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr) * * Tcl_UtfToUniChar -- * - * Extract the Tcl_UniChar represented by the UTF-8 string. Bad - * UTF-8 sequences are converted to valid Tcl_UniChars and processing - * continues. Equivalent to Plan 9 chartorune(). + * Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8 + * sequences are converted to valid Tcl_UniChars and processing + * continues. Equivalent to Plan 9 chartorune(). * - * The caller must ensure that the source buffer is long enough that - * this routine does not run off the end and dereference non-existent - * memory looking for trail bytes. If the source buffer is known to - * be '\0' terminated, this cannot happen. Otherwise, the caller - * should call Tcl_UtfCharComplete() before calling this routine to - * ensure that enough bytes remain in the string. + * The caller must ensure that the source buffer is long enough that this + * routine does not run off the end and dereference non-existent memory + * looking for trail bytes. If the source buffer is known to be '\0' + * terminated, this cannot happen. Otherwise, the caller should call + * Tcl_UtfCharComplete() before calling this routine to ensure that + * enough bytes remain in the string. * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the @@ -289,55 +286,57 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr) * *--------------------------------------------------------------------------- */ - + int -Tcl_UtfToUniChar(str, chPtr) - register CONST char *str; /* The UTF-8 string. */ - register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented - * by the UTF-8 string. */ +Tcl_UtfToUniChar( + register const char *src, /* The UTF-8 string. */ + register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by + * the UTF-8 string. */ { register int byte; - + /* * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. */ - byte = *((unsigned char *) str); + byte = *((unsigned char *) src); if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid * characters representing themselves. */ - + *chPtr = (Tcl_UniChar) byte; return 1; } else if (byte < 0xE0) { - if ((str[1] & 0xC0) == 0x80) { + if ((src[1] & 0xC0) == 0x80) { /* * Two-byte-character lead-byte followed by a trail-byte. */ - - *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); + + *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); return 2; } + /* * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ - + *chPtr = (Tcl_UniChar) byte; return 1; } else if (byte < 0xF0) { - if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* * Three-byte-character lead byte followed by two trail bytes. */ - *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) - | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); + *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); return 3; } + /* * A three-byte-character lead-byte not followed by two trail-bytes * represents itself. @@ -347,7 +346,7 @@ Tcl_UtfToUniChar(str, chPtr) return 1; } #if TCL_UTF_MAX > 3 - else { + { int ch, total, trail; total = totalBytes[byte]; @@ -355,13 +354,13 @@ Tcl_UtfToUniChar(str, chPtr) if (trail > 0) { ch = byte & (0x3F >> trail); do { - str++; - if ((*str & 0xC0) != 0x80) { + src++; + if ((*src & 0xC0) != 0x80) { *chPtr = byte; return 1; } ch <<= 6; - ch |= (*str & 0x3F); + ch |= (*src & 0x3F); trail--; } while (trail > 0); *chPtr = ch; @@ -383,9 +382,8 @@ Tcl_UtfToUniChar(str, chPtr) * * Results: * The return value is a pointer to the Unicode representation of the - * UTF-8 string. Storage for the return value is appended to the - * end of dsPtr. The Unicode string is terminated with a Unicode - * NULL character. + * UTF-8 string. Storage for the return value is appended to the end of + * dsPtr. The Unicode string is terminated with a Unicode NULL character. * * Side effects: * None. @@ -394,36 +392,37 @@ Tcl_UtfToUniChar(str, chPtr) */ Tcl_UniChar * -Tcl_UtfToUniCharDString(string, length, dsPtr) - CONST char *string; /* UTF-8 string to convert to Unicode. */ - int length; /* Length of UTF-8 string in bytes, or -1 - * for strlen(). */ - Tcl_DString *dsPtr; /* Unicode representation of string is +Tcl_UtfToUniCharDString( + const char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for + * strlen(). */ + Tcl_DString *dsPtr) /* Unicode representation of string is * appended to this previously initialized * DString. */ { Tcl_UniChar *w, *wString; - CONST char *p, *end; + const char *p, *end; int oldLength; if (length < 0) { - length = strlen(string); + length = strlen(src); } /* - * Unicode string length in Tcl_UniChars will be <= UTF-8 string length - * in bytes. + * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in + * bytes. */ oldLength = Tcl_DStringLength(dsPtr); +/* TODO: fix overreach! */ Tcl_DStringSetLength(dsPtr, (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); w = wString; - end = string + length; - for (p = string; p < end; ) { - p += Tcl_UtfToUniChar(p, w); + end = src + length; + for (p = src; p < end; ) { + p += TclUtfToUniChar(p, w); w++; } *w = '\0'; @@ -438,9 +437,9 @@ Tcl_UtfToUniCharDString(string, length, dsPtr) * * Tcl_UtfCharComplete -- * - * Determine if the UTF-8 string of the given length is long enough - * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the - * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune(). + * Determine if the UTF-8 string of the given length is long enough to be + * decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8 + * string is properly formed. Equivalent to Plan 9 fullrune(). * * Results: * The return value is 0 if the string is not long enough, non-zero @@ -453,15 +452,15 @@ Tcl_UtfToUniCharDString(string, length, dsPtr) */ int -Tcl_UtfCharComplete(str, len) - CONST char *str; /* String to check if first few bytes - * contain a complete UTF-8 character. */ - int len; /* Length of above string in bytes. */ +Tcl_UtfCharComplete( + const char *src, /* String to check if first few bytes contain + * a complete UTF-8 character. */ + int length) /* Length of above string in bytes. */ { int ch; - ch = *((unsigned char *) str); - return len >= totalBytes[ch]; + ch = *((unsigned char *) src); + return length >= totalBytes[ch]; } /* @@ -469,48 +468,54 @@ Tcl_UtfCharComplete(str, len) * * Tcl_NumUtfChars -- * - * Returns the number of characters (not bytes) in the UTF-8 string, - * not including the terminating NULL byte. This is equivalent to - * Plan 9 utflen() and utfnlen(). + * Returns the number of characters (not bytes) in the UTF-8 string, not + * including the terminating NULL byte. This is equivalent to Plan 9 + * utflen() and utfnlen(). * * Results: - * As above. + * As above. * * Side effects: * None. * *--------------------------------------------------------------------------- */ - -int -Tcl_NumUtfChars(str, len) - register CONST char *str; /* The UTF-8 string to measure. */ - int len; /* The length of the string in bytes, or -1 + +int +Tcl_NumUtfChars( + register const char *src, /* The UTF-8 string to measure. */ + int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { Tcl_UniChar ch; register Tcl_UniChar *chPtr = &ch; - register int n; - int i; + register int i; /* * The separate implementations are faster. + * + * Since this is a time-sensitive function, we also do the check for the + * single-byte char case specially. */ - + i = 0; - if (len < 0) { - while (1) { - str += Tcl_UtfToUniChar(str, chPtr); - if (ch == '\0') { - break; - } + if (length < 0) { + while (*src != '\0') { + src += TclUtfToUniChar(src, chPtr); i++; } } else { - while (len > 0) { - n = Tcl_UtfToUniChar(str, chPtr); - len -= n; - str += n; + register int n; + + while (length > 0) { + if (UCHAR(*src) < 0xC0) { + length--; + src++; + } else { + n = Tcl_UtfToUniChar(src, chPtr); + length -= n; + src += n; + } i++; } } @@ -522,37 +527,37 @@ Tcl_NumUtfChars(str, len) * * Tcl_UtfFindFirst -- * - * Returns a pointer to the first occurance of the given Tcl_UniChar - * in the NULL-terminated UTF-8 string. The NULL terminator is - * considered part of the UTF-8 string. Equivalent to Plan 9 - * utfrune(). + * Returns a pointer to the first occurance of the given Tcl_UniChar in + * the NULL-terminated UTF-8 string. The NULL terminator is considered + * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, - * the return value is NULL. + * As above. If the Tcl_UniChar does not exist in the given string, the + * return value is NULL. * * Side effects: * None. * *--------------------------------------------------------------------------- */ -char * -Tcl_UtfFindFirst(string, ch) - CONST char *string; /* The UTF-8 string to be searched. */ - int ch; /* The Tcl_UniChar to search for. */ + +const char * +Tcl_UtfFindFirst( + const char *src, /* The UTF-8 string to be searched. */ + int ch) /* The Tcl_UniChar to search for. */ { int len; Tcl_UniChar find; - + while (1) { - len = Tcl_UtfToUniChar(string, &find); + len = TclUtfToUniChar(src, &find); if (find == ch) { - return (char *) string; + return src; } - if (*string == '\0') { + if (*src == '\0') { return NULL; } - string += len; + src += len; } } @@ -561,14 +566,13 @@ Tcl_UtfFindFirst(string, ch) * * Tcl_UtfFindLast -- * - * Returns a pointer to the last occurance of the given Tcl_UniChar - * in the NULL-terminated UTF-8 string. The NULL terminator is - * considered part of the UTF-8 string. Equivalent to Plan 9 - * utfrrune(). + * Returns a pointer to the last occurance of the given Tcl_UniChar in + * the NULL-terminated UTF-8 string. The NULL terminator is considered + * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). * * Results: - * As above. If the Tcl_UniChar does not exist in the given string, - * the return value is NULL. + * As above. If the Tcl_UniChar does not exist in the given string, the + * return value is NULL. * * Side effects: * None. @@ -576,27 +580,27 @@ Tcl_UtfFindFirst(string, ch) *--------------------------------------------------------------------------- */ -char * -Tcl_UtfFindLast(string, ch) - CONST char *string; /* The UTF-8 string to be searched. */ - int ch; /* The Tcl_UniChar to search for. */ +const char * +Tcl_UtfFindLast( + const char *src, /* The UTF-8 string to be searched. */ + int ch) /* The Tcl_UniChar to search for. */ { int len; Tcl_UniChar find; - CONST char *last; - + const char *last; + last = NULL; while (1) { - len = Tcl_UtfToUniChar(string, &find); + len = TclUtfToUniChar(src, &find); if (find == ch) { - last = string; + last = src; } - if (*string == '\0') { + if (*src == '\0') { break; } - string += len; + src += len; } - return (char *) last; + return last; } /* @@ -604,28 +608,27 @@ Tcl_UtfFindLast(string, ch) * * Tcl_UtfNext -- * - * Given a pointer to some current location in a UTF-8 string, - * move forward one character. The caller must ensure that they - * are not asking for the next character after the last character - * in the string. + * Given a pointer to some current location in a UTF-8 string, move + * forward one character. The caller must ensure that they are not asking + * for the next character after the last character in the string. * * Results: - * The return value is the pointer to the next character in - * the UTF-8 string. + * The return value is the pointer to the next character in the UTF-8 + * string. * * Side effects: * None. * *--------------------------------------------------------------------------- */ - -char * -Tcl_UtfNext(str) - CONST char *str; /* The current location in the string. */ + +const char * +Tcl_UtfNext( + const char *src) /* The current location in the string. */ { Tcl_UniChar ch; - return (char *) str + Tcl_UtfToUniChar(str, &ch); + return src + TclUtfToUniChar(src, &ch); } /* @@ -633,14 +636,15 @@ Tcl_UtfNext(str) * * Tcl_UtfPrev -- * - * Given a pointer to some current location in a UTF-8 string, - * move backwards one character. + * Given a pointer to some current location in a UTF-8 string, move + * backwards one character. This works correctly when the pointer is in + * the middle of a UTF-8 character. * * Results: - * The return value is a pointer to the previous character in the - * UTF-8 string. If the current location was already at the - * beginning of the string, the return value will also be a - * pointer to the beginning of the string. + * The return value is a pointer to the previous character in the UTF-8 + * string. If the current location was already at the beginning of the + * string, the return value will also be a pointer to the beginning of + * the string. * * Side effects: * None. @@ -648,47 +652,43 @@ Tcl_UtfNext(str) *--------------------------------------------------------------------------- */ -char * -Tcl_UtfPrev(str, start) - CONST char *str; /* The current location in the string. */ - CONST char *start; /* Pointer to the beginning of the - * string, to avoid going backwards too - * far. */ +const char * +Tcl_UtfPrev( + const char *src, /* The current location in the string. */ + const char *start) /* Pointer to the beginning of the string, to + * avoid going backwards too far. */ { - CONST char *look; + const char *look; int i, byte; - - str--; - look = str; + + src--; + look = src; for (i = 0; i < TCL_UTF_MAX; i++) { if (look < start) { - if (str < start) { - str = start; + if (src < start) { + src = start; } break; } byte = *((unsigned char *) look); if (byte < 0x80) { break; - } + } if (byte >= 0xC0) { - if (totalBytes[byte] != i + 1) { - break; - } - return (char *) look; + return look; } look--; } - return (char *) str; + return src; } - + /* *--------------------------------------------------------------------------- * * Tcl_UniCharAtIndex -- * - * Returns the Unicode character represented at the specified - * character (not byte) position in the UTF-8 string. + * Returns the Unicode character represented at the specified character + * (not byte) position in the UTF-8 string. * * Results: * As above. @@ -698,17 +698,17 @@ Tcl_UtfPrev(str, start) * *--------------------------------------------------------------------------- */ - + Tcl_UniChar -Tcl_UniCharAtIndex(src, index) - register CONST char *src; /* The UTF-8 string to dereference. */ - register int index; /* The position of the desired character. */ +Tcl_UniCharAtIndex( + register const char *src, /* The UTF-8 string to dereference. */ + register int index) /* The position of the desired character. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; while (index >= 0) { index--; - src += Tcl_UtfToUniChar(src, &ch); + src += TclUtfToUniChar(src, &ch); } return ch; } @@ -718,8 +718,8 @@ Tcl_UniCharAtIndex(src, index) * * Tcl_UtfAtIndex -- * - * Returns a pointer to the specified character (not byte) position - * in the UTF-8 string. + * Returns a pointer to the specified character (not byte) position in + * the UTF-8 string. * * Results: * As above. @@ -730,18 +730,18 @@ Tcl_UniCharAtIndex(src, index) *--------------------------------------------------------------------------- */ -char * -Tcl_UtfAtIndex(src, index) - register CONST char *src; /* The UTF-8 string. */ - register int index; /* The position of the desired character. */ +const char * +Tcl_UtfAtIndex( + register const char *src, /* The UTF-8 string. */ + register int index) /* The position of the desired character. */ { Tcl_UniChar ch; - + while (index > 0) { index--; - src += Tcl_UtfToUniChar(src, &ch); + src += TclUtfToUniChar(src, &ch); } - return (char *) src; + return src; } /* @@ -753,145 +753,48 @@ Tcl_UtfAtIndex(src, index) * * Results: * Stores the bytes represented by the backslash sequence in dst and - * returns the number of bytes written to dst. At most TCL_UTF_MAX - * bytes are written to dst; dst must have been large enough to accept - * those bytes. If readPtr isn't NULL then it is filled in with a - * count of the number of bytes in the backslash sequence. + * returns the number of bytes written to dst. At most TCL_UTF_MAX bytes + * are written to dst; dst must have been large enough to accept those + * bytes. If readPtr isn't NULL then it is filled in with a count of the + * number of bytes in the backslash sequence. * * Side effects: - * The maximum number of bytes it takes to represent a Unicode - * character in UTF-8 is guaranteed to be less than the number of - * bytes used to express the backslash sequence that represents - * that Unicode character. If the target buffer into which the - * caller is going to store the bytes that represent the Unicode - * character is at least as large as the source buffer from which - * the backslashed sequence was extracted, no buffer overruns should - * occur. + * The maximum number of bytes it takes to represent a Unicode character + * in UTF-8 is guaranteed to be less than the number of bytes used to + * express the backslash sequence that represents that Unicode character. + * If the target buffer into which the caller is going to store the bytes + * that represent the Unicode character is at least as large as the + * source buffer from which the backslashed sequence was extracted, no + * buffer overruns should occur. * *--------------------------------------------------------------------------- */ int -Tcl_UtfBackslash(src, readPtr, dst) - CONST char *src; /* Points to the backslash character of - * a backslash sequence. */ - int *readPtr; /* Fill in with number of characters read - * from src, unless NULL. */ - char *dst; /* Filled with the bytes represented by the +Tcl_UtfBackslash( + const char *src, /* Points to the backslash character of a + * backslash sequence. */ + int *readPtr, /* Fill in with number of characters read from + * src, unless NULL. */ + char *dst) /* Filled with the bytes represented by the * backslash sequence. */ { - register CONST char *p = src+1; - int result, count, n; - char buf[TCL_UTF_MAX]; - - if (dst == NULL) { - dst = buf; - } +#define LINE_LENGTH 128 + int numRead; + int result; - count = 2; - switch (*p) { + result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); + if (numRead == LINE_LENGTH) { /* - * Note: in the conversions below, use absolute values (e.g., - * 0xa) rather than symbolic values (e.g. \n) that get converted - * by the compiler. It's possible that compilers on some - * platforms will do the symbolic conversions differently, which - * could result in non-portable Tcl scripts. - */ - - case 'a': - result = 0x7; - break; - case 'b': - result = 0x8; - break; - case 'f': - result = 0xc; - break; - case 'n': - result = 0xa; - break; - case 'r': - result = 0xd; - break; - case 't': - result = 0x9; - break; - case 'v': - result = 0xb; - break; - case 'x': - if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */ - char *end; - - result = (unsigned char) strtoul(p+1, &end, 16); - count = end - src; - } else { - count = 2; - result = 'x'; - } - break; - case 'u': - result = 0; - for (count = 0; count < 4; count++) { - p++; - if (!isxdigit(UCHAR(*p))) { /* INTL: digit */ - break; - } - n = *p - '0'; - if (n > 9) { - n = n + '0' + 10 - 'A'; - } - if (n > 16) { - n = n + 'A' - 'a'; - } - result = (result << 4) + n; - } - if (count == 0) { - result = 'u'; - } - count += 2; - break; - - case '\n': - do { - p++; - } while ((*p == ' ') || (*p == '\t')); - result = ' '; - count = p - src; - break; - case 0: - result = '\\'; - count = 1; - break; - default: - /* - * Check for an octal number \oo?o? - */ - if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */ - result = (unsigned char)(*p - '0'); - p++; - if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */ - break; - } - count = 3; - result = (unsigned char)((result << 3) + (*p - '0')); - p++; - if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */ - break; - } - count = 4; - result = (unsigned char)((result << 3) + (*p - '0')); - break; - } - result = *p; - count = 2; - break; - } + * We ate a whole line. Pay the price of a strlen() + */ + result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); + } if (readPtr != NULL) { - *readPtr = count; + *readPtr = numRead; } - return Tcl_UniCharToUtf(result, dst); + return result; } /* @@ -899,12 +802,12 @@ Tcl_UtfBackslash(src, readPtr, dst) * * Tcl_UtfToUpper -- * - * Convert lowercase characters to uppercase characters in a UTF - * string in place. The conversion may shrink the UTF string. + * Convert lowercase characters to uppercase characters in a UTF string + * in place. The conversion may shrink the UTF string. * * Results: - * Returns the number of bytes in the resulting string - * excluding the trailing null. + * Returns the number of bytes in the resulting string excluding the + * trailing null. * * Side effects: * Writes a terminating null after the last converted character. @@ -913,8 +816,8 @@ Tcl_UtfBackslash(src, readPtr, dst) */ int -Tcl_UtfToUpper(str) - char *str; /* String to convert in place. */ +Tcl_UtfToUpper( + char *str) /* String to convert in place. */ { Tcl_UniChar ch, upChar; char *src, *dst; @@ -926,15 +829,15 @@ Tcl_UtfToUpper(str) src = dst = str; while (*src) { - bytes = Tcl_UtfToUniChar(src, &ch); + bytes = TclUtfToUniChar(src, &ch); upChar = Tcl_UniCharToUpper(ch); /* - * To keep badly formed Utf strings from getting inflated by - * the conversion (thereby causing a segfault), only copy the - * upper case char to dst if its size is <= the original char. + * To keep badly formed Utf strings from getting inflated by the + * conversion (thereby causing a segfault), only copy the upper case + * char to dst if its size is <= the original char. */ - + if (bytes < UtfCount(upChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; @@ -952,12 +855,12 @@ Tcl_UtfToUpper(str) * * Tcl_UtfToLower -- * - * Convert uppercase characters to lowercase characters in a UTF - * string in place. The conversion may shrink the UTF string. + * Convert uppercase characters to lowercase characters in a UTF string + * in place. The conversion may shrink the UTF string. * * Results: - * Returns the number of bytes in the resulting string - * excluding the trailing null. + * Returns the number of bytes in the resulting string excluding the + * trailing null. * * Side effects: * Writes a terminating null after the last converted character. @@ -966,28 +869,28 @@ Tcl_UtfToUpper(str) */ int -Tcl_UtfToLower(str) - char *str; /* String to convert in place. */ +Tcl_UtfToLower( + char *str) /* String to convert in place. */ { Tcl_UniChar ch, lowChar; char *src, *dst; int bytes; - + /* * Iterate over the string until we hit the terminating null. */ src = dst = str; while (*src) { - bytes = Tcl_UtfToUniChar(src, &ch); + bytes = TclUtfToUniChar(src, &ch); lowChar = Tcl_UniCharToLower(ch); /* - * To keep badly formed Utf strings from getting inflated by - * the conversion (thereby causing a segfault), only copy the - * lower case char to dst if its size is <= the original char. + * To keep badly formed Utf strings from getting inflated by the + * conversion (thereby causing a segfault), only copy the lower case + * char to dst if its size is <= the original char. */ - + if (bytes < UtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; @@ -1005,13 +908,13 @@ Tcl_UtfToLower(str) * * Tcl_UtfToTitle -- * - * Changes the first character of a UTF string to title case or - * uppercase and the rest of the string to lowercase. The - * conversion happens in place and may shrink the UTF string. + * Changes the first character of a UTF string to title case or uppercase + * and the rest of the string to lowercase. The conversion happens in + * place and may shrink the UTF string. * * Results: - * Returns the number of bytes in the resulting string - * excluding the trailing null. + * Returns the number of bytes in the resulting string excluding the + * trailing null. * * Side effects: * Writes a terminating null after the last converted character. @@ -1020,13 +923,13 @@ Tcl_UtfToLower(str) */ int -Tcl_UtfToTitle(str) - char *str; /* String to convert in place. */ +Tcl_UtfToTitle( + char *str) /* String to convert in place. */ { Tcl_UniChar ch, titleChar, lowChar; char *src, *dst; int bytes; - + /* * Capitalize the first character and then lowercase the rest of the * characters until we get to a null. @@ -1035,7 +938,7 @@ Tcl_UtfToTitle(str) src = dst = str; if (*src) { - bytes = Tcl_UtfToUniChar(src, &ch); + bytes = TclUtfToUniChar(src, &ch); titleChar = Tcl_UniCharToTitle(ch); if (bytes < UtfCount(titleChar)) { @@ -1047,7 +950,7 @@ Tcl_UtfToTitle(str) src += bytes; } while (*src) { - bytes = Tcl_UtfToUniChar(src, &ch); + bytes = TclUtfToUniChar(src, &ch); lowChar = Tcl_UniCharToLower(ch); if (bytes < UtfCount(lowChar)) { @@ -1065,10 +968,57 @@ Tcl_UtfToTitle(str) /* *---------------------------------------------------------------------- * + * TclpUtfNcmp2 -- + * + * Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and + * ct are assumed to be at least numBytes bytes long. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclpUtfNcmp2( + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ + unsigned long numBytes) /* Number of *bytes* to compare. */ +{ + /* + * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to + * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes + * fine in the strcmp manner. + */ + + register int result = 0; + + for ( ; numBytes != 0; numBytes--, cs++, ct++) { + if (*cs != *ct) { + result = UCHAR(*cs) - UCHAR(*ct); + break; + } + } + if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { + unsigned char c1, c2; + + c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); + c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct); + result = (c1 - c2); + } + return result; +} + +/* + *---------------------------------------------------------------------- + * * Tcl_UtfNcmp -- * - * Compare at most n UTF chars of string cs to string ct. Both cs - * and ct are assumed to be at least n UTF chars long. + * Compare at most numChars UTF chars of string cs to string ct. Both cs + * and ct are assumed to be at least numChars UTF chars long. * * Results: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. @@ -1080,27 +1030,28 @@ Tcl_UtfToTitle(str) */ int -Tcl_UtfNcmp(cs, ct, n) - CONST char *cs; /* UTF string to compare to ct. */ - CONST char *ct; /* UTF string cs is compared to. */ - unsigned long n; /* Number of UTF chars to compare. */ +Tcl_UtfNcmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; + /* - * Another approach that should work is: - * return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs)); - * That assumes that ct is a properly formed UTF, so we will just - * be comparing the bytes that compromise those strings to the - * char length n. + * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the + * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001 + * (the byte 0x01.) */ - while (n-- > 0) { + + while (numChars-- > 0) { /* - * n must be interpreted as chars, not bytes. - * This should be called only when both strings are of - * at least n chars long (no need for \0 check) + * n must be interpreted as chars, not bytes. This should be called + * only when both strings are of at least n chars long (no need for \0 + * check) */ - cs += Tcl_UtfToUniChar(cs, &ch1); - ct += Tcl_UtfToUniChar(ct, &ch2); + + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { return (ch1 - ch2); } @@ -1113,9 +1064,9 @@ Tcl_UtfNcmp(cs, ct, n) * * Tcl_UtfNcasecmp -- * - * Compare at most n UTF chars of string cs to string ct case - * insensitive. Both cs and ct are assumed to be at least n - * UTF chars long. + * Compare at most numChars UTF chars of string cs to string ct case + * insensitive. Both cs and ct are assumed to be at least numChars UTF + * chars long. * * Results: * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. @@ -1127,20 +1078,20 @@ Tcl_UtfNcmp(cs, ct, n) */ int -Tcl_UtfNcasecmp(cs, ct, n) - CONST char *cs; /* UTF string to compare to ct. */ - CONST char *ct; /* UTF string cs is compared to. */ - unsigned long n; /* Number of UTF chars to compare. */ +Tcl_UtfNcasecmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; - while (n-- > 0) { + while (numChars-- > 0) { /* * n must be interpreted as chars, not bytes. * This should be called only when both strings are of * at least n chars long (no need for \0 check) */ - cs += Tcl_UtfToUniChar(cs, &ch1); - ct += Tcl_UtfToUniChar(ct, &ch2); + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); @@ -1155,6 +1106,46 @@ Tcl_UtfNcasecmp(cs, ct, n) /* *---------------------------------------------------------------------- * + * Tcl_UtfNcasecmp -- + * + * Compare UTF chars of string cs to string ct case insensitively. + * Replacement for strcasecmp in Tcl core, in places where UTF-8 should + * be handled. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCasecmp( + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ +{ + while (*cs && *ct) { + Tcl_UniChar ch1, ch2; + + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { + ch1 = Tcl_UniCharToLower(ch1); + ch2 = Tcl_UniCharToLower(ch2); + if (ch1 != ch2) { + return ch1 - ch2; + } + } + } + return UCHAR(*cs) - UCHAR(*ct); +} + + +/* + *---------------------------------------------------------------------- + * * Tcl_UniCharToUpper -- * * Compute the uppercase equivalent of the given Unicode character. @@ -1169,16 +1160,15 @@ Tcl_UtfNcasecmp(cs, ct, n) */ Tcl_UniChar -Tcl_UniCharToUpper(ch) - int ch; /* Unicode character to convert. */ +Tcl_UniCharToUpper( + int ch) /* Unicode character to convert. */ { int info = GetUniCharInfo(ch); if (GetCaseType(info) & 0x04) { - return (Tcl_UniChar) (ch - GetDelta(info)); - } else { - return ch; + ch -= GetDelta(info); } + return (Tcl_UniChar) ch; } /* @@ -1198,16 +1188,15 @@ Tcl_UniCharToUpper(ch) */ Tcl_UniChar -Tcl_UniCharToLower(ch) - int ch; /* Unicode character to convert. */ +Tcl_UniCharToLower( + int ch) /* Unicode character to convert. */ { int info = GetUniCharInfo(ch); if (GetCaseType(info) & 0x02) { - return (Tcl_UniChar) (ch + GetDelta(info)); - } else { - return ch; + ch += GetDelta(info); } + return (Tcl_UniChar) ch; } /* @@ -1227,8 +1216,8 @@ Tcl_UniCharToLower(ch) */ Tcl_UniChar -Tcl_UniCharToTitle(ch) - int ch; /* Unicode character to convert. */ +Tcl_UniCharToTitle( + int ch) /* Unicode character to convert. */ { int info = GetUniCharInfo(ch); int mode = GetCaseType(info); @@ -1238,12 +1227,11 @@ Tcl_UniCharToTitle(ch) * Subtract or add one depending on the original case. */ - return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); + ch += ((mode & 0x4) ? -1 : 1); } else if (mode == 0x4) { - return (Tcl_UniChar) (ch - GetDelta(info)); - } else { - return ch; + ch -= GetDelta(info); } + return (Tcl_UniChar) ch; } /* @@ -1251,7 +1239,7 @@ Tcl_UniCharToTitle(ch) * * Tcl_UniCharLen -- * - * Find the length of a UniChar string. The str input must be null + * Find the length of a UniChar string. The str input must be null * terminated. * * Results: @@ -1264,14 +1252,14 @@ Tcl_UniCharToTitle(ch) */ int -Tcl_UniCharLen(str) - Tcl_UniChar *str; /* Unicode string to find length of. */ +Tcl_UniCharLen( + const Tcl_UniChar *uniStr) /* Unicode string to find length of. */ { int len = 0; - - while (*str != '\0') { + + while (*uniStr != '\0') { len++; - str++; + uniStr++; } return len; } @@ -1281,11 +1269,11 @@ Tcl_UniCharLen(str) * * Tcl_UniCharNcmp -- * - * Compare at most n unichars of string cs to string ct. Both cs - * and ct are assumed to be at least n unichars long. + * Compare at most numChars unichars of string ucs to string uct. + * Both ucs and uct are assumed to be at least numChars unichars long. * * Results: - * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. * * Side effects: * None. @@ -1294,17 +1282,64 @@ Tcl_UniCharLen(str) */ int -Tcl_UniCharNcmp(cs, ct, n) - CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ - CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ - unsigned long n; /* Number of unichars to compare. */ +Tcl_UniCharNcmp( + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ { - for ( ; n != 0; n--, cs++, ct++) { - if (*cs != *ct) { - return *cs - *ct; +#ifdef WORDS_BIGENDIAN + /* + * We are definitely on a big-endian machine; memcmp() is safe + */ + + return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); + +#else /* !WORDS_BIGENDIAN */ + /* + * We can't simply call memcmp() because that is not lexically correct. + */ + + for ( ; numChars != 0; ucs++, uct++, numChars--) { + if (*ucs != *uct) { + return (*ucs - *uct); } - if (*cs == '\0') { - break; + } + return 0; +#endif /* WORDS_BIGENDIAN */ +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharNcasecmp -- + * + * Compare at most numChars unichars of string ucs to string uct case + * insensitive. Both ucs and uct are assumed to be at least numChars + * unichars long. + * + * Results: + * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharNcasecmp( + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ +{ + for ( ; numChars != 0; numChars--, ucs++, uct++) { + if (*ucs != *uct) { + Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); + Tcl_UniChar lct = Tcl_UniCharToLower(*uct); + + if (lcs != lct) { + return (lcs - lct); + } } } return 0; @@ -1327,12 +1362,10 @@ Tcl_UniCharNcmp(cs, ct, n) */ int -Tcl_UniCharIsAlnum(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsAlnum( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); + return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } /* @@ -1352,11 +1385,10 @@ Tcl_UniCharIsAlnum(ch) */ int -Tcl_UniCharIsAlpha(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsAlpha( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((ALPHA_BITS >> category) & 1); + return ((ALPHA_BITS >> GetCategory(ch)) & 1); } /* @@ -1376,10 +1408,10 @@ Tcl_UniCharIsAlpha(ch) */ int -Tcl_UniCharIsControl(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsControl( + int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); + return ((CONTROL_BITS >> GetCategory(ch)) & 1); } /* @@ -1399,11 +1431,10 @@ Tcl_UniCharIsControl(ch) */ int -Tcl_UniCharIsDigit(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsDigit( + int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) - == DECIMAL_DIGIT_NUMBER); + return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } /* @@ -1423,11 +1454,10 @@ Tcl_UniCharIsDigit(ch) */ int -Tcl_UniCharIsGraph(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsGraph( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); + return ((GRAPH_BITS >> GetCategory(ch)) & 1); } /* @@ -1447,10 +1477,10 @@ Tcl_UniCharIsGraph(ch) */ int -Tcl_UniCharIsLower(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsLower( + int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); + return (GetCategory(ch) == LOWERCASE_LETTER); } /* @@ -1470,11 +1500,10 @@ Tcl_UniCharIsLower(ch) */ int -Tcl_UniCharIsPrint(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsPrint( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PRINT_BITS >> category) & 1); + return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } /* @@ -1494,11 +1523,10 @@ Tcl_UniCharIsPrint(ch) */ int -Tcl_UniCharIsPunct(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsPunct( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((PUNCT_BITS >> category) & 1); + return ((PUNCT_BITS >> GetCategory(ch)) & 1); } /* @@ -1518,21 +1546,22 @@ Tcl_UniCharIsPunct(ch) */ int -Tcl_UniCharIsSpace(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsSpace( + int ch) /* Unicode character to test. */ { - register int category; - /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (ch < 0x80) { - return isspace(UCHAR(ch)); /* INTL: ISO space */ + if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { + return TclIsSpaceProc((char) ch); + } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e + || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060 + || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) { + return 1; } else { - category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((SPACE_BITS >> category) & 1); + return ((SPACE_BITS >> GetCategory(ch)) & 1); } } @@ -1553,10 +1582,10 @@ Tcl_UniCharIsSpace(ch) */ int -Tcl_UniCharIsUpper(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsUpper( + int ch) /* Unicode character to test. */ { - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); + return (GetCategory(ch) == UPPERCASE_LETTER); } /* @@ -1564,8 +1593,7 @@ Tcl_UniCharIsUpper(ch) * * Tcl_UniCharIsWordChar -- * - * Test if a character is alphanumeric or a connector punctuation - * mark. + * Test if a character is alphanumeric or a connector punctuation mark. * * Results: * Returns 1 if character is a word character. @@ -1577,10 +1605,397 @@ Tcl_UniCharIsUpper(ch) */ int -Tcl_UniCharIsWordChar(ch) - int ch; /* Unicode character to test. */ +Tcl_UniCharIsWordChar( + int ch) /* Unicode character to test. */ { - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((WORD_BITS >> GetCategory(ch)) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharCaseMatch -- + * + * See if a particular Unicode string matches a particular pattern. + * Allows case insensitivity. This is the Unicode equivalent of the char* + * Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated. + * This has no provision for counted UniChar strings, thus should not be + * used where NULLs are expected in the UniChar string. Use + * TclUniCharMatch where possible. + * + * Results: + * The return value is 1 if string matches pattern, and 0 otherwise. The + * matching operation permits the following special characters in the + * pattern: *?\[] (see the manual entry for details on what these mean). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharCaseMatch( + const Tcl_UniChar *uniStr, /* Unicode String. */ + const Tcl_UniChar *uniPattern, + /* Pattern, which may contain special + * characters. */ + int nocase) /* 0 for case sensitive, 1 for insensitive */ +{ + Tcl_UniChar ch1, p; + + while (1) { + p = *uniPattern; + + /* + * See if we're at the end of both the pattern and the string. If so, + * we succeeded. If we're at the end of the pattern but not at the end + * of the string, we failed. + */ + + if (p == 0) { + return (*uniStr == 0); + } + if ((*uniStr == 0) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + /* + * Skip all successive *'s in the pattern + */ + + while (*(++uniPattern) == '*') { + /* empty body */ + } + p = *uniPattern; + if (p == 0) { + return 1; + } + if (nocase) { + p = Tcl_UniCharToLower(p); + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character + */ + + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while (*uniStr && (p != *uniStr) + && (p != Tcl_UniCharToLower(*uniStr))) { + uniStr++; + } + } else { + while (*uniStr && (p != *uniStr)) { + uniStr++; + } + } + } + if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) { + return 1; + } + if (*uniStr == 0) { + return 0; + } + uniStr++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches any + * single character. + */ + + if (p == '?') { + uniPattern++; + uniStr++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed by a + * list of characters that are acceptable, or by a range (two + * characters separated by "-"). + */ + + if (p == '[') { + Tcl_UniChar startChar, endChar; + + uniPattern++; + ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); + uniStr++; + while (1) { + if ((*uniPattern == ']') || (*uniPattern == 0)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + : *uniPattern); + uniPattern++; + if (*uniPattern == '-') { + uniPattern++; + if (*uniPattern == 0) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + : *uniPattern); + uniPattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*uniPattern != ']') { + if (*uniPattern == 0) { + uniPattern--; + break; + } + uniPattern++; + } + uniPattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' so we + * do exact matching on the character that follows. + */ - return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); + if (p == '\\') { + if (*(++uniPattern) == '\0') { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next bytes of + * each string match. + */ + + if (nocase) { + if (Tcl_UniCharToLower(*uniStr) != + Tcl_UniCharToLower(*uniPattern)) { + return 0; + } + } else if (*uniStr != *uniPattern) { + return 0; + } + uniStr++; + uniPattern++; + } +} + +/* + *---------------------------------------------------------------------- + * + * TclUniCharMatch -- + * + * See if a particular Unicode string matches a particular pattern. + * Allows case insensitivity. This is the Unicode equivalent of the char* + * Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted + * Strings, so embedded NULLs are allowed. + * + * Results: + * The return value is 1 if string matches pattern, and 0 otherwise. The + * matching operation permits the following special characters in the + * pattern: *?\[] (see the manual entry for details on what these mean). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +TclUniCharMatch( + const Tcl_UniChar *string, /* Unicode String. */ + int strLen, /* Length of String */ + const Tcl_UniChar *pattern, /* Pattern, which may contain special + * characters. */ + int ptnLen, /* Length of Pattern */ + int nocase) /* 0 for case sensitive, 1 for insensitive */ +{ + const Tcl_UniChar *stringEnd, *patternEnd; + Tcl_UniChar p; + + stringEnd = string + strLen; + patternEnd = pattern + ptnLen; + + while (1) { + /* + * See if we're at the end of both the pattern and the string. If so, + * we succeeded. If we're at the end of the pattern but not at the end + * of the string, we failed. + */ + + if (pattern == patternEnd) { + return (string == stringEnd); + } + p = *pattern; + if ((string == stringEnd) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + /* + * Skip all successive *'s in the pattern. + */ + + while (*(++pattern) == '*') { + /* empty body */ + } + if (pattern == patternEnd) { + return 1; + } + p = *pattern; + if (nocase) { + p = Tcl_UniCharToLower(p); + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character. + */ + + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while ((string < stringEnd) && (p != *string) + && (p != Tcl_UniCharToLower(*string))) { + string++; + } + } else { + while ((string < stringEnd) && (p != *string)) { + string++; + } + } + } + if (TclUniCharMatch(string, stringEnd - string, + pattern, patternEnd - pattern, nocase)) { + return 1; + } + if (string == stringEnd) { + return 0; + } + string++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches any + * single character. + */ + + if (p == '?') { + pattern++; + string++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed by a + * list of characters that are acceptable, or by a range (two + * characters separated by "-"). + */ + + if (p == '[') { + Tcl_UniChar ch1, startChar, endChar; + + pattern++; + ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); + string++; + while (1) { + if ((*pattern == ']') || (pattern == patternEnd)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); + pattern++; + if (*pattern == '-') { + pattern++; + if (pattern == patternEnd) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*pattern) + : *pattern); + pattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*pattern != ']') { + if (pattern == patternEnd) { + pattern--; + break; + } + pattern++; + } + pattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' so we + * do exact matching on the character that follows. + */ + + if (p == '\\') { + if (++pattern == patternEnd) { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next bytes of + * each string match. + */ + + if (nocase) { + if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { + return 0; + } + } else if (*string != *pattern) { + return 0; + } + string++; + pattern++; + } } + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ |