diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 1287 |
1 files changed, 1287 insertions, 0 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c new file mode 100644 index 0000000..89c6b60 --- /dev/null +++ b/generic/tclUtf.c @@ -0,0 +1,1287 @@ +/* + * tclUtf.c -- + * + * Routines for manipulating UTF-8 strings. + * + * Copyright (c) 1997-1998 Sun Microsystems, Inc. + * + * See the file "license.terms" for information on usage and redistribution + * of this file, and for a DISCLAIMER OF ALL WARRANTIES. + * + * RCS: @(#) $Id: tclUtf.c,v 1.2 1999/04/16 00:46:55 stanton Exp $ + */ + +#include "tclInt.h" + +/* + * Include the static character classification tables and macros. + */ + +#include "tclUniData.c" + +/* + * The following macros are used for fast character category tests. The + * x_BITS values are shifted right by the category value to determine whether + * the given category is included in the set. + */ + +#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ + | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER)) + +#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) + +#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ + | (1 << PARAGRAPH_SEPARATOR)) + +#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) + +/* + * Unicode characters less than this value are represented by themselves + * in UTF-8 strings. + */ + +#define UNICODE_SELF 0x80 + +/* + * The following structures are used when mapping between Unicode (UCS-2) + * and UTF-8. + */ + +CONST unsigned char totalBytes[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#if TCL_UTF_MAX > 3 + 4,4,4,4,4,4,4,4, +#else + 1,1,1,1,1,1,1,1, +#endif +#if TCL_UTF_MAX > 4 + 5,5,5,5, +#else + 1,1,1,1, +#endif +#if TCL_UTF_MAX > 5 + 6,6,6,6 +#else + 1,1,1,1 +#endif +}; + + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UniCharToUtf -- + * + * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the + * provided buffer. Equivalent to Plan 9 runetochar(). + * + * Results: + * The return values is the number of bytes in the buffer that + * were consumed. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +INLINE int +Tcl_UniCharToUtf(ch, str) + int ch; /* The Tcl_UniChar to be stored in the + * buffer. */ + char *str; /* Buffer in which the UTF-8 representation + * of the Tcl_UniChar is stored. Buffer must + * be large enough to hold the UTF-8 character + * (at most TCL_UTF_MAX bytes). */ +{ + if ((ch > 0) && (ch < UNICODE_SELF)) { + str[0] = (char) ch; + return 1; + } + if (ch <= 0x7FF) { + str[1] = (char) ((ch | 0x80) & 0xBF); + str[0] = (char) ((ch >> 6) | 0xC0); + return 2; + } + if (ch <= 0xFFFF) { + three: + str[2] = (char) ((ch | 0x80) & 0xBF); + str[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + str[0] = (char) ((ch >> 12) | 0xE0); + return 3; + } + +#if TCL_UTF_MAX > 3 + if (ch <= 0x1FFFFF) { + str[3] = (char) ((ch | 0x80) & 0xBF); + str[2] = (char) (((ch >> 6) | 0x80) & 0xBF); + str[1] = (char) (((ch >> 12) | 0x80) & 0xBF); + str[0] = (char) ((ch >> 18) | 0xF0); + return 4; + } + if (ch <= 0x3FFFFFF) { + str[4] = (char) ((ch | 0x80) & 0xBF); + str[3] = (char) (((ch >> 6) | 0x80) & 0xBF); + str[2] = (char) (((ch >> 12) | 0x80) & 0xBF); + str[1] = (char) (((ch >> 18) | 0x80) & 0xBF); + str[0] = (char) ((ch >> 24) | 0xF8); + return 5; + } + if (ch <= 0x7FFFFFFF) { + str[5] = (char) ((ch | 0x80) & 0xBF); + str[4] = (char) (((ch >> 6) | 0x80) & 0xBF); + str[3] = (char) (((ch >> 12) | 0x80) & 0xBF); + str[2] = (char) (((ch >> 18) | 0x80) & 0xBF); + str[1] = (char) (((ch >> 24) | 0x80) & 0xBF); + str[0] = (char) ((ch >> 30) | 0xFC); + return 6; + } +#endif + + ch = 0xFFFD; + goto three; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UniCharToUtfDString -- + * + * Convert the given Unicode string to UTF-8. + * + * Results: + * The return value is a pointer to the UTF-8 representation of the + * Unicode string. Storage for the return value is appended to the + * end of dsPtr. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +char * +Tcl_UniCharToUtfDString(wString, numChars, dsPtr) + CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */ + int numChars; /* Length of Unicode string in Tcl_UniChars + * (must be >= 0). */ + Tcl_DString *dsPtr; /* UTF-8 representation of string is + * appended to this previously initialized + * DString. */ +{ + CONST Tcl_UniChar *w, *wEnd; + char *p, *string; + int oldLength; + + /* + * UTF-8 string length in bytes will be <= Unicode string length * + * TCL_UTF_MAX. + */ + + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX); + string = Tcl_DStringValue(dsPtr) + oldLength; + + p = string; + wEnd = wString + numChars; + for (w = wString; w < wEnd; ) { + p += Tcl_UniCharToUtf(*w, p); + w++; + } + Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); + + return string; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfToUniChar -- + * + * Extract the Tcl_UniChar represented by the UTF-8 string. Bad + * UTF-8 sequences are converted to valid Tcl_UniChars and processing + * continues. Equivalent to Plan 9 chartorune(). + * + * The caller must ensure that the source buffer is long enough that + * this routine does not run off the end and dereference non-existent + * memory looking for trail bytes. If the source buffer is known to + * be '\0' terminated, this cannot happen. Otherwise, the caller + * should call Tcl_UtfCharComplete() before calling this routine to + * ensure that enough bytes remain in the string. + * + * Results: + * *chPtr is filled with the Tcl_UniChar, and the return value is the + * number of bytes from the UTF-8 string that were consumed. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +Tcl_UtfToUniChar(str, chPtr) + register CONST char *str; /* The UTF-8 string. */ + register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented + * by the UTF-8 string. */ +{ + register int byte; + + /* + * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. + */ + + byte = *((unsigned char *) str); + if (byte < 0xC0) { + /* + * Handles properly formed UTF-8 characters between 0x01 and 0x7F. + * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * characters representing themselves. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else if (byte < 0xE0) { + if ((str[1] & 0xC0) == 0x80) { + /* + * Two-byte-character lead-byte followed by a trail-byte. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); + return 2; + } + /* + * A two-byte-character lead-byte not followed by trail-byte + * represents itself. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else if (byte < 0xF0) { + if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) { + /* + * Three-byte-character lead byte followed by two trail bytes. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); + return 3; + } + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } +#if TCL_UTF_MAX > 3 + else { + int ch, total, trail; + + total = totalBytes[byte]; + trail = total - 1; + if (trail > 0) { + ch = byte & (0x3F >> trail); + do { + str++; + if ((*str & 0xC0) != 0x80) { + *chPtr = byte; + return 1; + } + ch <<= 6; + ch |= (*str & 0x3F); + trail--; + } while (trail > 0); + *chPtr = ch; + return total; + } + } +#endif + + *chPtr = (Tcl_UniChar) byte; + return 1; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfToUniCharDString -- + * + * Convert the UTF-8 string to Unicode. + * + * Results: + * The return value is a pointer to the Unicode representation of the + * UTF-8 string. Storage for the return value is appended to the + * end of dsPtr. The Unicode string is terminated with a Unicode + * NULL character. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +Tcl_UniChar * +Tcl_UtfToUniCharDString(string, length, dsPtr) + CONST char *string; /* UTF-8 string to convert to Unicode. */ + int length; /* Length of UTF-8 string in bytes, or -1 + * for strlen(). */ + Tcl_DString *dsPtr; /* Unicode representation of string is + * appended to this previously initialized + * DString. */ +{ + Tcl_UniChar *w, *wString; + CONST char *p, *end; + int oldLength; + + if (length < 0) { + length = strlen(string); + } + + /* + * Unicode string length in Tcl_UniChars will be <= UTF-8 string length + * in bytes. + */ + + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, + (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); + wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); + + w = wString; + end = string + length; + for (p = string; p < end; ) { + p += Tcl_UtfToUniChar(p, w); + w++; + } + *w = '\0'; + Tcl_DStringSetLength(dsPtr, + (oldLength + ((char *) w - (char *) wString))); + + return wString; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfCharComplete -- + * + * Determine if the UTF-8 string of the given length is long enough + * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the + * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune(). + * + * Results: + * The return value is 0 if the string is not long enough, non-zero + * otherwise. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +Tcl_UtfCharComplete(str, len) + CONST char *str; /* String to check if first few bytes + * contain a complete UTF-8 character. */ + int len; /* Length of above string in bytes. */ +{ + int ch; + + ch = *((unsigned char *) str); + return len >= totalBytes[ch]; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_NumUtfChars -- + * + * Returns the number of characters (not bytes) in the UTF-8 string, + * not including the terminating NULL byte. This is equivalent to + * Plan 9 utflen() and utfnlen(). + * + * Results: + * As above. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +Tcl_NumUtfChars(str, len) + register CONST char *str; /* The UTF-8 string to measure. */ + int len; /* The length of the string in bytes, or -1 + * for strlen(string). */ +{ + Tcl_UniChar ch; + register Tcl_UniChar *chPtr = &ch; + register int n; + int i; + + /* + * The separate implementations are faster. + */ + + i = 0; + if (len < 0) { + while (1) { + str += Tcl_UtfToUniChar(str, chPtr); + if (ch == '\0') { + break; + } + i++; + } + } else { + while (len > 0) { + n = Tcl_UtfToUniChar(str, chPtr); + len -= n; + str += n; + i++; + } + } + return i; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfFindFirst -- + * + * Returns a pointer to the first occurance of the given Tcl_UniChar + * in the NULL-terminated UTF-8 string. The NULL terminator is + * considered part of the UTF-8 string. Equivalent to Plan 9 + * utfrune(). + * + * Results: + * As above. If the Tcl_UniChar does not exist in the given string, + * the return value is NULL. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ +char * +Tcl_UtfFindFirst(string, ch) + CONST char *string; /* The UTF-8 string to be searched. */ + int ch; /* The Tcl_UniChar to search for. */ +{ + int len; + Tcl_UniChar find; + + while (1) { + len = Tcl_UtfToUniChar(string, &find); + if (find == ch) { + return (char *) string; + } + if (*string == '\0') { + return NULL; + } + string += len; + } +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfFindLast -- + * + * Returns a pointer to the last occurance of the given Tcl_UniChar + * in the NULL-terminated UTF-8 string. The NULL terminator is + * considered part of the UTF-8 string. Equivalent to Plan 9 + * utfrrune(). + * + * Results: + * As above. If the Tcl_UniChar does not exist in the given string, + * the return value is NULL. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +char * +Tcl_UtfFindLast(string, ch) + CONST char *string; /* The UTF-8 string to be searched. */ + int ch; /* The Tcl_UniChar to search for. */ +{ + int len; + Tcl_UniChar find; + CONST char *last; + + last = NULL; + while (1) { + len = Tcl_UtfToUniChar(string, &find); + if (find == ch) { + last = string; + } + if (*string == '\0') { + break; + } + string += len; + } + return (char *) last; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfNext -- + * + * Given a pointer to some current location in a UTF-8 string, + * move forward one character. The caller must ensure that they + * are not asking for the next character after the last character + * in the string. + * + * Results: + * The return value is the pointer to the next character in + * the UTF-8 string. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +char * +Tcl_UtfNext(str) + CONST char *str; /* The current location in the string. */ +{ + Tcl_UniChar ch; + + return (char *) str + Tcl_UtfToUniChar(str, &ch); +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfPrev -- + * + * Given a pointer to some current location in a UTF-8 string, + * move backwards one character. + * + * Results: + * The return value is a pointer to the previous character in the + * UTF-8 string. If the current location was already at the + * beginning of the string, the return value will also be a + * pointer to the beginning of the string. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +char * +Tcl_UtfPrev(str, start) + CONST char *str; /* The current location in the string. */ + CONST char *start; /* Pointer to the beginning of the + * string, to avoid going backwards too + * far. */ +{ + CONST char *look; + int i, byte; + + str--; + look = str; + for (i = 0; i < TCL_UTF_MAX; i++) { + if (look < start) { + if (str < start) { + str = start; + } + break; + } + byte = *((unsigned char *) look); + if (byte < 0x80) { + break; + } + if (byte >= 0xC0) { + if (totalBytes[byte] != i + 1) { + break; + } + return (char *) look; + } + look--; + } + return (char *) str; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UniCharAtIndex -- + * + * Returns the Unicode character represented at the specified + * character (not byte) position in the UTF-8 string. + * + * Results: + * As above. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +Tcl_UniChar +Tcl_UniCharAtIndex(src, index) + register CONST char *src; /* The UTF-8 string to dereference. */ + register int index; /* The position of the desired character. */ +{ + Tcl_UniChar ch; + + while (index >= 0) { + index--; + src += Tcl_UtfToUniChar(src, &ch); + } + return ch; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfAtIndex -- + * + * Returns a pointer to the specified character (not byte) position + * in the UTF-8 string. + * + * Results: + * As above. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +char * +Tcl_UtfAtIndex(src, index) + register CONST char *src; /* The UTF-8 string. */ + register int index; /* The position of the desired character. */ +{ + Tcl_UniChar ch; + + while (index > 0) { + index--; + src += Tcl_UtfToUniChar(src, &ch); + } + return (char *) src; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfBackslash -- + * + * Figure out how to handle a backslash sequence. + * + * Results: + * Stores the bytes represented by the backslash sequence in dst and + * returns the number of bytes written to dst. At most TCL_UTF_MAX + * bytes are written to dst; dst must have been large enough to accept + * those bytes. If readPtr isn't NULL then it is filled in with a + * count of the number of bytes in the backslash sequence. + * + * Side effects: + * The maximum number of bytes it takes to represent a Unicode + * character in UTF-8 is guaranteed to be less than the number of + * bytes used to express the backslash sequence that represents + * that Unicode character. If the target buffer into which the + * caller is going to store the bytes that represent the Unicode + * character is at least as large as the source buffer from which + * the backslashed sequence was extracted, no buffer overruns should + * occur. + * + *--------------------------------------------------------------------------- + */ + +int +Tcl_UtfBackslash(src, readPtr, dst) + CONST char *src; /* Points to the backslash character of + * a backslash sequence. */ + int *readPtr; /* Fill in with number of characters read + * from src, unless NULL. */ + char *dst; /* Filled with the bytes represented by the + * backslash sequence. */ +{ + register CONST char *p = src+1; + int result, count, n; + char buf[TCL_UTF_MAX]; + + if (dst == NULL) { + dst = buf; + } + + count = 2; + switch (*p) { + /* + * Note: in the conversions below, use absolute values (e.g., + * 0xa) rather than symbolic values (e.g. \n) that get converted + * by the compiler. It's possible that compilers on some + * platforms will do the symbolic conversions differently, which + * could result in non-portable Tcl scripts. + */ + + case 'a': + result = 0x7; + break; + case 'b': + result = 0x8; + break; + case 'f': + result = 0xc; + break; + case 'n': + result = 0xa; + break; + case 'r': + result = 0xd; + break; + case 't': + result = 0x9; + break; + case 'v': + result = 0xb; + break; + case 'x': + if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */ + char *end; + + result = (unsigned char) strtoul(p+1, &end, 16); + count = end - src; + } else { + count = 2; + result = 'x'; + } + break; + case 'u': + result = 0; + for (count = 0; count < 4; count++) { + p++; + if (!isxdigit(UCHAR(*p))) { /* INTL: digit */ + break; + } + n = *p - '0'; + if (n > 9) { + n = n + '0' + 10 - 'A'; + } + if (n > 16) { + n = n + 'A' - 'a'; + } + result = (result << 4) + n; + } + if (count == 0) { + result = 'u'; + } + count += 2; + break; + + case '\n': + do { + p++; + } while ((*p == ' ') || (*p == '\t')); + result = ' '; + count = p - src; + break; + case 0: + result = '\\'; + count = 1; + break; + default: + if (isdigit(UCHAR(*p))) { /* INTL: digit */ + result = (unsigned char)(*p - '0'); + p++; + if (!isdigit(UCHAR(*p))) { /* INTL: digit */ + break; + } + count = 3; + result = (unsigned char)((result << 3) + (*p - '0')); + p++; + if (!isdigit(UCHAR(*p))) { /* INTL: digit */ + break; + } + count = 4; + result = (unsigned char)((result << 3) + (*p - '0')); + break; + } + result = *p; + count = 2; + break; + } + + if (readPtr != NULL) { + *readPtr = count; + } + return Tcl_UniCharToUtf(result, dst); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfToUpper -- + * + * Convert lowercase characters to uppercase characters in a UTF + * string in place. The conversion may shrink the UTF string. + * + * Results: + * Returns the number of bytes in the resulting string + * excluding the trailing null. + * + * Side effects: + * Writes a terminating null after the last converted character. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfToUpper(str) + char *str; /* String to convert in place. */ +{ + Tcl_UniChar ch; + char *src, *dst; + + /* + * Iterate over the string until we hit the terminating null. + */ + + src = dst = str; + while (*src) { + src += Tcl_UtfToUniChar(src, &ch); + dst += Tcl_UniCharToUtf(Tcl_UniCharToUpper(ch), dst); + } + *dst = '\0'; + return (dst - str); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfToLower -- + * + * Convert uppercase characters to lowercase characters in a UTF + * string in place. The conversion may shrink the UTF string. + * + * Results: + * Returns the number of bytes in the resulting string + * excluding the trailing null. + * + * Side effects: + * Writes a terminating null after the last converted character. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfToLower(str) + char *str; /* String to convert in place. */ +{ + Tcl_UniChar ch; + char *src, *dst; + + /* + * Iterate over the string until we hit the terminating null. + */ + + src = dst = str; + while (*src) { + src += Tcl_UtfToUniChar(src, &ch); + dst += Tcl_UniCharToUtf(Tcl_UniCharToLower(ch), dst); + } + *dst = '\0'; + return (dst - str); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfToTitle -- + * + * Changes the first character of a UTF string to title case or + * uppercase and the rest of the string to lowercase. The + * conversion happens in place and may shrink the UTF string. + * + * Results: + * Returns the number of bytes in the resulting string + * excluding the trailing null. + * + * Side effects: + * Writes a terminating null after the last converted character. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfToTitle(str) + char *str; /* String to convert in place. */ +{ + Tcl_UniChar ch; + char *src, *dst; + + /* + * Capitalize the first character and then lowercase the rest of the + * characters until we get to a null. + */ + + src = dst = str; + + if (*src) { + src += Tcl_UtfToUniChar(src, &ch); + dst += Tcl_UniCharToUtf(Tcl_UniCharToTitle(ch), dst); + } + while (*src) { + src += Tcl_UtfToUniChar(src, &ch); + dst += Tcl_UniCharToUtf(Tcl_UniCharToLower(ch), dst); + } + *dst = '\0'; + return (dst - str); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharToUpper -- + * + * Compute the uppercase equivalent of the given Unicode character. + * + * Results: + * Returns the uppercase Unicode character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar +Tcl_UniCharToUpper(ch) + int ch; /* Unicode character to convert. */ +{ + int info = GetUniCharInfo(ch); + + if (GetCaseType(info) & 0x04) { + return (Tcl_UniChar) (ch - GetDelta(info)); + } else { + return ch; + } +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharToLower -- + * + * Compute the lowercase equivalent of the given Unicode character. + * + * Results: + * Returns the lowercase Unicode character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar +Tcl_UniCharToLower(ch) + int ch; /* Unicode character to convert. */ +{ + int info = GetUniCharInfo(ch); + + if (GetCaseType(info) & 0x02) { + return (Tcl_UniChar) (ch + GetDelta(info)); + } else { + return ch; + } +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharToTitle -- + * + * Compute the titlecase equivalent of the given Unicode character. + * + * Results: + * Returns the titlecase Unicode character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar +Tcl_UniCharToTitle(ch) + int ch; /* Unicode character to convert. */ +{ + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); + + if (mode & 0x1) { + /* + * Subtract or add one depending on the original case. + */ + + return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); + } else if (mode == 0x4) { + return (Tcl_UniChar) (ch - GetDelta(info)); + } else { + return ch; + } +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharLen -- + * + * Find the length of a UniChar string. The str input must be null + * terminated. + * + * Results: + * Returns the length of str in UniChars (not bytes). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharLen(str) + Tcl_UniChar *str; /* Unicode string to find length of. */ +{ + int len = 0; + + while (*str != '\0') { + len++; + str++; + } + return len; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharNcmp -- + * + * Compare at most n unichars of string cs to string ct. Both cs + * and ct are assumed to be at least n unichars long. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharNcmp(cs, ct, n) + CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ + CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ + size_t n; /* Number of unichars to compare. */ +{ + for ( ; n != 0; n--, cs++, ct++) { + if (*cs != *ct) { + return *cs - *ct; + } + if (*cs == '\0') { + break; + } + } + return 0; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsAlnum -- + * + * Test if a character is an alphanumeric Unicode character. + * + * Results: + * Returns 1 if character is alphanumeric. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsAlnum(ch) + int ch; /* Unicode character to test. */ +{ + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + + return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsAlpha -- + * + * Test if a character is an alphabetic Unicode character. + * + * Results: + * Returns 1 if character is alphabetic. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsAlpha(ch) + int ch; /* Unicode character to test. */ +{ + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((ALPHA_BITS >> category) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsDigit -- + * + * Test if a character is a numeric Unicode character. + * + * Results: + * Returns non-zero if character is a digit. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsDigit(ch) + int ch; /* Unicode character to test. */ +{ + return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) + == DECIMAL_DIGIT_NUMBER); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsLower -- + * + * Test if a character is a lowercase Unicode character. + * + * Results: + * Returns non-zero if character is lowercase. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsLower(ch) + int ch; /* Unicode character to test. */ +{ + return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsSpace -- + * + * Test if a character is a whitespace Unicode character. + * + * Results: + * Returns non-zero if character is a space. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsSpace(ch) + int ch; /* Unicode character to test. */ +{ + register int category; + + /* + * If the character is within the first 127 characters, just use the + * standard C function, otherwise consult the Unicode table. + */ + + if (ch < 0x80) { + return isspace(UCHAR(ch)); /* INTL: ISO space */ + } else { + category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((SPACE_BITS >> category) & 1); + } +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsUpper -- + * + * Test if a character is a uppercase Unicode character. + * + * Results: + * Returns non-zero if character is uppercase. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsUpper(ch) + int ch; /* Unicode character to test. */ +{ + return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsWordChar -- + * + * Test if a character is alphanumeric or a connector punctuation + * mark. + * + * Results: + * Returns 1 if character is a word character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsWordChar(ch) + int ch; /* Unicode character to test. */ +{ + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + + return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); +} |