diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 1287 |
1 files changed, 0 insertions, 1287 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c deleted file mode 100644 index 89c6b60..0000000 --- a/generic/tclUtf.c +++ /dev/null @@ -1,1287 +0,0 @@ -/* - * tclUtf.c -- - * - * Routines for manipulating UTF-8 strings. - * - * Copyright (c) 1997-1998 Sun Microsystems, Inc. - * - * See the file "license.terms" for information on usage and redistribution - * of this file, and for a DISCLAIMER OF ALL WARRANTIES. - * - * RCS: @(#) $Id: tclUtf.c,v 1.2 1999/04/16 00:46:55 stanton Exp $ - */ - -#include "tclInt.h" - -/* - * Include the static character classification tables and macros. - */ - -#include "tclUniData.c" - -/* - * The following macros are used for fast character category tests. The - * x_BITS values are shifted right by the category value to determine whether - * the given category is included in the set. - */ - -#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ - | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER)) - -#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) - -#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ - | (1 << PARAGRAPH_SEPARATOR)) - -#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) - -/* - * Unicode characters less than this value are represented by themselves - * in UTF-8 strings. - */ - -#define UNICODE_SELF 0x80 - -/* - * The following structures are used when mapping between Unicode (UCS-2) - * and UTF-8. - */ - -CONST unsigned char totalBytes[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -#if TCL_UTF_MAX > 3 - 4,4,4,4,4,4,4,4, -#else - 1,1,1,1,1,1,1,1, -#endif -#if TCL_UTF_MAX > 4 - 5,5,5,5, -#else - 1,1,1,1, -#endif -#if TCL_UTF_MAX > 5 - 6,6,6,6 -#else - 1,1,1,1 -#endif -}; - - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UniCharToUtf -- - * - * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the - * provided buffer. Equivalent to Plan 9 runetochar(). - * - * Results: - * The return values is the number of bytes in the buffer that - * were consumed. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -INLINE int -Tcl_UniCharToUtf(ch, str) - int ch; /* The Tcl_UniChar to be stored in the - * buffer. */ - char *str; /* Buffer in which the UTF-8 representation - * of the Tcl_UniChar is stored. Buffer must - * be large enough to hold the UTF-8 character - * (at most TCL_UTF_MAX bytes). */ -{ - if ((ch > 0) && (ch < UNICODE_SELF)) { - str[0] = (char) ch; - return 1; - } - if (ch <= 0x7FF) { - str[1] = (char) ((ch | 0x80) & 0xBF); - str[0] = (char) ((ch >> 6) | 0xC0); - return 2; - } - if (ch <= 0xFFFF) { - three: - str[2] = (char) ((ch | 0x80) & 0xBF); - str[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - str[0] = (char) ((ch >> 12) | 0xE0); - return 3; - } - -#if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { - str[3] = (char) ((ch | 0x80) & 0xBF); - str[2] = (char) (((ch >> 6) | 0x80) & 0xBF); - str[1] = (char) (((ch >> 12) | 0x80) & 0xBF); - str[0] = (char) ((ch >> 18) | 0xF0); - return 4; - } - if (ch <= 0x3FFFFFF) { - str[4] = (char) ((ch | 0x80) & 0xBF); - str[3] = (char) (((ch >> 6) | 0x80) & 0xBF); - str[2] = (char) (((ch >> 12) | 0x80) & 0xBF); - str[1] = (char) (((ch >> 18) | 0x80) & 0xBF); - str[0] = (char) ((ch >> 24) | 0xF8); - return 5; - } - if (ch <= 0x7FFFFFFF) { - str[5] = (char) ((ch | 0x80) & 0xBF); - str[4] = (char) (((ch >> 6) | 0x80) & 0xBF); - str[3] = (char) (((ch >> 12) | 0x80) & 0xBF); - str[2] = (char) (((ch >> 18) | 0x80) & 0xBF); - str[1] = (char) (((ch >> 24) | 0x80) & 0xBF); - str[0] = (char) ((ch >> 30) | 0xFC); - return 6; - } -#endif - - ch = 0xFFFD; - goto three; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UniCharToUtfDString -- - * - * Convert the given Unicode string to UTF-8. - * - * Results: - * The return value is a pointer to the UTF-8 representation of the - * Unicode string. Storage for the return value is appended to the - * end of dsPtr. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -char * -Tcl_UniCharToUtfDString(wString, numChars, dsPtr) - CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */ - int numChars; /* Length of Unicode string in Tcl_UniChars - * (must be >= 0). */ - Tcl_DString *dsPtr; /* UTF-8 representation of string is - * appended to this previously initialized - * DString. */ -{ - CONST Tcl_UniChar *w, *wEnd; - char *p, *string; - int oldLength; - - /* - * UTF-8 string length in bytes will be <= Unicode string length * - * TCL_UTF_MAX. - */ - - oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX); - string = Tcl_DStringValue(dsPtr) + oldLength; - - p = string; - wEnd = wString + numChars; - for (w = wString; w < wEnd; ) { - p += Tcl_UniCharToUtf(*w, p); - w++; - } - Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); - - return string; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfToUniChar -- - * - * Extract the Tcl_UniChar represented by the UTF-8 string. Bad - * UTF-8 sequences are converted to valid Tcl_UniChars and processing - * continues. Equivalent to Plan 9 chartorune(). - * - * The caller must ensure that the source buffer is long enough that - * this routine does not run off the end and dereference non-existent - * memory looking for trail bytes. If the source buffer is known to - * be '\0' terminated, this cannot happen. Otherwise, the caller - * should call Tcl_UtfCharComplete() before calling this routine to - * ensure that enough bytes remain in the string. - * - * Results: - * *chPtr is filled with the Tcl_UniChar, and the return value is the - * number of bytes from the UTF-8 string that were consumed. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -int -Tcl_UtfToUniChar(str, chPtr) - register CONST char *str; /* The UTF-8 string. */ - register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented - * by the UTF-8 string. */ -{ - register int byte; - - /* - * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. - */ - - byte = *((unsigned char *) str); - if (byte < 0xC0) { - /* - * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid - * characters representing themselves. - */ - - *chPtr = (Tcl_UniChar) byte; - return 1; - } else if (byte < 0xE0) { - if ((str[1] & 0xC0) == 0x80) { - /* - * Two-byte-character lead-byte followed by a trail-byte. - */ - - *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); - return 2; - } - /* - * A two-byte-character lead-byte not followed by trail-byte - * represents itself. - */ - - *chPtr = (Tcl_UniChar) byte; - return 1; - } else if (byte < 0xF0) { - if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) { - /* - * Three-byte-character lead byte followed by two trail bytes. - */ - - *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) - | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); - return 3; - } - /* - * A three-byte-character lead-byte not followed by two trail-bytes - * represents itself. - */ - - *chPtr = (Tcl_UniChar) byte; - return 1; - } -#if TCL_UTF_MAX > 3 - else { - int ch, total, trail; - - total = totalBytes[byte]; - trail = total - 1; - if (trail > 0) { - ch = byte & (0x3F >> trail); - do { - str++; - if ((*str & 0xC0) != 0x80) { - *chPtr = byte; - return 1; - } - ch <<= 6; - ch |= (*str & 0x3F); - trail--; - } while (trail > 0); - *chPtr = ch; - return total; - } - } -#endif - - *chPtr = (Tcl_UniChar) byte; - return 1; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfToUniCharDString -- - * - * Convert the UTF-8 string to Unicode. - * - * Results: - * The return value is a pointer to the Unicode representation of the - * UTF-8 string. Storage for the return value is appended to the - * end of dsPtr. The Unicode string is terminated with a Unicode - * NULL character. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -Tcl_UniChar * -Tcl_UtfToUniCharDString(string, length, dsPtr) - CONST char *string; /* UTF-8 string to convert to Unicode. */ - int length; /* Length of UTF-8 string in bytes, or -1 - * for strlen(). */ - Tcl_DString *dsPtr; /* Unicode representation of string is - * appended to this previously initialized - * DString. */ -{ - Tcl_UniChar *w, *wString; - CONST char *p, *end; - int oldLength; - - if (length < 0) { - length = strlen(string); - } - - /* - * Unicode string length in Tcl_UniChars will be <= UTF-8 string length - * in bytes. - */ - - oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, - (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); - wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); - - w = wString; - end = string + length; - for (p = string; p < end; ) { - p += Tcl_UtfToUniChar(p, w); - w++; - } - *w = '\0'; - Tcl_DStringSetLength(dsPtr, - (oldLength + ((char *) w - (char *) wString))); - - return wString; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfCharComplete -- - * - * Determine if the UTF-8 string of the given length is long enough - * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the - * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune(). - * - * Results: - * The return value is 0 if the string is not long enough, non-zero - * otherwise. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -int -Tcl_UtfCharComplete(str, len) - CONST char *str; /* String to check if first few bytes - * contain a complete UTF-8 character. */ - int len; /* Length of above string in bytes. */ -{ - int ch; - - ch = *((unsigned char *) str); - return len >= totalBytes[ch]; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_NumUtfChars -- - * - * Returns the number of characters (not bytes) in the UTF-8 string, - * not including the terminating NULL byte. This is equivalent to - * Plan 9 utflen() and utfnlen(). - * - * Results: - * As above. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -int -Tcl_NumUtfChars(str, len) - register CONST char *str; /* The UTF-8 string to measure. */ - int len; /* The length of the string in bytes, or -1 - * for strlen(string). */ -{ - Tcl_UniChar ch; - register Tcl_UniChar *chPtr = &ch; - register int n; - int i; - - /* - * The separate implementations are faster. - */ - - i = 0; - if (len < 0) { - while (1) { - str += Tcl_UtfToUniChar(str, chPtr); - if (ch == '\0') { - break; - } - i++; - } - } else { - while (len > 0) { - n = Tcl_UtfToUniChar(str, chPtr); - len -= n; - str += n; - i++; - } - } - return i; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfFindFirst -- - * - * Returns a pointer to the first occurance of the given Tcl_UniChar - * in the NULL-terminated UTF-8 string. The NULL terminator is - * considered part of the UTF-8 string. Equivalent to Plan 9 - * utfrune(). - * - * Results: - * As above. If the Tcl_UniChar does not exist in the given string, - * the return value is NULL. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ -char * -Tcl_UtfFindFirst(string, ch) - CONST char *string; /* The UTF-8 string to be searched. */ - int ch; /* The Tcl_UniChar to search for. */ -{ - int len; - Tcl_UniChar find; - - while (1) { - len = Tcl_UtfToUniChar(string, &find); - if (find == ch) { - return (char *) string; - } - if (*string == '\0') { - return NULL; - } - string += len; - } -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfFindLast -- - * - * Returns a pointer to the last occurance of the given Tcl_UniChar - * in the NULL-terminated UTF-8 string. The NULL terminator is - * considered part of the UTF-8 string. Equivalent to Plan 9 - * utfrrune(). - * - * Results: - * As above. If the Tcl_UniChar does not exist in the given string, - * the return value is NULL. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -char * -Tcl_UtfFindLast(string, ch) - CONST char *string; /* The UTF-8 string to be searched. */ - int ch; /* The Tcl_UniChar to search for. */ -{ - int len; - Tcl_UniChar find; - CONST char *last; - - last = NULL; - while (1) { - len = Tcl_UtfToUniChar(string, &find); - if (find == ch) { - last = string; - } - if (*string == '\0') { - break; - } - string += len; - } - return (char *) last; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfNext -- - * - * Given a pointer to some current location in a UTF-8 string, - * move forward one character. The caller must ensure that they - * are not asking for the next character after the last character - * in the string. - * - * Results: - * The return value is the pointer to the next character in - * the UTF-8 string. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -char * -Tcl_UtfNext(str) - CONST char *str; /* The current location in the string. */ -{ - Tcl_UniChar ch; - - return (char *) str + Tcl_UtfToUniChar(str, &ch); -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfPrev -- - * - * Given a pointer to some current location in a UTF-8 string, - * move backwards one character. - * - * Results: - * The return value is a pointer to the previous character in the - * UTF-8 string. If the current location was already at the - * beginning of the string, the return value will also be a - * pointer to the beginning of the string. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -char * -Tcl_UtfPrev(str, start) - CONST char *str; /* The current location in the string. */ - CONST char *start; /* Pointer to the beginning of the - * string, to avoid going backwards too - * far. */ -{ - CONST char *look; - int i, byte; - - str--; - look = str; - for (i = 0; i < TCL_UTF_MAX; i++) { - if (look < start) { - if (str < start) { - str = start; - } - break; - } - byte = *((unsigned char *) look); - if (byte < 0x80) { - break; - } - if (byte >= 0xC0) { - if (totalBytes[byte] != i + 1) { - break; - } - return (char *) look; - } - look--; - } - return (char *) str; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UniCharAtIndex -- - * - * Returns the Unicode character represented at the specified - * character (not byte) position in the UTF-8 string. - * - * Results: - * As above. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -Tcl_UniChar -Tcl_UniCharAtIndex(src, index) - register CONST char *src; /* The UTF-8 string to dereference. */ - register int index; /* The position of the desired character. */ -{ - Tcl_UniChar ch; - - while (index >= 0) { - index--; - src += Tcl_UtfToUniChar(src, &ch); - } - return ch; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfAtIndex -- - * - * Returns a pointer to the specified character (not byte) position - * in the UTF-8 string. - * - * Results: - * As above. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -char * -Tcl_UtfAtIndex(src, index) - register CONST char *src; /* The UTF-8 string. */ - register int index; /* The position of the desired character. */ -{ - Tcl_UniChar ch; - - while (index > 0) { - index--; - src += Tcl_UtfToUniChar(src, &ch); - } - return (char *) src; -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_UtfBackslash -- - * - * Figure out how to handle a backslash sequence. - * - * Results: - * Stores the bytes represented by the backslash sequence in dst and - * returns the number of bytes written to dst. At most TCL_UTF_MAX - * bytes are written to dst; dst must have been large enough to accept - * those bytes. If readPtr isn't NULL then it is filled in with a - * count of the number of bytes in the backslash sequence. - * - * Side effects: - * The maximum number of bytes it takes to represent a Unicode - * character in UTF-8 is guaranteed to be less than the number of - * bytes used to express the backslash sequence that represents - * that Unicode character. If the target buffer into which the - * caller is going to store the bytes that represent the Unicode - * character is at least as large as the source buffer from which - * the backslashed sequence was extracted, no buffer overruns should - * occur. - * - *--------------------------------------------------------------------------- - */ - -int -Tcl_UtfBackslash(src, readPtr, dst) - CONST char *src; /* Points to the backslash character of - * a backslash sequence. */ - int *readPtr; /* Fill in with number of characters read - * from src, unless NULL. */ - char *dst; /* Filled with the bytes represented by the - * backslash sequence. */ -{ - register CONST char *p = src+1; - int result, count, n; - char buf[TCL_UTF_MAX]; - - if (dst == NULL) { - dst = buf; - } - - count = 2; - switch (*p) { - /* - * Note: in the conversions below, use absolute values (e.g., - * 0xa) rather than symbolic values (e.g. \n) that get converted - * by the compiler. It's possible that compilers on some - * platforms will do the symbolic conversions differently, which - * could result in non-portable Tcl scripts. - */ - - case 'a': - result = 0x7; - break; - case 'b': - result = 0x8; - break; - case 'f': - result = 0xc; - break; - case 'n': - result = 0xa; - break; - case 'r': - result = 0xd; - break; - case 't': - result = 0x9; - break; - case 'v': - result = 0xb; - break; - case 'x': - if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */ - char *end; - - result = (unsigned char) strtoul(p+1, &end, 16); - count = end - src; - } else { - count = 2; - result = 'x'; - } - break; - case 'u': - result = 0; - for (count = 0; count < 4; count++) { - p++; - if (!isxdigit(UCHAR(*p))) { /* INTL: digit */ - break; - } - n = *p - '0'; - if (n > 9) { - n = n + '0' + 10 - 'A'; - } - if (n > 16) { - n = n + 'A' - 'a'; - } - result = (result << 4) + n; - } - if (count == 0) { - result = 'u'; - } - count += 2; - break; - - case '\n': - do { - p++; - } while ((*p == ' ') || (*p == '\t')); - result = ' '; - count = p - src; - break; - case 0: - result = '\\'; - count = 1; - break; - default: - if (isdigit(UCHAR(*p))) { /* INTL: digit */ - result = (unsigned char)(*p - '0'); - p++; - if (!isdigit(UCHAR(*p))) { /* INTL: digit */ - break; - } - count = 3; - result = (unsigned char)((result << 3) + (*p - '0')); - p++; - if (!isdigit(UCHAR(*p))) { /* INTL: digit */ - break; - } - count = 4; - result = (unsigned char)((result << 3) + (*p - '0')); - break; - } - result = *p; - count = 2; - break; - } - - if (readPtr != NULL) { - *readPtr = count; - } - return Tcl_UniCharToUtf(result, dst); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UtfToUpper -- - * - * Convert lowercase characters to uppercase characters in a UTF - * string in place. The conversion may shrink the UTF string. - * - * Results: - * Returns the number of bytes in the resulting string - * excluding the trailing null. - * - * Side effects: - * Writes a terminating null after the last converted character. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UtfToUpper(str) - char *str; /* String to convert in place. */ -{ - Tcl_UniChar ch; - char *src, *dst; - - /* - * Iterate over the string until we hit the terminating null. - */ - - src = dst = str; - while (*src) { - src += Tcl_UtfToUniChar(src, &ch); - dst += Tcl_UniCharToUtf(Tcl_UniCharToUpper(ch), dst); - } - *dst = '\0'; - return (dst - str); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UtfToLower -- - * - * Convert uppercase characters to lowercase characters in a UTF - * string in place. The conversion may shrink the UTF string. - * - * Results: - * Returns the number of bytes in the resulting string - * excluding the trailing null. - * - * Side effects: - * Writes a terminating null after the last converted character. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UtfToLower(str) - char *str; /* String to convert in place. */ -{ - Tcl_UniChar ch; - char *src, *dst; - - /* - * Iterate over the string until we hit the terminating null. - */ - - src = dst = str; - while (*src) { - src += Tcl_UtfToUniChar(src, &ch); - dst += Tcl_UniCharToUtf(Tcl_UniCharToLower(ch), dst); - } - *dst = '\0'; - return (dst - str); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UtfToTitle -- - * - * Changes the first character of a UTF string to title case or - * uppercase and the rest of the string to lowercase. The - * conversion happens in place and may shrink the UTF string. - * - * Results: - * Returns the number of bytes in the resulting string - * excluding the trailing null. - * - * Side effects: - * Writes a terminating null after the last converted character. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UtfToTitle(str) - char *str; /* String to convert in place. */ -{ - Tcl_UniChar ch; - char *src, *dst; - - /* - * Capitalize the first character and then lowercase the rest of the - * characters until we get to a null. - */ - - src = dst = str; - - if (*src) { - src += Tcl_UtfToUniChar(src, &ch); - dst += Tcl_UniCharToUtf(Tcl_UniCharToTitle(ch), dst); - } - while (*src) { - src += Tcl_UtfToUniChar(src, &ch); - dst += Tcl_UniCharToUtf(Tcl_UniCharToLower(ch), dst); - } - *dst = '\0'; - return (dst - str); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharToUpper -- - * - * Compute the uppercase equivalent of the given Unicode character. - * - * Results: - * Returns the uppercase Unicode character. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -Tcl_UniChar -Tcl_UniCharToUpper(ch) - int ch; /* Unicode character to convert. */ -{ - int info = GetUniCharInfo(ch); - - if (GetCaseType(info) & 0x04) { - return (Tcl_UniChar) (ch - GetDelta(info)); - } else { - return ch; - } -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharToLower -- - * - * Compute the lowercase equivalent of the given Unicode character. - * - * Results: - * Returns the lowercase Unicode character. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -Tcl_UniChar -Tcl_UniCharToLower(ch) - int ch; /* Unicode character to convert. */ -{ - int info = GetUniCharInfo(ch); - - if (GetCaseType(info) & 0x02) { - return (Tcl_UniChar) (ch + GetDelta(info)); - } else { - return ch; - } -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharToTitle -- - * - * Compute the titlecase equivalent of the given Unicode character. - * - * Results: - * Returns the titlecase Unicode character. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -Tcl_UniChar -Tcl_UniCharToTitle(ch) - int ch; /* Unicode character to convert. */ -{ - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); - - if (mode & 0x1) { - /* - * Subtract or add one depending on the original case. - */ - - return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); - } else if (mode == 0x4) { - return (Tcl_UniChar) (ch - GetDelta(info)); - } else { - return ch; - } -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharLen -- - * - * Find the length of a UniChar string. The str input must be null - * terminated. - * - * Results: - * Returns the length of str in UniChars (not bytes). - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharLen(str) - Tcl_UniChar *str; /* Unicode string to find length of. */ -{ - int len = 0; - - while (*str != '\0') { - len++; - str++; - } - return len; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharNcmp -- - * - * Compare at most n unichars of string cs to string ct. Both cs - * and ct are assumed to be at least n unichars long. - * - * Results: - * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharNcmp(cs, ct, n) - CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ - CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ - size_t n; /* Number of unichars to compare. */ -{ - for ( ; n != 0; n--, cs++, ct++) { - if (*cs != *ct) { - return *cs - *ct; - } - if (*cs == '\0') { - break; - } - } - return 0; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsAlnum -- - * - * Test if a character is an alphanumeric Unicode character. - * - * Results: - * Returns 1 if character is alphanumeric. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsAlnum(ch) - int ch; /* Unicode character to test. */ -{ - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsAlpha -- - * - * Test if a character is an alphabetic Unicode character. - * - * Results: - * Returns 1 if character is alphabetic. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsAlpha(ch) - int ch; /* Unicode character to test. */ -{ - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((ALPHA_BITS >> category) & 1); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsDigit -- - * - * Test if a character is a numeric Unicode character. - * - * Results: - * Returns non-zero if character is a digit. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsDigit(ch) - int ch; /* Unicode character to test. */ -{ - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) - == DECIMAL_DIGIT_NUMBER); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsLower -- - * - * Test if a character is a lowercase Unicode character. - * - * Results: - * Returns non-zero if character is lowercase. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsLower(ch) - int ch; /* Unicode character to test. */ -{ - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsSpace -- - * - * Test if a character is a whitespace Unicode character. - * - * Results: - * Returns non-zero if character is a space. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsSpace(ch) - int ch; /* Unicode character to test. */ -{ - register int category; - - /* - * If the character is within the first 127 characters, just use the - * standard C function, otherwise consult the Unicode table. - */ - - if (ch < 0x80) { - return isspace(UCHAR(ch)); /* INTL: ISO space */ - } else { - category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - return ((SPACE_BITS >> category) & 1); - } -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsUpper -- - * - * Test if a character is a uppercase Unicode character. - * - * Results: - * Returns non-zero if character is uppercase. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsUpper(ch) - int ch; /* Unicode character to test. */ -{ - return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_UniCharIsWordChar -- - * - * Test if a character is alphanumeric or a connector punctuation - * mark. - * - * Results: - * Returns 1 if character is a word character. - * - * Side effects: - * None. - * - *---------------------------------------------------------------------- - */ - -int -Tcl_UniCharIsWordChar(ch) - int ch; /* Unicode character to test. */ -{ - register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - - return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); -} |