diff options
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 296 |
1 files changed, 129 insertions, 167 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b33bf6a..e5497a4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -59,7 +59,7 @@ * UTF-8. */ -static const unsigned char totalBytes[256] = { +static CONST unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -73,13 +73,28 @@ static const unsigned char totalBytes[256] = { #else 1,1,1,1,1,1,1,1, #endif - 1,1,1,1,1,1,1,1 +#if TCL_UTF_MAX > 4 + 5,5,5,5, +#else + 1,1,1,1, +#endif +#if TCL_UTF_MAX > 5 + 6,6,6,6 +#else + 1,1,1,1 +#endif }; + +/* + * Functions used only in this module. + */ + +static int UtfCount(int ch); /* *--------------------------------------------------------------------------- * - * TclUtfCount -- + * UtfCount -- * * Find the number of bytes in the Utf character "ch". * @@ -92,20 +107,29 @@ static const unsigned char totalBytes[256] = { *--------------------------------------------------------------------------- */ -int -TclUtfCount( +INLINE static int +UtfCount( int ch) /* The Tcl_UniChar whose size is returned. */ { - if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { + if ((ch > 0) && (ch < UNICODE_SELF)) { return 1; } if (ch <= 0x7FF) { return 2; } + if (ch <= 0xFFFF) { + return 3; + } #if TCL_UTF_MAX > 3 - if (((unsigned)(ch - 0x10000) <= 0xfffff)) { + if (ch <= 0x1FFFFF) { return 4; } + if (ch <= 0x3FFFFFF) { + return 5; + } + if (ch <= 0x7FFFFFFF) { + return 6; + } #endif return 3; } @@ -128,7 +152,7 @@ TclUtfCount( *--------------------------------------------------------------------------- */ -int +INLINE int Tcl_UniCharToUtf( int ch, /* The Tcl_UniChar to be stored in the * buffer. */ @@ -137,7 +161,7 @@ Tcl_UniCharToUtf( * large enough to hold the UTF-8 character * (at most TCL_UTF_MAX bytes). */ { - if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { + if ((ch > 0) && (ch < UNICODE_SELF)) { buf[0] = (char) ch; return 1; } @@ -148,43 +172,43 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { -#if TCL_UTF_MAX == 4 - if ((ch & 0xF800) == 0xD800) { - if (ch & 0x0400) { - /* Low surrogate */ - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); - return 4; - } else { - /* High surrogate */ - ch += 0x40; - buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); - buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); - return 0; - } - } -#endif - goto three; + three: + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; } #if TCL_UTF_MAX > 3 - if (ch <= 0x10FFFF) { + if (ch <= 0x1FFFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } + if (ch <= 0x3FFFFFF) { + buf[4] = (char) ((ch | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 24) | 0xF8); + return 5; + } + if (ch <= 0x7FFFFFFF) { + buf[5] = (char) ((ch | 0x80) & 0xBF); + buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 30) | 0xFC); + return 6; + } #endif } ch = 0xFFFD; -three: - buf[2] = (char) ((ch | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 12) | 0xE0); - return 3; + goto three; } /* @@ -207,13 +231,13 @@ three: char * Tcl_UniCharToUtfDString( - const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ int uniLength, /* Length of Unicode string in Tcl_UniChars * (must be >= 0). */ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - const Tcl_UniChar *w, *wEnd; + CONST Tcl_UniChar *w, *wEnd; char *p, *string; int oldLength; @@ -265,7 +289,7 @@ Tcl_UniCharToUtfDString( int Tcl_UtfToUniChar( - register const char *src, /* The UTF-8 string. */ + register CONST char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { @@ -299,6 +323,9 @@ Tcl_UtfToUniChar( * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ + + *chPtr = (Tcl_UniChar) byte; + return 1; } else if (byte < 0xF0) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* @@ -314,23 +341,31 @@ Tcl_UtfToUniChar( * A three-byte-character lead-byte not followed by two trail-bytes * represents itself. */ + + *chPtr = (Tcl_UniChar) byte; + return 1; } #if TCL_UTF_MAX > 3 - else if (byte < 0xF8) { - if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { - /* - * Four-byte-character lead byte followed by three trail bytes. - */ - - *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); - return 4; + { + int ch, total, trail; + + total = totalBytes[byte]; + trail = total - 1; + if (trail > 0) { + ch = byte & (0x3F >> trail); + do { + src++; + if ((*src & 0xC0) != 0x80) { + *chPtr = byte; + return 1; + } + ch <<= 6; + ch |= (*src & 0x3F); + trail--; + } while (trail > 0); + *chPtr = ch; + return total; } - - /* - * A three-byte-character lead-byte not followed by two trail-bytes - * represents itself. - */ } #endif @@ -358,7 +393,7 @@ Tcl_UtfToUniChar( Tcl_UniChar * Tcl_UtfToUniCharDString( - const char *src, /* UTF-8 string to convert to Unicode. */ + CONST char *src, /* UTF-8 string to convert to Unicode. */ int length, /* Length of UTF-8 string in bytes, or -1 for * strlen(). */ Tcl_DString *dsPtr) /* Unicode representation of string is @@ -366,7 +401,7 @@ Tcl_UtfToUniCharDString( * DString. */ { Tcl_UniChar *w, *wString; - const char *p, *end; + CONST char *p, *end; int oldLength; if (length < 0) { @@ -379,7 +414,6 @@ Tcl_UtfToUniCharDString( */ oldLength = Tcl_DStringLength(dsPtr); -/* TODO: fix overreach! */ Tcl_DStringSetLength(dsPtr, (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); @@ -418,7 +452,7 @@ Tcl_UtfToUniCharDString( int Tcl_UtfCharComplete( - const char *src, /* String to check if first few bytes contain + CONST char *src, /* String to check if first few bytes contain * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { @@ -448,7 +482,7 @@ Tcl_UtfCharComplete( int Tcl_NumUtfChars( - register const char *src, /* The UTF-8 string to measure. */ + register CONST char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { @@ -506,9 +540,9 @@ Tcl_NumUtfChars( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfFindFirst( - const char *src, /* The UTF-8 string to be searched. */ + CONST char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; @@ -545,14 +579,14 @@ Tcl_UtfFindFirst( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfFindLast( - const char *src, /* The UTF-8 string to be searched. */ + CONST char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; Tcl_UniChar find; - const char *last; + CONST char *last; last = NULL; while (1) { @@ -587,9 +621,9 @@ Tcl_UtfFindLast( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfNext( - const char *src) /* The current location in the string. */ + CONST char *src) /* The current location in the string. */ { Tcl_UniChar ch; @@ -617,13 +651,13 @@ Tcl_UtfNext( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfPrev( - const char *src, /* The current location in the string. */ - const char *start) /* Pointer to the beginning of the string, to + CONST char *src, /* The current location in the string. */ + CONST char *start) /* Pointer to the beginning of the string, to * avoid going backwards too far. */ { - const char *look; + CONST char *look; int i, byte; src--; @@ -666,10 +700,10 @@ Tcl_UtfPrev( Tcl_UniChar Tcl_UniCharAtIndex( - register const char *src, /* The UTF-8 string to dereference. */ + register CONST char *src, /* The UTF-8 string to dereference. */ register int index) /* The position of the desired character. */ { - Tcl_UniChar ch = 0; + Tcl_UniChar ch; while (index >= 0) { index--; @@ -695,9 +729,9 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ -const char * +CONST char * Tcl_UtfAtIndex( - register const char *src, /* The UTF-8 string. */ + register CONST char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch; @@ -737,7 +771,7 @@ Tcl_UtfAtIndex( int Tcl_UtfBackslash( - const char *src, /* Points to the backslash character of a + CONST char *src, /* Points to the backslash character of a * backslash sequence. */ int *readPtr, /* Fill in with number of characters read from * src, unless NULL. */ @@ -803,7 +837,7 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if (bytes < TclUtfCount(upChar)) { + if (bytes < UtfCount(upChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -856,7 +890,7 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if (bytes < TclUtfCount(lowChar)) { + if (bytes < UtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -906,7 +940,7 @@ Tcl_UtfToTitle( bytes = TclUtfToUniChar(src, &ch); titleChar = Tcl_UniCharToTitle(ch); - if (bytes < TclUtfCount(titleChar)) { + if (bytes < UtfCount(titleChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -918,7 +952,7 @@ Tcl_UtfToTitle( bytes = TclUtfToUniChar(src, &ch); lowChar = Tcl_UniCharToLower(ch); - if (bytes < TclUtfCount(lowChar)) { + if (bytes < UtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -949,8 +983,8 @@ Tcl_UtfToTitle( int TclpUtfNcmp2( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ unsigned long numBytes) /* Number of *bytes* to compare. */ { /* @@ -996,8 +1030,8 @@ TclpUtfNcmp2( int Tcl_UtfNcmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; @@ -1044,8 +1078,8 @@ Tcl_UtfNcmp( int Tcl_UtfNcasecmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct, /* UTF string cs is compared to. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; @@ -1088,8 +1122,8 @@ Tcl_UtfNcasecmp( int TclUtfCasecmp( - const char *cs, /* UTF string to compare to ct. */ - const char *ct) /* UTF string cs is compared to. */ + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct) /* UTF string cs is compared to. */ { while (*cs && *ct) { Tcl_UniChar ch1, ch2; @@ -1218,7 +1252,7 @@ Tcl_UniCharToTitle( int Tcl_UniCharLen( - const Tcl_UniChar *uniStr) /* Unicode string to find length of. */ + CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */ { int len = 0; @@ -1248,8 +1282,8 @@ Tcl_UniCharLen( int Tcl_UniCharNcmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { #ifdef WORDS_BIGENDIAN @@ -1293,8 +1327,8 @@ Tcl_UniCharNcmp( int Tcl_UniCharNcasecmp( - const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { for ( ; numChars != 0; numChars--, ucs++, uct++) { @@ -1330,11 +1364,6 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } @@ -1358,11 +1387,6 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif return ((ALPHA_BITS >> GetCategory(ch)) & 1); } @@ -1386,18 +1410,6 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - ch &= 0x1fffff; - if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) { - return 1; - } - if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) { - return 1; - } - return 0; - } -#endif return ((CONTROL_BITS >> GetCategory(ch)) & 1); } @@ -1421,11 +1433,6 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } @@ -1449,12 +1456,6 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - ch &= 0x1fffff; - return (ch >= 0xe0100) && (ch <= 0xe01ef); - } -#endif return ((GRAPH_BITS >> GetCategory(ch)) & 1); } @@ -1478,11 +1479,6 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif return (GetCategory(ch) == LOWERCASE_LETTER); } @@ -1506,12 +1502,6 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - ch &= 0x1fffff; - return (ch >= 0xe0100) && (ch <= 0xe01ef); - } -#endif return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } @@ -1535,11 +1525,6 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif return ((PUNCT_BITS >> GetCategory(ch)) & 1); } @@ -1563,27 +1548,14 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - /* Ignore upper 11 bits. */ - ch &= 0x1fffff; -#else - /* Ignore upper 16 bits. */ - ch &= 0xffff; -#endif - /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (ch < 0x80) { + if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { return TclIsSpaceProc((char) ch); -#if TCL_UTF_MAX > 3 - } else if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; -#endif - } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b - || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) { + } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); @@ -1610,11 +1582,6 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif return (GetCategory(ch) == UPPERCASE_LETTER); } @@ -1638,11 +1605,6 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { -#if TCL_UTF_MAX > 3 - if (UNICODE_OUT_OF_RANGE(ch)) { - return 0; - } -#endif return ((WORD_BITS >> GetCategory(ch)) & 1); } @@ -1671,8 +1633,8 @@ Tcl_UniCharIsWordChar( int Tcl_UniCharCaseMatch( - const Tcl_UniChar *uniStr, /* Unicode String. */ - const Tcl_UniChar *uniPattern, + CONST Tcl_UniChar *uniStr, /* Unicode String. */ + CONST Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ @@ -1859,14 +1821,14 @@ Tcl_UniCharCaseMatch( int TclUniCharMatch( - const Tcl_UniChar *string, /* Unicode String. */ + CONST Tcl_UniChar *string, /* Unicode String. */ int strLen, /* Length of String */ - const Tcl_UniChar *pattern, /* Pattern, which may contain special + CONST Tcl_UniChar *pattern, /* Pattern, which may contain special * characters. */ int ptnLen, /* Length of Pattern */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - const Tcl_UniChar *stringEnd, *patternEnd; + CONST Tcl_UniChar *stringEnd, *patternEnd; Tcl_UniChar p; stringEnd = string + strLen; |
