diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 274 |
1 files changed, 159 insertions, 115 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e5497a4..68119a4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -59,7 +59,7 @@ * UTF-8. */ -static CONST unsigned char totalBytes[256] = { +static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -73,16 +73,7 @@ static CONST unsigned char totalBytes[256] = { #else 1,1,1,1,1,1,1,1, #endif -#if TCL_UTF_MAX > 4 - 5,5,5,5, -#else - 1,1,1,1, -#endif -#if TCL_UTF_MAX > 5 - 6,6,6,6 -#else - 1,1,1,1 -#endif + 1,1,1,1,1,1,1,1 }; /* @@ -111,25 +102,16 @@ INLINE static int UtfCount( int ch) /* The Tcl_UniChar whose size is returned. */ { - if ((ch > 0) && (ch < UNICODE_SELF)) { + if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { return 1; } if (ch <= 0x7FF) { return 2; } - if (ch <= 0xFFFF) { - return 3; - } #if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { + if (((unsigned)(ch - 0x10000) <= 0xfffff)) { return 4; } - if (ch <= 0x3FFFFFF) { - return 5; - } - if (ch <= 0x7FFFFFFF) { - return 6; - } #endif return 3; } @@ -161,7 +143,7 @@ Tcl_UniCharToUtf( * large enough to hold the UTF-8 character * (at most TCL_UTF_MAX bytes). */ { - if ((ch > 0) && (ch < UNICODE_SELF)) { + if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; return 1; } @@ -172,43 +154,43 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { - three: - buf[2] = (char) ((ch | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 12) | 0xE0); - return 3; +#if TCL_UTF_MAX == 4 + if ((ch & 0xF800) == 0xD800) { + if (ch & 0x0400) { + /* Low surrogate */ + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); + return 4; + } else { + /* High surrogate */ + ch += 0x40; + buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); + buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); + buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + return 0; + } + } +#endif + goto three; } #if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { + if (ch <= 0x10FFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } - if (ch <= 0x3FFFFFF) { - buf[4] = (char) ((ch | 0x80) & 0xBF); - buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 24) | 0xF8); - return 5; - } - if (ch <= 0x7FFFFFFF) { - buf[5] = (char) ((ch | 0x80) & 0xBF); - buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 30) | 0xFC); - return 6; - } #endif } ch = 0xFFFD; - goto three; +three: + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; } /* @@ -231,13 +213,13 @@ Tcl_UniCharToUtf( char * Tcl_UniCharToUtfDString( - CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ int uniLength, /* Length of Unicode string in Tcl_UniChars * (must be >= 0). */ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - CONST Tcl_UniChar *w, *wEnd; + const Tcl_UniChar *w, *wEnd; char *p, *string; int oldLength; @@ -289,7 +271,7 @@ Tcl_UniCharToUtfDString( int Tcl_UtfToUniChar( - register CONST char *src, /* The UTF-8 string. */ + register const char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { @@ -323,9 +305,6 @@ Tcl_UtfToUniChar( * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ - - *chPtr = (Tcl_UniChar) byte; - return 1; } else if (byte < 0xF0) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* @@ -341,31 +320,23 @@ Tcl_UtfToUniChar( * A three-byte-character lead-byte not followed by two trail-bytes * represents itself. */ - - *chPtr = (Tcl_UniChar) byte; - return 1; } #if TCL_UTF_MAX > 3 - { - int ch, total, trail; - - total = totalBytes[byte]; - trail = total - 1; - if (trail > 0) { - ch = byte & (0x3F >> trail); - do { - src++; - if ((*src & 0xC0) != 0x80) { - *chPtr = byte; - return 1; - } - ch <<= 6; - ch |= (*src & 0x3F); - trail--; - } while (trail > 0); - *chPtr = ch; - return total; + else if (byte < 0xF8) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { + /* + * Four-byte-character lead byte followed by three trail bytes. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12) + | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); + return 4; } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. + */ } #endif @@ -393,7 +364,7 @@ Tcl_UtfToUniChar( Tcl_UniChar * Tcl_UtfToUniCharDString( - CONST char *src, /* UTF-8 string to convert to Unicode. */ + const char *src, /* UTF-8 string to convert to Unicode. */ int length, /* Length of UTF-8 string in bytes, or -1 for * strlen(). */ Tcl_DString *dsPtr) /* Unicode representation of string is @@ -401,7 +372,7 @@ Tcl_UtfToUniCharDString( * DString. */ { Tcl_UniChar *w, *wString; - CONST char *p, *end; + const char *p, *end; int oldLength; if (length < 0) { @@ -414,6 +385,7 @@ Tcl_UtfToUniCharDString( */ oldLength = Tcl_DStringLength(dsPtr); +/* TODO: fix overreach! */ Tcl_DStringSetLength(dsPtr, (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); @@ -452,7 +424,7 @@ Tcl_UtfToUniCharDString( int Tcl_UtfCharComplete( - CONST char *src, /* String to check if first few bytes contain + const char *src, /* String to check if first few bytes contain * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { @@ -482,7 +454,7 @@ Tcl_UtfCharComplete( int Tcl_NumUtfChars( - register CONST char *src, /* The UTF-8 string to measure. */ + register const char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { @@ -540,9 +512,9 @@ Tcl_NumUtfChars( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfFindFirst( - CONST char *src, /* The UTF-8 string to be searched. */ + const char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; @@ -579,14 +551,14 @@ Tcl_UtfFindFirst( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfFindLast( - CONST char *src, /* The UTF-8 string to be searched. */ + const char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; Tcl_UniChar find; - CONST char *last; + const char *last; last = NULL; while (1) { @@ -621,9 +593,9 @@ Tcl_UtfFindLast( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfNext( - CONST char *src) /* The current location in the string. */ + const char *src) /* The current location in the string. */ { Tcl_UniChar ch; @@ -651,13 +623,13 @@ Tcl_UtfNext( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfPrev( - CONST char *src, /* The current location in the string. */ - CONST char *start) /* Pointer to the beginning of the string, to + const char *src, /* The current location in the string. */ + const char *start) /* Pointer to the beginning of the string, to * avoid going backwards too far. */ { - CONST char *look; + const char *look; int i, byte; src--; @@ -700,10 +672,10 @@ Tcl_UtfPrev( Tcl_UniChar Tcl_UniCharAtIndex( - register CONST char *src, /* The UTF-8 string to dereference. */ + register const char *src, /* The UTF-8 string to dereference. */ register int index) /* The position of the desired character. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; while (index >= 0) { index--; @@ -729,9 +701,9 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfAtIndex( - register CONST char *src, /* The UTF-8 string. */ + register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch; @@ -771,7 +743,7 @@ Tcl_UtfAtIndex( int Tcl_UtfBackslash( - CONST char *src, /* Points to the backslash character of a + const char *src, /* Points to the backslash character of a * backslash sequence. */ int *readPtr, /* Fill in with number of characters read from * src, unless NULL. */ @@ -983,8 +955,8 @@ Tcl_UtfToTitle( int TclpUtfNcmp2( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct, /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ unsigned long numBytes) /* Number of *bytes* to compare. */ { /* @@ -1030,8 +1002,8 @@ TclpUtfNcmp2( int Tcl_UtfNcmp( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct, /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; @@ -1078,8 +1050,8 @@ Tcl_UtfNcmp( int Tcl_UtfNcasecmp( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct, /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; @@ -1122,8 +1094,8 @@ Tcl_UtfNcasecmp( int TclUtfCasecmp( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct) /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ { while (*cs && *ct) { Tcl_UniChar ch1, ch2; @@ -1252,7 +1224,7 @@ Tcl_UniCharToTitle( int Tcl_UniCharLen( - CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */ + const Tcl_UniChar *uniStr) /* Unicode string to find length of. */ { int len = 0; @@ -1282,8 +1254,8 @@ Tcl_UniCharLen( int Tcl_UniCharNcmp( - CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { #ifdef WORDS_BIGENDIAN @@ -1327,8 +1299,8 @@ Tcl_UniCharNcmp( int Tcl_UniCharNcasecmp( - CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { for ( ; numChars != 0; numChars--, ucs++, uct++) { @@ -1364,6 +1336,11 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } @@ -1387,6 +1364,11 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return ((ALPHA_BITS >> GetCategory(ch)) & 1); } @@ -1410,6 +1392,18 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + ch &= 0x1fffff; + if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) { + return 1; + } + if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) { + return 1; + } + return 0; + } +#endif return ((CONTROL_BITS >> GetCategory(ch)) & 1); } @@ -1433,6 +1427,11 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } @@ -1456,6 +1455,12 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + ch &= 0x1fffff; + return (ch >= 0xe0100) && (ch <= 0xe01ef); + } +#endif return ((GRAPH_BITS >> GetCategory(ch)) & 1); } @@ -1479,6 +1484,11 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (GetCategory(ch) == LOWERCASE_LETTER); } @@ -1502,6 +1512,12 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + ch &= 0x1fffff; + return (ch >= 0xe0100) && (ch <= 0xe01ef); + } +#endif return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } @@ -1525,6 +1541,11 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return ((PUNCT_BITS >> GetCategory(ch)) & 1); } @@ -1548,14 +1569,27 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + /* Ignore upper 11 bits. */ + ch &= 0x1fffff; +#else + /* Ignore upper 16 bits. */ + ch &= 0xffff; +#endif + /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { + if (ch < 0x80) { return TclIsSpaceProc((char) ch); - } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) { +#if TCL_UTF_MAX > 3 + } else if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; +#endif + } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b + || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); @@ -1582,6 +1616,11 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (GetCategory(ch) == UPPERCASE_LETTER); } @@ -1605,6 +1644,11 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return ((WORD_BITS >> GetCategory(ch)) & 1); } @@ -1633,8 +1677,8 @@ Tcl_UniCharIsWordChar( int Tcl_UniCharCaseMatch( - CONST Tcl_UniChar *uniStr, /* Unicode String. */ - CONST Tcl_UniChar *uniPattern, + const Tcl_UniChar *uniStr, /* Unicode String. */ + const Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ @@ -1821,14 +1865,14 @@ Tcl_UniCharCaseMatch( int TclUniCharMatch( - CONST Tcl_UniChar *string, /* Unicode String. */ + const Tcl_UniChar *string, /* Unicode String. */ int strLen, /* Length of String */ - CONST Tcl_UniChar *pattern, /* Pattern, which may contain special + const Tcl_UniChar *pattern, /* Pattern, which may contain special * characters. */ int ptnLen, /* Length of Pattern */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - CONST Tcl_UniChar *stringEnd, *patternEnd; + const Tcl_UniChar *stringEnd, *patternEnd; Tcl_UniChar p; stringEnd = string + strLen; |