diff options
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 208 |
1 files changed, 136 insertions, 72 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index e5497a4..b878149 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -59,7 +59,7 @@ * UTF-8. */ -static CONST unsigned char totalBytes[256] = { +static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -117,19 +117,10 @@ UtfCount( if (ch <= 0x7FF) { return 2; } - if (ch <= 0xFFFF) { - return 3; - } #if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { + if ((ch > 0xFFFF) && (ch <= 0x10FFFF)) { return 4; } - if (ch <= 0x3FFFFFF) { - return 5; - } - if (ch <= 0x7FFFFFFF) { - return 6; - } #endif return 3; } @@ -172,6 +163,23 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { +#if TCL_UTF_MAX == 4 + if ((ch & 0xF800) == 0xD800) { + if (ch & 0x0400) { + /* Low surrogate */ + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); + return 4; + } else { + /* High surrogate */ + ch += 0x40; + buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); + buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); + buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + return 0; + } + } +#endif three: buf[2] = (char) ((ch | 0x80) & 0xBF); buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); @@ -180,30 +188,13 @@ Tcl_UniCharToUtf( } #if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { + if (ch <= 0x10FFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } - if (ch <= 0x3FFFFFF) { - buf[4] = (char) ((ch | 0x80) & 0xBF); - buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 24) | 0xF8); - return 5; - } - if (ch <= 0x7FFFFFFF) { - buf[5] = (char) ((ch | 0x80) & 0xBF); - buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 30) | 0xFC); - return 6; - } #endif } @@ -231,13 +222,13 @@ Tcl_UniCharToUtf( char * Tcl_UniCharToUtfDString( - CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ int uniLength, /* Length of Unicode string in Tcl_UniChars * (must be >= 0). */ Tcl_DString *dsPtr) /* UTF-8 representation of string is appended * to this previously initialized DString. */ { - CONST Tcl_UniChar *w, *wEnd; + const Tcl_UniChar *w, *wEnd; char *p, *string; int oldLength; @@ -289,7 +280,7 @@ Tcl_UniCharToUtfDString( int Tcl_UtfToUniChar( - register CONST char *src, /* The UTF-8 string. */ + register const char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { @@ -393,7 +384,7 @@ Tcl_UtfToUniChar( Tcl_UniChar * Tcl_UtfToUniCharDString( - CONST char *src, /* UTF-8 string to convert to Unicode. */ + const char *src, /* UTF-8 string to convert to Unicode. */ int length, /* Length of UTF-8 string in bytes, or -1 for * strlen(). */ Tcl_DString *dsPtr) /* Unicode representation of string is @@ -401,7 +392,7 @@ Tcl_UtfToUniCharDString( * DString. */ { Tcl_UniChar *w, *wString; - CONST char *p, *end; + const char *p, *end; int oldLength; if (length < 0) { @@ -414,6 +405,7 @@ Tcl_UtfToUniCharDString( */ oldLength = Tcl_DStringLength(dsPtr); +/* TODO: fix overreach! */ Tcl_DStringSetLength(dsPtr, (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); @@ -452,7 +444,7 @@ Tcl_UtfToUniCharDString( int Tcl_UtfCharComplete( - CONST char *src, /* String to check if first few bytes contain + const char *src, /* String to check if first few bytes contain * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { @@ -482,7 +474,7 @@ Tcl_UtfCharComplete( int Tcl_NumUtfChars( - register CONST char *src, /* The UTF-8 string to measure. */ + register const char *src, /* The UTF-8 string to measure. */ int length) /* The length of the string in bytes, or -1 * for strlen(string). */ { @@ -540,9 +532,9 @@ Tcl_NumUtfChars( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfFindFirst( - CONST char *src, /* The UTF-8 string to be searched. */ + const char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; @@ -579,14 +571,14 @@ Tcl_UtfFindFirst( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfFindLast( - CONST char *src, /* The UTF-8 string to be searched. */ + const char *src, /* The UTF-8 string to be searched. */ int ch) /* The Tcl_UniChar to search for. */ { int len; Tcl_UniChar find; - CONST char *last; + const char *last; last = NULL; while (1) { @@ -621,9 +613,9 @@ Tcl_UtfFindLast( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfNext( - CONST char *src) /* The current location in the string. */ + const char *src) /* The current location in the string. */ { Tcl_UniChar ch; @@ -651,13 +643,13 @@ Tcl_UtfNext( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfPrev( - CONST char *src, /* The current location in the string. */ - CONST char *start) /* Pointer to the beginning of the string, to + const char *src, /* The current location in the string. */ + const char *start) /* Pointer to the beginning of the string, to * avoid going backwards too far. */ { - CONST char *look; + const char *look; int i, byte; src--; @@ -700,10 +692,10 @@ Tcl_UtfPrev( Tcl_UniChar Tcl_UniCharAtIndex( - register CONST char *src, /* The UTF-8 string to dereference. */ + register const char *src, /* The UTF-8 string to dereference. */ register int index) /* The position of the desired character. */ { - Tcl_UniChar ch; + Tcl_UniChar ch = 0; while (index >= 0) { index--; @@ -729,9 +721,9 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ -CONST char * +const char * Tcl_UtfAtIndex( - register CONST char *src, /* The UTF-8 string. */ + register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch; @@ -771,7 +763,7 @@ Tcl_UtfAtIndex( int Tcl_UtfBackslash( - CONST char *src, /* Points to the backslash character of a + const char *src, /* Points to the backslash character of a * backslash sequence. */ int *readPtr, /* Fill in with number of characters read from * src, unless NULL. */ @@ -983,8 +975,8 @@ Tcl_UtfToTitle( int TclpUtfNcmp2( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct, /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ unsigned long numBytes) /* Number of *bytes* to compare. */ { /* @@ -1030,8 +1022,8 @@ TclpUtfNcmp2( int Tcl_UtfNcmp( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct, /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; @@ -1078,8 +1070,8 @@ Tcl_UtfNcmp( int Tcl_UtfNcasecmp( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct, /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct, /* UTF string cs is compared to. */ unsigned long numChars) /* Number of UTF chars to compare. */ { Tcl_UniChar ch1, ch2; @@ -1122,8 +1114,8 @@ Tcl_UtfNcasecmp( int TclUtfCasecmp( - CONST char *cs, /* UTF string to compare to ct. */ - CONST char *ct) /* UTF string cs is compared to. */ + const char *cs, /* UTF string to compare to ct. */ + const char *ct) /* UTF string cs is compared to. */ { while (*cs && *ct) { Tcl_UniChar ch1, ch2; @@ -1252,7 +1244,7 @@ Tcl_UniCharToTitle( int Tcl_UniCharLen( - CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */ + const Tcl_UniChar *uniStr) /* Unicode string to find length of. */ { int len = 0; @@ -1282,8 +1274,8 @@ Tcl_UniCharLen( int Tcl_UniCharNcmp( - CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { #ifdef WORDS_BIGENDIAN @@ -1327,8 +1319,8 @@ Tcl_UniCharNcmp( int Tcl_UniCharNcasecmp( - CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ - CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ unsigned long numChars) /* Number of unichars to compare. */ { for ( ; numChars != 0; numChars--, ucs++, uct++) { @@ -1364,6 +1356,11 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } @@ -1387,6 +1384,11 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return ((ALPHA_BITS >> GetCategory(ch)) & 1); } @@ -1410,6 +1412,18 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + ch &= 0x1fffff; + if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) { + return 1; + } + if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) { + return 1; + } + return 0; + } +#endif return ((CONTROL_BITS >> GetCategory(ch)) & 1); } @@ -1433,6 +1447,11 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } @@ -1456,6 +1475,12 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + ch &= 0x1fffff; + return (ch >= 0xe0100) && (ch <= 0xe01ef); + } +#endif return ((GRAPH_BITS >> GetCategory(ch)) & 1); } @@ -1479,6 +1504,11 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (GetCategory(ch) == LOWERCASE_LETTER); } @@ -1502,6 +1532,12 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + ch &= 0x1fffff; + return (ch >= 0xe0100) && (ch <= 0xe01ef); + } +#endif return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } @@ -1525,6 +1561,11 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return ((PUNCT_BITS >> GetCategory(ch)) & 1); } @@ -1548,14 +1589,27 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + /* Ignore upper 11 bits. */ + ch &= 0x1fffff; +#else + /* Ignore upper 16 bits. */ + ch &= 0xffff; +#endif + /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { + if (ch < 0x80) { return TclIsSpaceProc((char) ch); - } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) { +#if TCL_UTF_MAX > 3 + } else if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; +#endif + } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b + || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); @@ -1582,6 +1636,11 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (GetCategory(ch) == UPPERCASE_LETTER); } @@ -1605,6 +1664,11 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return ((WORD_BITS >> GetCategory(ch)) & 1); } @@ -1633,8 +1697,8 @@ Tcl_UniCharIsWordChar( int Tcl_UniCharCaseMatch( - CONST Tcl_UniChar *uniStr, /* Unicode String. */ - CONST Tcl_UniChar *uniPattern, + const Tcl_UniChar *uniStr, /* Unicode String. */ + const Tcl_UniChar *uniPattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ @@ -1821,14 +1885,14 @@ Tcl_UniCharCaseMatch( int TclUniCharMatch( - CONST Tcl_UniChar *string, /* Unicode String. */ + const Tcl_UniChar *string, /* Unicode String. */ int strLen, /* Length of String */ - CONST Tcl_UniChar *pattern, /* Pattern, which may contain special + const Tcl_UniChar *pattern, /* Pattern, which may contain special * characters. */ int ptnLen, /* Length of Pattern */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - CONST Tcl_UniChar *stringEnd, *patternEnd; + const Tcl_UniChar *stringEnd, *patternEnd; Tcl_UniChar p; stringEnd = string + strLen; |