From a18e29f96b7b7a0699ed4d5df433b0b0660341e3 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 1 Sep 2015 18:54:58 +0000 Subject: Various Unicode handling enhancements, when building with TCL_UTF_MAX > 3, inspired by androwish. No effect if TCL_UTF_MAX=3 (which is the default) --- generic/tclUniData.c | 6 ++- generic/tclUtf.c | 125 ++++++++++++++++++++++++++++++++++++++------------- tools/uniParse.tcl | 6 ++- 3 files changed, 103 insertions(+), 34 deletions(-) diff --git a/generic/tclUniData.c b/generic/tclUniData.c index d2f66fe..1ca119d 100644 --- a/generic/tclUniData.c +++ b/generic/tclUniData.c @@ -1556,4 +1556,8 @@ enum { * Unicode character tables. */ -#define GetUniCharInfo(ch) (groups[groupMap[pageMap[((ch) & 0xffff) >> OFFSET_BITS] | ((ch) & ((1 << OFFSET_BITS)-1))]]) +#if TCL_UTF_MAX > 3 +# define GetUniCharInfo(ch) (groups[groupMap[pageMap[((ch) & 0x1fffff) >> OFFSET_BITS] | ((ch) & ((1 << OFFSET_BITS)-1))]]) +#else +# define GetUniCharInfo(ch) (groups[groupMap[pageMap[((ch) & 0xffff) >> OFFSET_BITS] | ((ch) & ((1 << OFFSET_BITS)-1))]]) +#endif diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 15529c7..b878149 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -117,19 +117,10 @@ UtfCount( if (ch <= 0x7FF) { return 2; } - if (ch <= 0xFFFF) { - return 3; - } #if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { + if ((ch > 0xFFFF) && (ch <= 0x10FFFF)) { return 4; } - if (ch <= 0x3FFFFFF) { - return 5; - } - if (ch <= 0x7FFFFFFF) { - return 6; - } #endif return 3; } @@ -172,6 +163,23 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { +#if TCL_UTF_MAX == 4 + if ((ch & 0xF800) == 0xD800) { + if (ch & 0x0400) { + /* Low surrogate */ + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); + return 4; + } else { + /* High surrogate */ + ch += 0x40; + buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); + buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); + buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + return 0; + } + } +#endif three: buf[2] = (char) ((ch | 0x80) & 0xBF); buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); @@ -180,30 +188,13 @@ Tcl_UniCharToUtf( } #if TCL_UTF_MAX > 3 - if (ch <= 0x1FFFFF) { + if (ch <= 0x10FFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } - if (ch <= 0x3FFFFFF) { - buf[4] = (char) ((ch | 0x80) & 0xBF); - buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 24) | 0xF8); - return 5; - } - if (ch <= 0x7FFFFFFF) { - buf[5] = (char) ((ch | 0x80) & 0xBF); - buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); - buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); - buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); - buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); - buf[0] = (char) ((ch >> 30) | 0xFC); - return 6; - } #endif } @@ -1365,6 +1356,11 @@ int Tcl_UniCharIsAlnum( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } @@ -1388,6 +1384,11 @@ int Tcl_UniCharIsAlpha( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return ((ALPHA_BITS >> GetCategory(ch)) & 1); } @@ -1411,6 +1412,18 @@ int Tcl_UniCharIsControl( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + ch &= 0x1fffff; + if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) { + return 1; + } + if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) { + return 1; + } + return 0; + } +#endif return ((CONTROL_BITS >> GetCategory(ch)) & 1); } @@ -1434,6 +1447,11 @@ int Tcl_UniCharIsDigit( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } @@ -1457,6 +1475,12 @@ int Tcl_UniCharIsGraph( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + ch &= 0x1fffff; + return (ch >= 0xe0100) && (ch <= 0xe01ef); + } +#endif return ((GRAPH_BITS >> GetCategory(ch)) & 1); } @@ -1480,6 +1504,11 @@ int Tcl_UniCharIsLower( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (GetCategory(ch) == LOWERCASE_LETTER); } @@ -1503,6 +1532,12 @@ int Tcl_UniCharIsPrint( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + ch &= 0x1fffff; + return (ch >= 0xe0100) && (ch <= 0xe01ef); + } +#endif return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } @@ -1526,6 +1561,11 @@ int Tcl_UniCharIsPunct( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return ((PUNCT_BITS >> GetCategory(ch)) & 1); } @@ -1549,16 +1589,27 @@ int Tcl_UniCharIsSpace( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + /* Ignore upper 11 bits. */ + ch &= 0x1fffff; +#else + /* Ignore upper 16 bits. */ + ch &= 0xffff; +#endif + /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ - if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { + if (ch < 0x80) { return TclIsSpaceProc((char) ch); - } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e - || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060 - || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) { +#if TCL_UTF_MAX > 3 + } else if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; +#endif + } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b + || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); @@ -1585,6 +1636,11 @@ int Tcl_UniCharIsUpper( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return (GetCategory(ch) == UPPERCASE_LETTER); } @@ -1608,6 +1664,11 @@ int Tcl_UniCharIsWordChar( int ch) /* Unicode character to test. */ { +#if TCL_UTF_MAX > 3 + if (UNICODE_OUT_OF_RANGE(ch)) { + return 0; + } +#endif return ((WORD_BITS >> GetCategory(ch)) & 1); } diff --git a/tools/uniParse.tcl b/tools/uniParse.tcl index e33b3c7..8125790 100644 --- a/tools/uniParse.tcl +++ b/tools/uniParse.tcl @@ -396,7 +396,11 @@ enum { * Unicode character tables. */ -#define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) +#if TCL_UTF_MAX > 3 +# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0x1fffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) +#else +# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) +#endif " close $f -- cgit v0.12