summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2015-09-01 18:54:58 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2015-09-01 18:54:58 (GMT)
commita18e29f96b7b7a0699ed4d5df433b0b0660341e3 (patch)
tree2fc69d530f862f96aedd1ffe37b39f4153ccf273
parent67acc8945524908e6b8e9e1f8b1f949d3a146e11 (diff)
downloadtcl-a18e29f96b7b7a0699ed4d5df433b0b0660341e3.zip
tcl-a18e29f96b7b7a0699ed4d5df433b0b0660341e3.tar.gz
tcl-a18e29f96b7b7a0699ed4d5df433b0b0660341e3.tar.bz2
Various Unicode handling enhancements, when building with TCL_UTF_MAX > 3, inspired by androwish. No effect if TCL_UTF_MAX=3 (which is the default)
-rw-r--r--generic/tclUniData.c6
-rw-r--r--generic/tclUtf.c125
-rw-r--r--tools/uniParse.tcl6
3 files changed, 103 insertions, 34 deletions
diff --git a/generic/tclUniData.c b/generic/tclUniData.c
index d2f66fe..1ca119d 100644
--- a/generic/tclUniData.c
+++ b/generic/tclUniData.c
@@ -1556,4 +1556,8 @@ enum {
* Unicode character tables.
*/
-#define GetUniCharInfo(ch) (groups[groupMap[pageMap[((ch) & 0xffff) >> OFFSET_BITS] | ((ch) & ((1 << OFFSET_BITS)-1))]])
+#if TCL_UTF_MAX > 3
+# define GetUniCharInfo(ch) (groups[groupMap[pageMap[((ch) & 0x1fffff) >> OFFSET_BITS] | ((ch) & ((1 << OFFSET_BITS)-1))]])
+#else
+# define GetUniCharInfo(ch) (groups[groupMap[pageMap[((ch) & 0xffff) >> OFFSET_BITS] | ((ch) & ((1 << OFFSET_BITS)-1))]])
+#endif
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 15529c7..b878149 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -117,19 +117,10 @@ UtfCount(
if (ch <= 0x7FF) {
return 2;
}
- if (ch <= 0xFFFF) {
- return 3;
- }
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if ((ch > 0xFFFF) && (ch <= 0x10FFFF)) {
return 4;
}
- if (ch <= 0x3FFFFFF) {
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- return 6;
- }
#endif
return 3;
}
@@ -172,6 +163,23 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
+#if TCL_UTF_MAX == 4
+ if ((ch & 0xF800) == 0xD800) {
+ if (ch & 0x0400) {
+ /* Low surrogate */
+ buf[3] = (char) ((ch | 0x80) & 0xBF);
+ buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
+ return 4;
+ } else {
+ /* High surrogate */
+ ch += 0x40;
+ buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
+ buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
+ buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+ return 0;
+ }
+ }
+#endif
three:
buf[2] = (char) ((ch | 0x80) & 0xBF);
buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
@@ -180,30 +188,13 @@ Tcl_UniCharToUtf(
}
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if (ch <= 0x10FFFF) {
buf[3] = (char) ((ch | 0x80) & 0xBF);
buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
- if (ch <= 0x3FFFFFF) {
- buf[4] = (char) ((ch | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 24) | 0xF8);
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- buf[5] = (char) ((ch | 0x80) & 0xBF);
- buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 30) | 0xFC);
- return 6;
- }
#endif
}
@@ -1365,6 +1356,11 @@ int
Tcl_UniCharIsAlnum(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
@@ -1388,6 +1384,11 @@ int
Tcl_UniCharIsAlpha(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
@@ -1411,6 +1412,18 @@ int
Tcl_UniCharIsControl(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ ch &= 0x1fffff;
+ if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) {
+ return 1;
+ }
+ if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) {
+ return 1;
+ }
+ return 0;
+ }
+#endif
return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
@@ -1434,6 +1447,11 @@ int
Tcl_UniCharIsDigit(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
@@ -1457,6 +1475,12 @@ int
Tcl_UniCharIsGraph(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ ch &= 0x1fffff;
+ return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ }
+#endif
return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
@@ -1480,6 +1504,11 @@ int
Tcl_UniCharIsLower(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (GetCategory(ch) == LOWERCASE_LETTER);
}
@@ -1503,6 +1532,12 @@ int
Tcl_UniCharIsPrint(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ ch &= 0x1fffff;
+ return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ }
+#endif
return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
@@ -1526,6 +1561,11 @@ int
Tcl_UniCharIsPunct(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
@@ -1549,16 +1589,27 @@ int
Tcl_UniCharIsSpace(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ /* Ignore upper 11 bits. */
+ ch &= 0x1fffff;
+#else
+ /* Ignore upper 16 bits. */
+ ch &= 0xffff;
+#endif
+
/*
* If the character is within the first 127 characters, just use the
* standard C function, otherwise consult the Unicode table.
*/
- if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
+ if (ch < 0x80) {
return TclIsSpaceProc((char) ch);
- } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e
- || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060
- || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) {
+#if TCL_UTF_MAX > 3
+ } else if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+#endif
+ } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b
+ || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -1585,6 +1636,11 @@ int
Tcl_UniCharIsUpper(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return (GetCategory(ch) == UPPERCASE_LETTER);
}
@@ -1608,6 +1664,11 @@ int
Tcl_UniCharIsWordChar(
int ch) /* Unicode character to test. */
{
+#if TCL_UTF_MAX > 3
+ if (UNICODE_OUT_OF_RANGE(ch)) {
+ return 0;
+ }
+#endif
return ((WORD_BITS >> GetCategory(ch)) & 1);
}
diff --git a/tools/uniParse.tcl b/tools/uniParse.tcl
index e33b3c7..8125790 100644
--- a/tools/uniParse.tcl
+++ b/tools/uniParse.tcl
@@ -396,7 +396,11 @@ enum {
* Unicode character tables.
*/
-#define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
+#if TCL_UTF_MAX > 3
+# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0x1fffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
+#else
+# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xffff) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
+#endif
"
close $f