summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2016-08-30 13:00:32 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2016-08-30 13:00:32 (GMT)
commit3b2cb79ab18e9506dfb66e2786a7ee04dd30a780 (patch)
treecfec5955da6f587fe00bd8471120c07add91560c
parentf2115ca0855e6d7ed4c7f8a069b3abc054579390 (diff)
downloadtcl-3b2cb79ab18e9506dfb66e2786a7ee04dd30a780.zip
tcl-3b2cb79ab18e9506dfb66e2786a7ee04dd30a780.tar.gz
tcl-3b2cb79ab18e9506dfb66e2786a7ee04dd30a780.tar.bz2
Don't ever allow UTF-8 sequences of more than 4 characters to be generated or parsed, even when TCL_UTF_MAX>4: According to current Unicode standard, a byte string of >4 characters can never form a single UTF-8 character.
And a few minor micro-optimizations related to UTF-8 handling.
-rw-r--r--generic/tclUtf.c68
1 files changed, 24 insertions, 44 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index b878149..68119a4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -73,16 +73,7 @@ static const unsigned char totalBytes[256] = {
#else
1,1,1,1,1,1,1,1,
#endif
-#if TCL_UTF_MAX > 4
- 5,5,5,5,
-#else
- 1,1,1,1,
-#endif
-#if TCL_UTF_MAX > 5
- 6,6,6,6
-#else
- 1,1,1,1
-#endif
+ 1,1,1,1,1,1,1,1
};
/*
@@ -111,14 +102,14 @@ INLINE static int
UtfCount(
int ch) /* The Tcl_UniChar whose size is returned. */
{
- if ((ch > 0) && (ch < UNICODE_SELF)) {
+ if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
return 1;
}
if (ch <= 0x7FF) {
return 2;
}
#if TCL_UTF_MAX > 3
- if ((ch > 0xFFFF) && (ch <= 0x10FFFF)) {
+ if (((unsigned)(ch - 0x10000) <= 0xfffff)) {
return 4;
}
#endif
@@ -152,7 +143,7 @@ Tcl_UniCharToUtf(
* large enough to hold the UTF-8 character
* (at most TCL_UTF_MAX bytes). */
{
- if ((ch > 0) && (ch < UNICODE_SELF)) {
+ if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
buf[0] = (char) ch;
return 1;
}
@@ -180,11 +171,7 @@ Tcl_UniCharToUtf(
}
}
#endif
- three:
- buf[2] = (char) ((ch | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 12) | 0xE0);
- return 3;
+ goto three;
}
#if TCL_UTF_MAX > 3
@@ -199,7 +186,11 @@ Tcl_UniCharToUtf(
}
ch = 0xFFFD;
- goto three;
+three:
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
}
/*
@@ -314,9 +305,6 @@ Tcl_UtfToUniChar(
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
} else if (byte < 0xF0) {
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
@@ -332,31 +320,23 @@ Tcl_UtfToUniChar(
* A three-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
}
#if TCL_UTF_MAX > 3
- {
- int ch, total, trail;
-
- total = totalBytes[byte];
- trail = total - 1;
- if (trail > 0) {
- ch = byte & (0x3F >> trail);
- do {
- src++;
- if ((*src & 0xC0) != 0x80) {
- *chPtr = byte;
- return 1;
- }
- ch <<= 6;
- ch |= (*src & 0x3F);
- trail--;
- } while (trail > 0);
- *chPtr = ch;
- return total;
+ else if (byte < 0xF8) {
+ if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+ /*
+ * Four-byte-character lead byte followed by three trail bytes.
+ */
+
+ *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+ return 4;
}
+
+ /*
+ * A three-byte-character lead-byte not followed by two trail-bytes
+ * represents itself.
+ */
}
#endif