summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2017-06-06 09:02:28 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2017-06-06 09:02:28 (GMT)
commit829350cc1a1b9d065ccda17583ee55c731598f13 (patch)
tree343eb18a047fe870681ccaed606fbefaba313b49 /generic/tclUtf.c
parentab808831bd7d76198dfc874049b9fd00b009646e (diff)
downloadtcl-829350cc1a1b9d065ccda17583ee55c731598f13.zip
tcl-829350cc1a1b9d065ccda17583ee55c731598f13.tar.gz
tcl-829350cc1a1b9d065ccda17583ee55c731598f13.tar.bz2
[67aa9a2070] Tcl_UtfToUniChar returns single byte for invalid UTF-8 input as documented.
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c127
1 files changed, 52 insertions, 75 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e5497a4..3937141 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -73,16 +73,7 @@ static CONST unsigned char totalBytes[256] = {
#else
1,1,1,1,1,1,1,1,
#endif
-#if TCL_UTF_MAX > 4
- 5,5,5,5,
-#else
- 1,1,1,1,
-#endif
-#if TCL_UTF_MAX > 5
- 6,6,6,6
-#else
- 1,1,1,1
-#endif
+ 1,1,1,1,1,1,1,1
};
/*
@@ -111,25 +102,16 @@ INLINE static int
UtfCount(
int ch) /* The Tcl_UniChar whose size is returned. */
{
- if ((ch > 0) && (ch < UNICODE_SELF)) {
+ if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
return 1;
}
if (ch <= 0x7FF) {
return 2;
}
- if (ch <= 0xFFFF) {
- return 3;
- }
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
return 4;
}
- if (ch <= 0x3FFFFFF) {
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- return 6;
- }
#endif
return 3;
}
@@ -161,7 +143,7 @@ Tcl_UniCharToUtf(
* large enough to hold the UTF-8 character
* (at most TCL_UTF_MAX bytes). */
{
- if ((ch > 0) && (ch < UNICODE_SELF)) {
+ if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
buf[0] = (char) ch;
return 1;
}
@@ -172,43 +154,43 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
- three:
- buf[2] = (char) ((ch | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 12) | 0xE0);
- return 3;
+#if TCL_UTF_MAX == 4
+ if ((ch & 0xF800) == 0xD800) {
+ if (ch & 0x0400) {
+ /* Low surrogate */
+ buf[3] = (char) ((ch | 0x80) & 0xBF);
+ buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
+ return 4;
+ } else {
+ /* High surrogate */
+ ch += 0x40;
+ buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
+ buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
+ buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+ return 0;
+ }
+ }
+#endif
+ goto three;
}
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if (ch <= 0x10FFFF) {
buf[3] = (char) ((ch | 0x80) & 0xBF);
buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
- if (ch <= 0x3FFFFFF) {
- buf[4] = (char) ((ch | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 24) | 0xF8);
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- buf[5] = (char) ((ch | 0x80) & 0xBF);
- buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 30) | 0xFC);
- return 6;
- }
#endif
}
ch = 0xFFFD;
- goto three;
+three:
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
}
/*
@@ -316,16 +298,15 @@ Tcl_UtfToUniChar(
*/
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
- return 2;
+ if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
+ return 2;
+ }
}
/*
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
} else if (byte < 0xF0) {
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
@@ -334,38 +315,34 @@ Tcl_UtfToUniChar(
*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
| ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
- return 3;
+ if (*chPtr > 0x7ff) {
+ return 3;
+ }
}
/*
* A three-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
}
#if TCL_UTF_MAX > 3
- {
- int ch, total, trail;
-
- total = totalBytes[byte];
- trail = total - 1;
- if (trail > 0) {
- ch = byte & (0x3F >> trail);
- do {
- src++;
- if ((*src & 0xC0) != 0x80) {
- *chPtr = byte;
- return 1;
- }
- ch <<= 6;
- ch |= (*src & 0x3F);
- trail--;
- } while (trail > 0);
- *chPtr = ch;
- return total;
+ else if (byte < 0xF8) {
+ if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+ /*
+ * Four-byte-character lead byte followed by three trail bytes.
+ */
+
+ *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+ if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
+ return 4;
+ }
}
+
+ /*
+ * A four-byte-character lead-byte not followed by two trail-bytes
+ * represents itself.
+ */
}
#endif
@@ -1038,7 +1015,7 @@ Tcl_UtfNcmp(
/*
* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
- * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
+ * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
* (the byte 0x01.)
*/
@@ -1555,7 +1532,7 @@ Tcl_UniCharIsSpace(
if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
return TclIsSpaceProc((char) ch);
- } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) {
+ } else if ((Tcl_UniChar) ch == 0x180E || (Tcl_UniChar) ch == 0x202F) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);