summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c154
1 files changed, 66 insertions, 88 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e5497a4..a405367 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -73,16 +73,7 @@ static CONST unsigned char totalBytes[256] = {
#else
1,1,1,1,1,1,1,1,
#endif
-#if TCL_UTF_MAX > 4
- 5,5,5,5,
-#else
- 1,1,1,1,
-#endif
-#if TCL_UTF_MAX > 5
- 6,6,6,6
-#else
- 1,1,1,1
-#endif
+ 1,1,1,1,1,1,1,1
};
/*
@@ -111,25 +102,16 @@ INLINE static int
UtfCount(
int ch) /* The Tcl_UniChar whose size is returned. */
{
- if ((ch > 0) && (ch < UNICODE_SELF)) {
+ if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
return 1;
}
if (ch <= 0x7FF) {
return 2;
}
- if (ch <= 0xFFFF) {
- return 3;
- }
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
return 4;
}
- if (ch <= 0x3FFFFFF) {
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- return 6;
- }
#endif
return 3;
}
@@ -161,7 +143,7 @@ Tcl_UniCharToUtf(
* large enough to hold the UTF-8 character
* (at most TCL_UTF_MAX bytes). */
{
- if ((ch > 0) && (ch < UNICODE_SELF)) {
+ if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
buf[0] = (char) ch;
return 1;
}
@@ -172,43 +154,43 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
- three:
- buf[2] = (char) ((ch | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 12) | 0xE0);
- return 3;
+#if TCL_UTF_MAX == 4
+ if ((ch & 0xF800) == 0xD800) {
+ if (ch & 0x0400) {
+ /* Low surrogate */
+ buf[3] = (char) ((ch | 0x80) & 0xBF);
+ buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
+ return 4;
+ } else {
+ /* High surrogate */
+ ch += 0x40;
+ buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
+ buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
+ buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+ return 0;
+ }
+ }
+#endif
+ goto three;
}
#if TCL_UTF_MAX > 3
- if (ch <= 0x1FFFFF) {
+ if (ch <= 0x10FFFF) {
buf[3] = (char) ((ch | 0x80) & 0xBF);
buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
- if (ch <= 0x3FFFFFF) {
- buf[4] = (char) ((ch | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 24) | 0xF8);
- return 5;
- }
- if (ch <= 0x7FFFFFFF) {
- buf[5] = (char) ((ch | 0x80) & 0xBF);
- buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
- buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
- buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
- buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
- buf[0] = (char) ((ch >> 30) | 0xFC);
- return 6;
- }
#endif
}
ch = 0xFFFD;
- goto three;
+three:
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
}
/*
@@ -316,16 +298,15 @@ Tcl_UtfToUniChar(
*/
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
- return 2;
+ if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
+ return 2;
+ }
}
/*
* A two-byte-character lead-byte not followed by trail-byte
* represents itself.
*/
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
} else if (byte < 0xF0) {
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
@@ -334,38 +315,34 @@ Tcl_UtfToUniChar(
*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
| ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
- return 3;
+ if (*chPtr > 0x7ff) {
+ return 3;
+ }
}
/*
* A three-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
-
- *chPtr = (Tcl_UniChar) byte;
- return 1;
}
#if TCL_UTF_MAX > 3
- {
- int ch, total, trail;
-
- total = totalBytes[byte];
- trail = total - 1;
- if (trail > 0) {
- ch = byte & (0x3F >> trail);
- do {
- src++;
- if ((*src & 0xC0) != 0x80) {
- *chPtr = byte;
- return 1;
- }
- ch <<= 6;
- ch |= (*src & 0x3F);
- trail--;
- } while (trail > 0);
- *chPtr = ch;
- return total;
+ else if (byte < 0xF8) {
+ if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+ /*
+ * Four-byte-character lead byte followed by three trail bytes.
+ */
+
+ *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+ if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
+ return 4;
+ }
}
+
+ /*
+ * A four-byte-character lead-byte not followed by two trail-bytes
+ * represents itself.
+ */
}
#endif
@@ -487,7 +464,6 @@ Tcl_NumUtfChars(
* for strlen(string). */
{
Tcl_UniChar ch;
- register Tcl_UniChar *chPtr = &ch;
register int i;
/*
@@ -500,23 +476,25 @@ Tcl_NumUtfChars(
i = 0;
if (length < 0) {
while (*src != '\0') {
- src += TclUtfToUniChar(src, chPtr);
+ src += TclUtfToUniChar(src, &ch);
i++;
}
+ if (i < 0) i = INT_MAX; /* Bug [2738427] */
} else {
- register int n;
-
- while (length > 0) {
- if (UCHAR(*src) < 0xC0) {
- length--;
- src++;
- } else {
- n = Tcl_UtfToUniChar(src, chPtr);
- length -= n;
- src += n;
- }
+ register const char *endPtr = src + length - TCL_UTF_MAX;
+
+ while (src < endPtr) {
+ src += TclUtfToUniChar(src, &ch);
+ i++;
+ }
+ endPtr += TCL_UTF_MAX;
+ while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
+ src += TclUtfToUniChar(src, &ch);
i++;
}
+ if (src < endPtr) {
+ i += endPtr - src;
+ }
}
return i;
}
@@ -1038,7 +1016,7 @@ Tcl_UtfNcmp(
/*
* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
- * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
+ * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
* (the byte 0x01.)
*/
@@ -1555,7 +1533,7 @@ Tcl_UniCharIsSpace(
if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
return TclIsSpaceProc((char) ch);
- } else if ((Tcl_UniChar) ch == 0x180e || (Tcl_UniChar) ch == 0x202f) {
+ } else if ((Tcl_UniChar) ch == 0x180E || (Tcl_UniChar) ch == 0x202F) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);