summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c91
1 files changed, 52 insertions, 39 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index b33bf6a..b13ad75 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -75,11 +75,17 @@ static const unsigned char totalBytes[256] = {
#endif
1,1,1,1,1,1,1,1
};
+
+/*
+ * Functions used only in this module.
+ */
+
+static int UtfCount(int ch);
/*
*---------------------------------------------------------------------------
*
- * TclUtfCount --
+ * UtfCount --
*
* Find the number of bytes in the Utf character "ch".
*
@@ -92,8 +98,8 @@ static const unsigned char totalBytes[256] = {
*---------------------------------------------------------------------------
*/
-int
-TclUtfCount(
+INLINE static int
+UtfCount(
int ch) /* The Tcl_UniChar whose size is returned. */
{
if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
@@ -103,7 +109,7 @@ TclUtfCount(
return 2;
}
#if TCL_UTF_MAX > 3
- if (((unsigned)(ch - 0x10000) <= 0xfffff)) {
+ if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
return 4;
}
#endif
@@ -128,7 +134,7 @@ TclUtfCount(
*---------------------------------------------------------------------------
*/
-int
+INLINE int
Tcl_UniCharToUtf(
int ch, /* The Tcl_UniChar to be stored in the
* buffer. */
@@ -292,7 +298,9 @@ Tcl_UtfToUniChar(
*/
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
- return 2;
+ if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
+ return 2;
+ }
}
/*
@@ -307,7 +315,9 @@ Tcl_UtfToUniChar(
*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
| ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
- return 3;
+ if (*chPtr > 0x7FF) {
+ return 3;
+ }
}
/*
@@ -322,13 +332,15 @@ Tcl_UtfToUniChar(
* Four-byte-character lead byte followed by three trail bytes.
*/
- *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
+ *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
- return 4;
+ if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
+ return 4;
+ }
}
/*
- * A three-byte-character lead-byte not followed by two trail-bytes
+ * A four-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
}
@@ -453,7 +465,6 @@ Tcl_NumUtfChars(
* for strlen(string). */
{
Tcl_UniChar ch;
- register Tcl_UniChar *chPtr = &ch;
register int i;
/*
@@ -466,23 +477,25 @@ Tcl_NumUtfChars(
i = 0;
if (length < 0) {
while (*src != '\0') {
- src += TclUtfToUniChar(src, chPtr);
+ src += TclUtfToUniChar(src, &ch);
i++;
}
+ if (i < 0) i = INT_MAX; /* Bug [2738427] */
} else {
- register int n;
-
- while (length > 0) {
- if (UCHAR(*src) < 0xC0) {
- length--;
- src++;
- } else {
- n = Tcl_UtfToUniChar(src, chPtr);
- length -= n;
- src += n;
- }
+ register const char *endPtr = src + length - TCL_UTF_MAX;
+
+ while (src < endPtr) {
+ src += TclUtfToUniChar(src, &ch);
+ i++;
+ }
+ endPtr += TCL_UTF_MAX;
+ while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
+ src += TclUtfToUniChar(src, &ch);
i++;
}
+ if (src < endPtr) {
+ i += endPtr - src;
+ }
}
return i;
}
@@ -803,7 +816,7 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if (bytes < TclUtfCount(upChar)) {
+ if (bytes < UtfCount(upChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -856,7 +869,7 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if (bytes < TclUtfCount(lowChar)) {
+ if (bytes < UtfCount(lowChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -906,7 +919,7 @@ Tcl_UtfToTitle(
bytes = TclUtfToUniChar(src, &ch);
titleChar = Tcl_UniCharToTitle(ch);
- if (bytes < TclUtfCount(titleChar)) {
+ if (bytes < UtfCount(titleChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -918,7 +931,7 @@ Tcl_UtfToTitle(
bytes = TclUtfToUniChar(src, &ch);
lowChar = Tcl_UniCharToLower(ch);
- if (bytes < TclUtfCount(lowChar)) {
+ if (bytes < UtfCount(lowChar)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -1004,7 +1017,7 @@ Tcl_UtfNcmp(
/*
* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
- * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
+ * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
* (the byte 0x01.)
*/
@@ -1388,11 +1401,11 @@ Tcl_UniCharIsControl(
{
#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) {
+ ch &= 0x1FFFFF;
+ if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) {
return 1;
}
- if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) {
+ if ((ch >= 0xF0000) && ((ch & 0xFFFF) <= 0xFFFD)) {
return 1;
}
return 0;
@@ -1451,8 +1464,8 @@ Tcl_UniCharIsGraph(
{
#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ ch &= 0x1FFFFF;
+ return (ch >= 0xE0100) && (ch <= 0xE01EF);
}
#endif
return ((GRAPH_BITS >> GetCategory(ch)) & 1);
@@ -1508,8 +1521,8 @@ Tcl_UniCharIsPrint(
{
#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ ch &= 0x1FFFFF;
+ return (ch >= 0xE0100) && (ch <= 0xE01EF);
}
#endif
return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
@@ -1565,10 +1578,10 @@ Tcl_UniCharIsSpace(
{
#if TCL_UTF_MAX > 3
/* Ignore upper 11 bits. */
- ch &= 0x1fffff;
+ ch &= 0x1FFFFF;
#else
/* Ignore upper 16 bits. */
- ch &= 0xffff;
+ ch &= 0xFFFF;
#endif
/*
@@ -1582,8 +1595,8 @@ Tcl_UniCharIsSpace(
} else if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
#endif
- } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b
- || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) {
+ } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
+ || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);