summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c145
1 files changed, 90 insertions, 55 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index b33bf6a..25cc2d1 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -103,7 +103,7 @@ TclUtfCount(
return 2;
}
#if TCL_UTF_MAX > 3
- if (((unsigned)(ch - 0x10000) <= 0xfffff)) {
+ if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
return 4;
}
#endif
@@ -253,6 +253,15 @@ Tcl_UniCharToUtfDString(
* Tcl_UtfCharComplete() before calling this routine to ensure that
* enough bytes remain in the string.
*
+ * If TCL_UTF_MAX == 4, special handling of Surrogate pairs is done:
+ * For any UTF-8 string containing a character outside of the BMP, the
+ * first call to this function will fill *chPtr with the high surrogate
+ * and generate a return value of 0. Calling Tcl_UtfToUniChar again
+ * will produce the low surrogate and a return value of 4. Because *chPtr
+ * is used to remember whether the high surrogate is already produced, it
+ * is recommended to initialize the variable it points to as 0 before
+ * the first call to Tcl_UtfToUniChar is done.
+ *
* Results:
* *chPtr is filled with the Tcl_UniChar, and the return value is the
* number of bytes from the UTF-8 string that were consumed.
@@ -272,7 +281,7 @@ Tcl_UtfToUniChar(
register int byte;
/*
- * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
+ * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
*/
byte = *((unsigned char *) src);
@@ -292,7 +301,9 @@ Tcl_UtfToUniChar(
*/
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
- return 2;
+ if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
+ return 2;
+ }
}
/*
@@ -307,7 +318,9 @@ Tcl_UtfToUniChar(
*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
| ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
- return 3;
+ if (*chPtr > 0x7FF) {
+ return 3;
+ }
}
/*
@@ -321,14 +334,34 @@ Tcl_UtfToUniChar(
/*
* Four-byte-character lead byte followed by three trail bytes.
*/
-
- *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
+#if TCL_UTF_MAX == 4
+ Tcl_UniChar surrogate;
+
+ byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
+ surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
+ if (byte & 0x100000) {
+ /* out of range, < 0x10000 or > 0x10ffff */
+ } else if (*chPtr != surrogate) {
+ /* produce high surrogate, but don't advance source pointer */
+ *chPtr = surrogate;
+ return 0;
+ } else {
+ /* produce low surrogate, and advance source pointer */
+ *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF));
+ return 4;
+ }
+#else
+ *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
- return 4;
+ if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
+ return 4;
+ }
+#endif
}
/*
- * A three-byte-character lead-byte not followed by two trail-bytes
+ * A four-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
}
@@ -365,7 +398,7 @@ Tcl_UtfToUniCharDString(
* appended to this previously initialized
* DString. */
{
- Tcl_UniChar *w, *wString;
+ Tcl_UniChar ch, *w, *wString;
const char *p, *end;
int oldLength;
@@ -387,8 +420,8 @@ Tcl_UtfToUniCharDString(
w = wString;
end = src + length;
for (p = src; p < end; ) {
- p += TclUtfToUniChar(p, w);
- w++;
+ p += TclUtfToUniChar(p, &ch);
+ *w++ = ch;
}
*w = '\0';
Tcl_DStringSetLength(dsPtr,
@@ -422,10 +455,7 @@ Tcl_UtfCharComplete(
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
- int ch;
-
- ch = *((unsigned char *) src);
- return length >= totalBytes[ch];
+ return length >= totalBytes[(unsigned char)*src];
}
/*
@@ -452,9 +482,8 @@ Tcl_NumUtfChars(
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
- Tcl_UniChar ch;
- register Tcl_UniChar *chPtr = &ch;
- register int i;
+ Tcl_UniChar ch = 0;
+ register int i = 0;
/*
* The separate implementations are faster.
@@ -463,26 +492,27 @@ Tcl_NumUtfChars(
* single-byte char case specially.
*/
- i = 0;
if (length < 0) {
while (*src != '\0') {
- src += TclUtfToUniChar(src, chPtr);
+ src += TclUtfToUniChar(src, &ch);
i++;
}
+ if (i < 0) i = INT_MAX; /* Bug [2738427] */
} else {
- register int n;
+ register const char *endPtr = src + length - TCL_UTF_MAX;
- while (length > 0) {
- if (UCHAR(*src) < 0xC0) {
- length--;
- src++;
- } else {
- n = Tcl_UtfToUniChar(src, chPtr);
- length -= n;
- src += n;
- }
+ while (src < endPtr) {
+ src += TclUtfToUniChar(src, &ch);
+ i++;
+ }
+ endPtr += TCL_UTF_MAX;
+ while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
+ src += TclUtfToUniChar(src, &ch);
i++;
}
+ if (src < endPtr) {
+ i += endPtr - src;
+ }
}
return i;
}
@@ -512,7 +542,7 @@ Tcl_UtfFindFirst(
int ch) /* The Tcl_UniChar to search for. */
{
int len;
- Tcl_UniChar find;
+ Tcl_UniChar find = 0;
while (1) {
len = TclUtfToUniChar(src, &find);
@@ -551,7 +581,7 @@ Tcl_UtfFindLast(
int ch) /* The Tcl_UniChar to search for. */
{
int len;
- Tcl_UniChar find;
+ Tcl_UniChar find = 0;
const char *last;
last = NULL;
@@ -591,9 +621,15 @@ const char *
Tcl_UtfNext(
const char *src) /* The current location in the string. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
+ int len = TclUtfToUniChar(src, &ch);
- return src + TclUtfToUniChar(src, &ch);
+#if TCL_UTF_MAX == 4
+ if (len == 0) {
+ len = TclUtfToUniChar(src, &ch);
+ }
+#endif
+ return src + len;
}
/*
@@ -626,8 +662,7 @@ Tcl_UtfPrev(
const char *look;
int i, byte;
- src--;
- look = src;
+ look = --src;
for (i = 0; i < TCL_UTF_MAX; i++) {
if (look < start) {
if (src < start) {
@@ -700,7 +735,7 @@ Tcl_UtfAtIndex(
register const char *src, /* The UTF-8 string. */
register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
while (index > 0) {
index--;
@@ -784,7 +819,7 @@ int
Tcl_UtfToUpper(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, upChar;
+ Tcl_UniChar ch = 0, upChar;
char *src, *dst;
int bytes;
@@ -837,7 +872,7 @@ int
Tcl_UtfToLower(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, lowChar;
+ Tcl_UniChar ch = 0, lowChar;
char *src, *dst;
int bytes;
@@ -891,7 +926,7 @@ int
Tcl_UtfToTitle(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, titleChar, lowChar;
+ Tcl_UniChar ch = 0, titleChar, lowChar;
char *src, *dst;
int bytes;
@@ -1000,11 +1035,11 @@ Tcl_UtfNcmp(
const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
/*
* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
- * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
+ * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
* (the byte 0x01.)
*/
@@ -1048,7 +1083,7 @@ Tcl_UtfNcasecmp(
const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
while (numChars-- > 0) {
/*
* n must be interpreted as chars, not bytes.
@@ -1388,11 +1423,11 @@ Tcl_UniCharIsControl(
{
#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) {
+ ch &= 0x1FFFFF;
+ if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) {
return 1;
}
- if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) {
+ if ((ch >= 0xF0000) && ((ch & 0xFFFF) <= 0xFFFD)) {
return 1;
}
return 0;
@@ -1451,8 +1486,8 @@ Tcl_UniCharIsGraph(
{
#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ ch &= 0x1FFFFF;
+ return (ch >= 0xE0100) && (ch <= 0xE01EF);
}
#endif
return ((GRAPH_BITS >> GetCategory(ch)) & 1);
@@ -1508,8 +1543,8 @@ Tcl_UniCharIsPrint(
{
#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ ch &= 0x1FFFFF;
+ return (ch >= 0xE0100) && (ch <= 0xE01EF);
}
#endif
return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
@@ -1565,10 +1600,10 @@ Tcl_UniCharIsSpace(
{
#if TCL_UTF_MAX > 3
/* Ignore upper 11 bits. */
- ch &= 0x1fffff;
+ ch &= 0x1FFFFF;
#else
/* Ignore upper 16 bits. */
- ch &= 0xffff;
+ ch &= 0xFFFF;
#endif
/*
@@ -1582,8 +1617,8 @@ Tcl_UniCharIsSpace(
} else if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
#endif
- } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b
- || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) {
+ } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
+ || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -1677,7 +1712,7 @@ Tcl_UniCharCaseMatch(
* characters. */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- Tcl_UniChar ch1, p;
+ Tcl_UniChar ch1 = 0, p;
while (1) {
p = *uniPattern;