summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c544
1 files changed, 376 insertions, 168 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index b33bf6a..ce67db7 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -68,11 +68,7 @@ static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-#if TCL_UTF_MAX > 3
4,4,4,4,4,4,4,4,
-#else
- 1,1,1,1,1,1,1,1,
-#endif
1,1,1,1,1,1,1,1
};
@@ -94,7 +90,7 @@ static const unsigned char totalBytes[256] = {
int
TclUtfCount(
- int ch) /* The Tcl_UniChar whose size is returned. */
+ int ch) /* The Unicode character whose size is returned. */
{
if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
return 1;
@@ -102,11 +98,9 @@ TclUtfCount(
if (ch <= 0x7FF) {
return 2;
}
-#if TCL_UTF_MAX > 3
- if (((unsigned)(ch - 0x10000) <= 0xfffff)) {
+ if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
return 4;
}
-#endif
return 3;
}
@@ -135,7 +129,7 @@ Tcl_UniCharToUtf(
char *buf) /* Buffer in which the UTF-8 representation of
* the Tcl_UniChar is stored. Buffer must be
* large enough to hold the UTF-8 character
- * (at most TCL_UTF_MAX bytes). */
+ * (at most 4 bytes). */
{
if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
buf[0] = (char) ch;
@@ -148,27 +142,30 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
-#if TCL_UTF_MAX == 4
if ((ch & 0xF800) == 0xD800) {
if (ch & 0x0400) {
/* Low surrogate */
- buf[3] = (char) ((ch | 0x80) & 0xBF);
- buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
- return 4;
+ if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
+ && ((buf[2] & 0xCF) == 0)) {
+ /* Previous Tcl_UniChar was a High surrogate, so combine */
+ buf[3] = (char) ((ch & 0x3F) | 0x80);
+ buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80);
+ return 4;
+ }
+ /* Previous Tcl_UniChar was not a High surrogate, so just output */
} else {
/* High surrogate */
ch += 0x40;
- buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
- buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
- buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+ /* Fill buffer with specific 3-byte (invalid) byte combination,
+ so following Low surrogate can recognize it and combine */
+ buf[2] = (char) ((ch << 4) & 0x30);
+ buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
+ buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
return 0;
}
}
-#endif
goto three;
}
-
-#if TCL_UTF_MAX > 3
if (ch <= 0x10FFFF) {
buf[3] = (char) ((ch | 0x80) & 0xBF);
buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
@@ -176,7 +173,13 @@ Tcl_UniCharToUtf(
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
-#endif
+ } else if (ch == -1) {
+ if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
+ && ((buf[2] & 0xCF) == 0)) {
+ ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2)
+ + ((buf[2] & 0x30) >> 4);
+ goto three;
+ }
}
ch = 0xFFFD;
@@ -215,23 +218,31 @@ Tcl_UniCharToUtfDString(
{
const Tcl_UniChar *w, *wEnd;
char *p, *string;
- int oldLength;
+ int oldLength, len = 1;
/*
- * UTF-8 string length in bytes will be <= Unicode string length *
- * TCL_UTF_MAX.
+ * UTF-8 string length in bytes will be <= Unicode string length * 4.
*/
oldLength = Tcl_DStringLength(dsPtr);
- Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
+ Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4);
string = Tcl_DStringValue(dsPtr) + oldLength;
p = string;
wEnd = uniStr + uniLength;
for (w = uniStr; w < wEnd; ) {
- p += Tcl_UniCharToUtf(*w, p);
+ if (!len && ((*w & 0xFC00) != 0xDC00)) {
+ /* Special case for handling upper surrogates. */
+ p += Tcl_UniCharToUtf(-1, p);
+ }
+ len = Tcl_UniCharToUtf(*w, p);
+ p += len;
w++;
}
+ if (!len) {
+ /* Special case for handling upper surrogates. */
+ p += Tcl_UniCharToUtf(-1, p);
+ }
Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
return string;
@@ -253,6 +264,15 @@ Tcl_UniCharToUtfDString(
* Tcl_UtfCharComplete() before calling this routine to ensure that
* enough bytes remain in the string.
*
+ * If TCL_UTF_MAX == 4, special handling of Surrogate pairs is done:
+ * For any UTF-8 string containing a character outside of the BMP, the
+ * first call to this function will fill *chPtr with the high surrogate
+ * and generate a return value of 0. Calling Tcl_UtfToUniChar again
+ * will produce the low surrogate and a return value of 4. Because *chPtr
+ * is used to remember whether the high surrogate is already produced, it
+ * is recommended to initialize the variable it points to as 0 before
+ * the first call to Tcl_UtfToUniChar is done.
+ *
* Results:
* *chPtr is filled with the Tcl_UniChar, and the return value is the
* number of bytes from the UTF-8 string that were consumed.
@@ -263,6 +283,13 @@ Tcl_UniCharToUtfDString(
*---------------------------------------------------------------------------
*/
+static const unsigned short cp1252[32] = {
+ 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
+ 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+ 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
+};
+
int
Tcl_UtfToUniChar(
register const char *src, /* The UTF-8 string. */
@@ -272,18 +299,24 @@ Tcl_UtfToUniChar(
register int byte;
/*
- * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
+ * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
*/
byte = *((unsigned char *) src);
if (byte < 0xC0) {
/*
* Handles properly formed UTF-8 characters between 0x01 and 0x7F.
- * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
+ * Treats naked trail bytes 0x80 to 0x9F as valid characters from
+ * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
+ * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
* characters representing themselves.
*/
- *chPtr = (Tcl_UniChar) byte;
+ if ((unsigned)(byte-0x80) < (unsigned) 0x20) {
+ *chPtr = (Tcl_UniChar) cp1252[byte-0x80];
+ } else {
+ *chPtr = (Tcl_UniChar) byte;
+ }
return 1;
} else if (byte < 0xE0) {
if ((src[1] & 0xC0) == 0x80) {
@@ -292,7 +325,9 @@ Tcl_UtfToUniChar(
*/
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
- return 2;
+ if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
+ return 2;
+ }
}
/*
@@ -307,7 +342,9 @@ Tcl_UtfToUniChar(
*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
| ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
- return 3;
+ if (*chPtr > 0x7FF) {
+ return 3;
+ }
}
/*
@@ -315,24 +352,42 @@ Tcl_UtfToUniChar(
* represents itself.
*/
}
-#if TCL_UTF_MAX > 3
else if (byte < 0xF8) {
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
/*
* Four-byte-character lead byte followed by three trail bytes.
*/
-
- *chPtr = (Tcl_UniChar) (((byte & 0x0E) << 18) | ((src[1] & 0x3F) << 12)
+#if TCL_UTF_MAX <= 4
+ Tcl_UniChar surrogate;
+
+ byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
+ surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
+ if (byte & 0x100000) {
+ /* out of range, < 0x10000 or > 0x10ffff */
+ } else if (*chPtr != surrogate) {
+ /* produce high surrogate, but don't advance source pointer */
+ *chPtr = surrogate;
+ return 0;
+ } else {
+ /* produce low surrogate, and advance source pointer */
+ *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF));
+ return 4;
+ }
+#else
+ *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
- return 4;
+ if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
+ return 4;
+ }
+#endif
}
/*
- * A three-byte-character lead-byte not followed by two trail-bytes
+ * A four-byte-character lead-byte not followed by two trail-bytes
* represents itself.
*/
}
-#endif
*chPtr = (Tcl_UniChar) byte;
return 1;
@@ -365,7 +420,7 @@ Tcl_UtfToUniCharDString(
* appended to this previously initialized
* DString. */
{
- Tcl_UniChar *w, *wString;
+ Tcl_UniChar ch = 0, *w, *wString;
const char *p, *end;
int oldLength;
@@ -379,20 +434,32 @@ Tcl_UtfToUniCharDString(
*/
oldLength = Tcl_DStringLength(dsPtr);
-/* TODO: fix overreach! */
+
Tcl_DStringSetLength(dsPtr,
- (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
+ oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar)));
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
w = wString;
- end = src + length;
- for (p = src; p < end; ) {
- p += TclUtfToUniChar(p, w);
- w++;
+ p = src;
+ end = src + length - 4;
+ while (p < end) {
+ p += TclUtfToUniChar(p, &ch);
+ *w++ = ch;
+ }
+ end += 4;
+ while (p < end) {
+ if (Tcl_UtfCharComplete(p, end-p)) {
+ p += TclUtfToUniChar(p, &ch);
+ } else if ((unsigned)((UCHAR(*p)-0x80)) < (unsigned) 0x20) {
+ ch = (Tcl_UniChar) cp1252[UCHAR(*p++)-0x80];
+ } else {
+ ch = UCHAR(*p++);
+ }
+ *w++ = ch;
}
*w = '\0';
Tcl_DStringSetLength(dsPtr,
- (oldLength + ((char *) w - (char *) wString)));
+ oldLength + ((char *) w - (char *) wString));
return wString;
}
@@ -422,10 +489,7 @@ Tcl_UtfCharComplete(
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
- int ch;
-
- ch = *((unsigned char *) src);
- return length >= totalBytes[ch];
+ return length >= totalBytes[(unsigned char)*src];
}
/*
@@ -452,9 +516,8 @@ Tcl_NumUtfChars(
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
- Tcl_UniChar ch;
- register Tcl_UniChar *chPtr = &ch;
- register int i;
+ Tcl_UniChar ch = 0;
+ register int i = 0;
/*
* The separate implementations are faster.
@@ -463,26 +526,27 @@ Tcl_NumUtfChars(
* single-byte char case specially.
*/
- i = 0;
if (length < 0) {
while (*src != '\0') {
- src += TclUtfToUniChar(src, chPtr);
+ src += TclUtfToUniChar(src, &ch);
i++;
}
+ if (i < 0) i = INT_MAX; /* Bug [2738427] */
} else {
- register int n;
+ register const char *endPtr = src + length - 4;
- while (length > 0) {
- if (UCHAR(*src) < 0xC0) {
- length--;
- src++;
- } else {
- n = Tcl_UtfToUniChar(src, chPtr);
- length -= n;
- src += n;
- }
+ while (src < endPtr) {
+ src += TclUtfToUniChar(src, &ch);
+ i++;
+ }
+ endPtr += 4;
+ while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
+ src += TclUtfToUniChar(src, &ch);
i++;
}
+ if (src < endPtr) {
+ i += endPtr - src;
+ }
}
return i;
}
@@ -492,13 +556,13 @@ Tcl_NumUtfChars(
*
* Tcl_UtfFindFirst --
*
- * Returns a pointer to the first occurance of the given Tcl_UniChar in
- * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the first occurance of the given Unicode character
+ * in the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrune().
*
* Results:
- * As above. If the Tcl_UniChar does not exist in the given string, the
- * return value is NULL.
+ * As above. If the Unicode character does not exist in the given string,
+ * the return value is NULL.
*
* Side effects:
* None.
@@ -509,14 +573,21 @@ Tcl_NumUtfChars(
const char *
Tcl_UtfFindFirst(
const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Tcl_UniChar to search for. */
+ int ch) /* The Unicode character to search for. */
{
- int len;
- Tcl_UniChar find;
+ int len, fullchar;
+ Tcl_UniChar find = 0;
while (1) {
len = TclUtfToUniChar(src, &find);
- if (find == ch) {
+ fullchar = find;
+#if TCL_UTF_MAX <= 4
+ if (!len) {
+ len += TclUtfToUniChar(src, &find);
+ fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
+ }
+#endif
+ if (fullchar == ch) {
return src;
}
if (*src == '\0') {
@@ -531,12 +602,12 @@ Tcl_UtfFindFirst(
*
* Tcl_UtfFindLast --
*
- * Returns a pointer to the last occurance of the given Tcl_UniChar in
- * the NULL-terminated UTF-8 string. The NULL terminator is considered
+ * Returns a pointer to the last occurance of the given Unicode character
+ * in the NULL-terminated UTF-8 string. The NULL terminator is considered
* part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
*
* Results:
- * As above. If the Tcl_UniChar does not exist in the given string, the
+ * As above. If the Unicode character does not exist in the given string, the
* return value is NULL.
*
* Side effects:
@@ -548,16 +619,23 @@ Tcl_UtfFindFirst(
const char *
Tcl_UtfFindLast(
const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Tcl_UniChar to search for. */
+ int ch) /* The Unicode character to search for. */
{
- int len;
- Tcl_UniChar find;
+ int len, fullchar;
+ Tcl_UniChar find = 0;
const char *last;
last = NULL;
while (1) {
len = TclUtfToUniChar(src, &find);
- if (find == ch) {
+ fullchar = find;
+#if TCL_UTF_MAX <= 4
+ if (!len) {
+ len += TclUtfToUniChar(src, &find);
+ fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
+ }
+#endif
+ if (fullchar == ch) {
last = src;
}
if (*src == '\0') {
@@ -591,9 +669,15 @@ const char *
Tcl_UtfNext(
const char *src) /* The current location in the string. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
+ int len = TclUtfToUniChar(src, &ch);
- return src + TclUtfToUniChar(src, &ch);
+#if TCL_UTF_MAX <= 4
+ if (len == 0) {
+ len = TclUtfToUniChar(src, &ch);
+ }
+#endif
+ return src + len;
}
/*
@@ -626,9 +710,8 @@ Tcl_UtfPrev(
const char *look;
int i, byte;
- src--;
- look = src;
- for (i = 0; i < TCL_UTF_MAX; i++) {
+ look = --src;
+ for (i = 0; i < 4; i++) {
if (look < start) {
if (src < start) {
src = start;
@@ -652,7 +735,7 @@ Tcl_UtfPrev(
*
* Tcl_UniCharAtIndex --
*
- * Returns the Unicode character represented at the specified character
+ * Returns the Tcl_UniChar represented at the specified character
* (not byte) position in the UTF-8 string.
*
* Results:
@@ -664,18 +747,33 @@ Tcl_UtfPrev(
*---------------------------------------------------------------------------
*/
-Tcl_UniChar
+int
Tcl_UniCharAtIndex(
register const char *src, /* The UTF-8 string to dereference. */
register int index) /* The position of the desired character. */
{
Tcl_UniChar ch = 0;
+ int fullchar = 0;
+#if TCL_UTF_MAX <= 4
+ int len = 1;
+#endif
- while (index >= 0) {
- index--;
+ while (index-- >= 0) {
+#if TCL_UTF_MAX <= 4
+ src += (len = TclUtfToUniChar(src, &ch));
+#else
src += TclUtfToUniChar(src, &ch);
+#endif
}
- return ch;
+ fullchar = ch;
+#if TCL_UTF_MAX <= 4
+ if (!len) {
+ /* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */
+ (void)TclUtfToUniChar(src, &ch);
+ fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+#endif
+ return fullchar;
}
/*
@@ -684,7 +782,9 @@ Tcl_UniCharAtIndex(
* Tcl_UtfAtIndex --
*
* Returns a pointer to the specified character (not byte) position in
- * the UTF-8 string.
+ * the UTF-8 string. If TCL_UTF_MAX <= 4, characters > U+FFFF count as
+ * 2 positions, but then the pointer should never be placed between
+ * the two positions.
*
* Results:
* As above.
@@ -700,12 +800,19 @@ Tcl_UtfAtIndex(
register const char *src, /* The UTF-8 string. */
register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
+ int len = 1;
- while (index > 0) {
- index--;
+ while (index-- > 0) {
+ len = TclUtfToUniChar(src, &ch);
+ src += len;
+ }
+#if TCL_UTF_MAX <= 4
+ if (!len) {
+ /* Index points at character following High Surrogate */
src += TclUtfToUniChar(src, &ch);
}
+#endif
return src;
}
@@ -784,7 +891,8 @@ int
Tcl_UtfToUpper(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, upChar;
+ Tcl_UniChar ch = 0;
+ int upChar;
char *src, *dst;
int bytes;
@@ -795,7 +903,16 @@ Tcl_UtfToUpper(
src = dst = str;
while (*src) {
bytes = TclUtfToUniChar(src, &ch);
- upChar = Tcl_UniCharToUpper(ch);
+ upChar = ch;
+#if TCL_UTF_MAX <= 4
+ if (!bytes) {
+ /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+ bytes = TclUtfToUniChar(src, &ch);
+ /* Combine surrogates */
+ upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+#endif
+ upChar = Tcl_UniCharToUpper(upChar);
/*
* To keep badly formed Utf strings from getting inflated by the
@@ -803,7 +920,7 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if (bytes < TclUtfCount(upChar)) {
+ if ((bytes < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -837,7 +954,8 @@ int
Tcl_UtfToLower(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, lowChar;
+ Tcl_UniChar ch = 0;
+ int lowChar;
char *src, *dst;
int bytes;
@@ -848,7 +966,16 @@ Tcl_UtfToLower(
src = dst = str;
while (*src) {
bytes = TclUtfToUniChar(src, &ch);
- lowChar = Tcl_UniCharToLower(ch);
+ lowChar = ch;
+#if TCL_UTF_MAX <= 4
+ if (!bytes) {
+ /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+ bytes = TclUtfToUniChar(src, &ch);
+ /* Combine surrogates */
+ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+#endif
+ lowChar = Tcl_UniCharToLower(lowChar);
/*
* To keep badly formed Utf strings from getting inflated by the
@@ -856,7 +983,7 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if (bytes < TclUtfCount(lowChar)) {
+ if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -891,7 +1018,8 @@ int
Tcl_UtfToTitle(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, titleChar, lowChar;
+ Tcl_UniChar ch = 0;
+ int titleChar, lowChar;
char *src, *dst;
int bytes;
@@ -904,9 +1032,18 @@ Tcl_UtfToTitle(
if (*src) {
bytes = TclUtfToUniChar(src, &ch);
- titleChar = Tcl_UniCharToTitle(ch);
+ titleChar = ch;
+#if TCL_UTF_MAX <= 4
+ if (!bytes) {
+ /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+ bytes = TclUtfToUniChar(src, &ch);
+ /* Combine surrogates */
+ titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+#endif
+ titleChar = Tcl_UniCharToTitle(titleChar);
- if (bytes < TclUtfCount(titleChar)) {
+ if ((bytes < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -916,9 +1053,21 @@ Tcl_UtfToTitle(
}
while (*src) {
bytes = TclUtfToUniChar(src, &ch);
- lowChar = Tcl_UniCharToLower(ch);
+ lowChar = ch;
+#if TCL_UTF_MAX <= 4
+ if (!bytes) {
+ /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+ bytes = TclUtfToUniChar(src, &ch);
+ /* Combine surrogates */
+ lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+ }
+#endif
+ /* Special exception for Georgian Asomtavruli chars, no titlecase. */
+ if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
+ lowChar = Tcl_UniCharToLower(lowChar);
+ }
- if (bytes < TclUtfCount(lowChar)) {
+ if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
memcpy(dst, src, (size_t) bytes);
dst += bytes;
} else {
@@ -1000,11 +1149,11 @@ Tcl_UtfNcmp(
const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
/*
* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
- * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
+ * pair of bytes 0xC0,0x80) is larger than byte representation of \u0001
* (the byte 0x01.)
*/
@@ -1018,6 +1167,16 @@ Tcl_UtfNcmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX <= 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
return (ch1 - ch2);
}
}
@@ -1048,7 +1207,8 @@ Tcl_UtfNcasecmp(
const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+
while (numChars-- > 0) {
/*
* n must be interpreted as chars, not bytes.
@@ -1058,6 +1218,16 @@ Tcl_UtfNcasecmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX <= 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1067,11 +1237,57 @@ Tcl_UtfNcasecmp(
}
return 0;
}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfCmp --
+ *
+ * Compare UTF chars of string cs to string ct case sensitively.
+ * Replacement for strcmp in Tcl core, in places where UTF-8 should
+ * be handled.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCmp(
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
+{
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+
+ while (*cs && *ct) {
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
+#if TCL_UTF_MAX <= 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
+ return ch1 - ch2;
+ }
+ }
+ return UCHAR(*cs) - UCHAR(*ct);
+}
+
/*
*----------------------------------------------------------------------
*
- * Tcl_UtfNcasecmp --
+ * TclUtfCasecmp --
*
* Compare UTF chars of string cs to string ct case insensitively.
* Replacement for strcasecmp in Tcl core, in places where UTF-8 should
@@ -1091,12 +1307,22 @@ TclUtfCasecmp(
const char *cs, /* UTF string to compare to ct. */
const char *ct) /* UTF string cs is compared to. */
{
- while (*cs && *ct) {
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+ while (*cs && *ct) {
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX <= 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1124,16 +1350,18 @@ TclUtfCasecmp(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
+int
Tcl_UniCharToUpper(
int ch) /* Unicode character to convert. */
{
- int info = GetUniCharInfo(ch);
+ if (!UNICODE_OUT_OF_RANGE(ch)) {
+ int info = GetUniCharInfo(ch);
- if (GetCaseType(info) & 0x04) {
- ch -= GetDelta(info);
+ if (GetCaseType(info) & 0x04) {
+ ch -= GetDelta(info);
+ }
}
- return (Tcl_UniChar) ch;
+ return ch & 0x1FFFFF;
}
/*
@@ -1152,16 +1380,19 @@ Tcl_UniCharToUpper(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
+int
Tcl_UniCharToLower(
int ch) /* Unicode character to convert. */
{
- int info = GetUniCharInfo(ch);
+ if (!UNICODE_OUT_OF_RANGE(ch)) {
+ int info = GetUniCharInfo(ch);
+ int mode = GetCaseType(info);
- if (GetCaseType(info) & 0x02) {
- ch += GetDelta(info);
+ if ((mode & 0x02) && (mode != 0x7)) {
+ ch += GetDelta(info);
+ }
}
- return (Tcl_UniChar) ch;
+ return ch & 0x1FFFFF;
}
/*
@@ -1180,23 +1411,27 @@ Tcl_UniCharToLower(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
+int
Tcl_UniCharToTitle(
int ch) /* Unicode character to convert. */
{
- int info = GetUniCharInfo(ch);
- int mode = GetCaseType(info);
+ if (!UNICODE_OUT_OF_RANGE(ch)) {
+ int info = GetUniCharInfo(ch);
+ int mode = GetCaseType(info);
- if (mode & 0x1) {
- /*
- * Subtract or add one depending on the original case.
- */
+ if (mode & 0x1) {
+ /*
+ * Subtract or add one depending on the original case.
+ */
- ch += ((mode & 0x4) ? -1 : 1);
- } else if (mode == 0x4) {
- ch -= GetDelta(info);
+ if (mode != 0x7) {
+ ch += ((mode & 0x4) ? -1 : 1);
+ }
+ } else if (mode == 0x4) {
+ ch -= GetDelta(info);
+ }
}
- return (Tcl_UniChar) ch;
+ return ch & 0x1FFFFF;
}
/*
@@ -1330,11 +1565,9 @@ int
Tcl_UniCharIsAlnum(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
}
-#endif
return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
@@ -1358,11 +1591,9 @@ int
Tcl_UniCharIsAlpha(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
}
-#endif
return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
@@ -1386,18 +1617,16 @@ int
Tcl_UniCharIsControl(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- if ((ch == 0xe0001) || ((ch >= 0xe0020) && (ch <= 0xe007f))) {
+ ch &= 0x1FFFFF;
+ if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) {
return 1;
}
- if ((ch >= 0xf0000) && ((ch & 0xffff) <= 0xfffd)) {
+ if ((ch >= 0xF0000) && ((ch & 0xFFFF) <= 0xFFFD)) {
return 1;
}
return 0;
}
-#endif
return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
@@ -1421,11 +1650,9 @@ int
Tcl_UniCharIsDigit(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
}
-#endif
return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
@@ -1449,12 +1676,10 @@ int
Tcl_UniCharIsGraph(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ ch &= 0x1FFFFF;
+ return (ch >= 0xE0100) && (ch <= 0xE01EF);
}
-#endif
return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
@@ -1478,11 +1703,9 @@ int
Tcl_UniCharIsLower(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
}
-#endif
return (GetCategory(ch) == LOWERCASE_LETTER);
}
@@ -1506,12 +1729,10 @@ int
Tcl_UniCharIsPrint(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
- ch &= 0x1fffff;
- return (ch >= 0xe0100) && (ch <= 0xe01ef);
+ ch &= 0x1FFFFF;
+ return (ch >= 0xE0100) && (ch <= 0xE01EF);
}
-#endif
return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
@@ -1535,11 +1756,9 @@ int
Tcl_UniCharIsPunct(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
}
-#endif
return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
@@ -1563,13 +1782,8 @@ int
Tcl_UniCharIsSpace(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
/* Ignore upper 11 bits. */
- ch &= 0x1fffff;
-#else
- /* Ignore upper 16 bits. */
- ch &= 0xffff;
-#endif
+ ch &= 0x1FFFFF;
/*
* If the character is within the first 127 characters, just use the
@@ -1578,12 +1792,10 @@ Tcl_UniCharIsSpace(
if (ch < 0x80) {
return TclIsSpaceProc((char) ch);
-#if TCL_UTF_MAX > 3
} else if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
-#endif
- } else if (ch == 0x0085 || ch == 0x180e || ch == 0x200b
- || ch == 0x202f || ch == 0x2060 || ch == 0xfeff) {
+ } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
+ || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -1610,11 +1822,9 @@ int
Tcl_UniCharIsUpper(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
}
-#endif
return (GetCategory(ch) == UPPERCASE_LETTER);
}
@@ -1638,11 +1848,9 @@ int
Tcl_UniCharIsWordChar(
int ch) /* Unicode character to test. */
{
-#if TCL_UTF_MAX > 3
if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
}
-#endif
return ((WORD_BITS >> GetCategory(ch)) & 1);
}
@@ -1677,7 +1885,7 @@ Tcl_UniCharCaseMatch(
* characters. */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- Tcl_UniChar ch1, p;
+ Tcl_UniChar ch1 = 0, p;
while (1) {
p = *uniPattern;