summary | refs | log | tree | commit | diff | stats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- generic/tclUtf.c 388
1 files changed, 312 insertions, 76 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 80f3be8..4103eff 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -64,11 +64,16 @@ static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#if TCL_UTF_MAX != 4
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+#else /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#endif
2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
4,4,4,4,4,
#else
1,1,1,1,1,
@@ -82,6 +87,8 @@ static const unsigned char totalBytes[256] = {
static int UtfCount(int ch);
static int Invalid(const char *src);
+static int UCS4ToUpper(int ch);
+static int UCS4ToTitle(int ch);
/*
*---------------------------------------------------------------------------
@@ -99,7 +106,7 @@ static int Invalid(const char *src);
*---------------------------------------------------------------------------
*/
-static INLINE int
+static inline int
UtfCount(
int ch) /* The Unicode character whose size is returned. */
{
@@ -160,7 +167,7 @@ static const unsigned char bounds[28] = {
#endif
};
-static INLINE int
+static int
Invalid(
const char *src) /* Points to lead byte of a UTF-8 byte sequence */
{
@@ -197,7 +204,7 @@ Invalid(
*---------------------------------------------------------------------------
*/
-INLINE int
+int
Tcl_UniCharToUtf(
int ch, /* The Tcl_UniChar to be stored in the
* buffer. */
@@ -217,6 +224,29 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
+#if TCL_UTF_MAX > 3
+ if ((ch & 0xF800) == 0xD800) {
+ if (ch & 0x0400) {
+ /* Low surrogate */
+ if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) {
+ /* Previous Tcl_UniChar was a high surrogate, so combine */
+ buf[2] = (char) ((ch & 0x3F) | 0x80);
+ buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80);
+ return 3;
+ }
+ /* Previous Tcl_UniChar was not a high surrogate, so just output */
+ } else {
+ /* High surrogate */
+ ch += 0x40;
+ /* Fill buffer with specific 3-byte (invalid) byte combination,
+ so following low surrogate can recognize it and combine */
+ buf[2] = (char) ((ch << 4) & 0x30);
+ buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
+ buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
+ return 1;
+ }
+ }
+#endif
goto three;
}
@@ -228,6 +258,16 @@ Tcl_UniCharToUtf(
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
+ } else if (ch == -1) {
+ if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)
+ && ((buf[-1] & 0xF8) == 0xF0)) {
+ ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2)
+ + ((buf[1] & 0x30) >> 4);
+ buf[1] = (char) ((ch | 0x80) & 0xBF);
+ buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[-1] = (char) ((ch >> 12) | 0xE0);
+ return 2;
+ }
#endif
}
@@ -305,6 +345,15 @@ Tcl_UniCharToUtfDString(
* Tcl_UtfCharComplete() before calling this routine to ensure that
* enough bytes remain in the string.
*
+ * If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done:
+ * For any UTF-8 string containing a character outside of the BMP, the
+ * first call to this function will fill *chPtr with the high surrogate
+ * and generate a return value of 1. Calling Tcl_UtfToUniChar again
+ * will produce the low surrogate and a return value of 3. Because *chPtr
+ * is used to remember whether the high surrogate is already produced, it
+ * is recommended to initialize the variable it points to as 0 before
+ * the first call to Tcl_UtfToUniChar is done.
+ *
* Results:
* *chPtr is filled with the Tcl_UniChar, and the return value is the
* number of bytes from the UTF-8 string that were consumed.
@@ -335,6 +384,20 @@ Tcl_UtfToUniChar(
* characters representing themselves.
*/
+#if TCL_UTF_MAX <= 4
+ /* If *chPtr contains a high surrogate (produced by a previous
+ * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
+ * bytes, then we must produce a follow-up low surrogate. We only
+ * do that if the high surrogate matches the bits we encounter.
+ */
+ if (((byte & 0xC0) == 0x80)
+ && ((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)
+ && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
+ && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))) {
+ *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
+ return 3;
+ }
+#endif
*chPtr = byte;
return 1;
} else if (byte < 0xE0) {
@@ -371,17 +434,30 @@ Tcl_UtfToUniChar(
* represents itself.
*/
}
-#if TCL_UTF_MAX > 3
else if (byte < 0xF5) {
- if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+ if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
- * Four-byte-character lead byte followed by three trail bytes.
+ * Four-byte-character lead byte followed by at least two trail bytes.
+ * We don't test the validity of the 3rd trail byte, see [ed29806ba]
*/
- *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
- | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
- if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
- return 4;
+#if TCL_UTF_MAX <= 4
+ Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
+ | ((src[2] & 0x3F) >> 4)) - 0x40;
+ if (high < 0x400) {
+ /* produce high surrogate, advance source pointer */
+ *chPtr = 0xD800 + high;
+ return 1;
}
+ /* out of range, < 0x10000 or > 0x10FFFF */
+#else
+ if ((src[3] & 0xC0) == 0x80) {
+ *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+ if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
+ return 4;
+ }
+ }
+#endif
}
/*
@@ -389,7 +465,6 @@ Tcl_UtfToUniChar(
* represents itself.
*/
}
-#endif
*chPtr = byte;
return 1;
@@ -422,13 +497,13 @@ Tcl_UtfToUniCharDString(
* appended to this previously initialized
* DString. */
{
- Tcl_UniChar *w, *wString;
+ Tcl_UniChar ch = 0, *w, *wString;
const char *p;
int oldLength;
- /* Pointer to the end of string. Never read endPtr[0] */
- const char *endPtr = src + length;
- /* Pointer to breakpoint in scan where optimization is lost */
- const char *optPtr = endPtr - TCL_UTF_MAX;
+ /* Pointer to the end of string. Never read endPtr[0] */
+ const char *endPtr = src + length;
+ /* Pointer to last byte where optimization still can be used */
+ const char *optPtr = endPtr - TCL_UTF_MAX;
if (length < 0) {
length = strlen(src);
@@ -450,11 +525,12 @@ Tcl_UtfToUniCharDString(
endPtr = src + length;
optPtr = endPtr - TCL_UTF_MAX;
while (p <= optPtr) {
- p += TclUtfToUniChar(p, w);
- w++;
+ p += TclUtfToUniChar(p, &ch);
+ *w++ = ch;
}
while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) {
- p += TclUtfToUniChar(p, w++);
+ p += TclUtfToUniChar(p, &ch);
+ *w++ = ch;
}
while (p < endPtr) {
*w++ = UCHAR(*p++);
@@ -518,7 +594,7 @@ Tcl_NumUtfChars(
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
int i = 0;
if (length < 0) {
@@ -544,12 +620,26 @@ Tcl_NumUtfChars(
*/
while (src <= optPtr
/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
+#if TCL_UTF_MAX < 4
+ if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
+ /* treat F0 - F4 as single character */
+ ch = 0;
+ src++;
+ } else
+#endif
src += TclUtfToUniChar(src, &ch);
i++;
}
/* Loop over the remaining string where call must happen */
while (src < endPtr) {
if (Tcl_UtfCharComplete(src, endPtr - src)) {
+#if TCL_UTF_MAX < 4
+ if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
+ /* treat F0 - F4 as single character */
+ ch = 0;
+ src++;
+ } else
+#endif
src += TclUtfToUniChar(src, &ch);
} else {
/*
@@ -586,11 +676,11 @@ Tcl_NumUtfChars(
const char *
Tcl_UtfFindFirst(
const char *src, /* The UTF-8 string to be searched. */
- int ch) /* The Tcl_UniChar to search for. */
+ int ch) /* The Unicode character to search for. */
{
while (1) {
- Tcl_UniChar find;
- int len = TclUtfToUniChar(src, &find);
+ int find, len = TclUtfToUCS4(src, &find);
+
if (find == ch) {
return src;
}
@@ -628,8 +718,7 @@ Tcl_UtfFindLast(
const char *last = NULL;
while (1) {
- Tcl_UniChar find;
- int len = TclUtfToUniChar(src, &find);
+ int find, len = TclUtfToUCS4(src, &find);
if (find == ch) {
last = src;
@@ -799,15 +888,19 @@ Tcl_UtfPrev(
/* Continue the search backwards... */
look--;
- } while (trailBytesSeen < TCL_UTF_MAX);
+ } while (trailBytesSeen < ((TCL_UTF_MAX > 4) ? 4 : 3));
/*
- * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a
+ * We've seen 3 (or 4) trail bytes, so we know there will not be a
* properly formed byte sequence to find, and we can stop looking,
- * accepting the fallback.
+ * accepting the fallback (for TCL_UTF_MAX > 4) or just go back as
+ * far as we can.
*/
-
+#if TCL_UTF_MAX > 4
return fallback;
+#else
+ return src - 3;
+#endif
}
/*
@@ -832,7 +925,7 @@ Tcl_UniCharAtIndex(
const char *src, /* The UTF-8 string to dereference. */
int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
TclUtfToUniChar(Tcl_UtfAtIndex(src, index), &ch);
return ch;
@@ -860,11 +953,19 @@ Tcl_UtfAtIndex(
const char *src, /* The UTF-8 string. */
int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
+ int len = 0;
while (index-- > 0) {
+ len = TclUtfToUniChar(src, &ch);
+ src += len;
+ }
+#if TCL_UTF_MAX == 4
+ if ((ch >= 0xD800) && (len < 3)) {
+ /* Index points at character following high Surrogate */
src += TclUtfToUniChar(src, &ch);
}
+#endif
return src;
}
@@ -943,7 +1044,7 @@ int
Tcl_UtfToUpper(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, upChar;
+ int ch, upChar;
char *src, *dst;
int len;
@@ -953,8 +1054,8 @@ Tcl_UtfToUpper(
src = dst = str;
while (*src) {
- len = TclUtfToUniChar(src, &ch);
- upChar = Tcl_UniCharToUpper(ch);
+ len = TclUtfToUCS4(src, &ch);
+ upChar = UCS4ToUpper(ch);
/*
* To keep badly formed Utf strings from getting inflated by the
@@ -962,7 +1063,7 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if (len < UtfCount(upChar)) {
+ if (len < UtfCount(upChar) || ((upChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -996,7 +1097,7 @@ int
Tcl_UtfToLower(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, lowChar;
+ int ch, lowChar;
char *src, *dst;
int len;
@@ -1006,8 +1107,8 @@ Tcl_UtfToLower(
src = dst = str;
while (*src) {
- len = TclUtfToUniChar(src, &ch);
- lowChar = Tcl_UniCharToLower(ch);
+ len = TclUtfToUCS4(src, &ch);
+ lowChar = TclUCS4ToLower(ch);
/*
* To keep badly formed Utf strings from getting inflated by the
@@ -1015,7 +1116,7 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if (len < UtfCount(lowChar)) {
+ if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1050,7 +1151,7 @@ int
Tcl_UtfToTitle(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, titleChar, lowChar;
+ int ch, titleChar, lowChar;
char *src, *dst;
int len;
@@ -1062,10 +1163,10 @@ Tcl_UtfToTitle(
src = dst = str;
if (*src) {
- len = TclUtfToUniChar(src, &ch);
- titleChar = Tcl_UniCharToTitle(ch);
+ len = TclUtfToUCS4(src, &ch);
+ titleChar = UCS4ToTitle(ch);
- if (len < UtfCount(titleChar)) {
+ if (len < UtfCount(titleChar) || ((titleChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1074,14 +1175,14 @@ Tcl_UtfToTitle(
src += len;
}
while (*src) {
- len = TclUtfToUniChar(src, &ch);
+ len = TclUtfToUCS4(src, &ch);
lowChar = ch;
/* Special exception for Georgian Asomtavruli chars, no titlecase. */
if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
- lowChar = Tcl_UniCharToLower(lowChar);
+ lowChar = TclUCS4ToLower(lowChar);
}
- if (len < UtfCount(lowChar)) {
+ if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1163,7 +1264,7 @@ Tcl_UtfNcmp(
const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
/*
* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
@@ -1181,6 +1282,16 @@ Tcl_UtfNcmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & ~0x3FF) == 0xD800)) {
+ if ((ch2 & ~0x3FF) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & ~0x3FF) == 0xD800) {
+ return -ch2;
+ }
+#endif
return (ch1 - ch2);
}
}
@@ -1211,7 +1322,8 @@ Tcl_UtfNcasecmp(
const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
+
while (numChars-- > 0) {
/*
* n must be interpreted as chars, not bytes.
@@ -1221,6 +1333,16 @@ Tcl_UtfNcasecmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1254,12 +1376,22 @@ TclUtfCasecmp(
const char *cs, /* UTF string to compare to ct. */
const char *ct) /* UTF string cs is compared to. */
{
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
while (*cs && *ct) {
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
+#if TCL_UTF_MAX == 4
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
+#endif
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1287,24 +1419,26 @@ TclUtfCasecmp(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
-Tcl_UniCharToUpper(
+static int
+UCS4ToUpper(
int ch) /* Unicode character to convert. */
{
-#if TCL_UTF_MAX > 3
if (!UNICODE_OUT_OF_RANGE(ch)) {
-#endif
int info = GetUniCharInfo(ch);
if (GetCaseType(info) & 0x04) {
ch -= GetDelta(info);
}
-#if TCL_UTF_MAX > 3
}
/* Clear away extension bits, if any */
- ch &= 0x1FFFFF;
-#endif
- return (Tcl_UniChar) ch;
+ return ch & 0x1FFFFF;
+}
+
+Tcl_UniChar
+Tcl_UniCharToUpper(
+ int ch) /* Unicode character to convert. */
+{
+ return (Tcl_UniChar) UCS4ToUpper(ch);
}
/*
@@ -1323,25 +1457,27 @@ Tcl_UniCharToUpper(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
-Tcl_UniCharToLower(
+int
+TclUCS4ToLower(
int ch) /* Unicode character to convert. */
{
-#if TCL_UTF_MAX > 3
if (!UNICODE_OUT_OF_RANGE(ch)) {
-#endif
int info = GetUniCharInfo(ch);
int mode = GetCaseType(info);
if ((mode & 0x02) && (mode != 0x7)) {
ch += GetDelta(info);
}
-#if TCL_UTF_MAX > 3
}
/* Clear away extension bits, if any */
- ch &= 0x1FFFFF;
-#endif
- return (Tcl_UniChar) ch;
+ return ch & 0x1FFFFF;
+}
+
+Tcl_UniChar
+Tcl_UniCharToLower(
+ int ch) /* Unicode character to convert. */
+{
+ return (Tcl_UniChar) TclUCS4ToLower(ch);
}
/*
@@ -1360,13 +1496,11 @@ Tcl_UniCharToLower(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
-Tcl_UniCharToTitle(
+static int
+UCS4ToTitle(
int ch) /* Unicode character to convert. */
{
-#if TCL_UTF_MAX > 3
if (!UNICODE_OUT_OF_RANGE(ch)) {
-#endif
int info = GetUniCharInfo(ch);
int mode = GetCaseType(info);
@@ -1381,12 +1515,16 @@ Tcl_UniCharToTitle(
} else if (mode == 0x4) {
ch -= GetDelta(info);
}
-#if TCL_UTF_MAX > 3
}
/* Clear away extension bits, if any */
- ch &= 0x1FFFFF;
-#endif
- return (Tcl_UniChar) ch;
+ return ch & 0x1FFFFF;
+}
+
+Tcl_UniChar
+Tcl_UniCharToTitle(
+ int ch) /* Unicode character to convert. */
+{
+ return (Tcl_UniChar) UCS4ToTitle(ch);
}
/*
@@ -1771,7 +1909,8 @@ Tcl_UniCharIsSpace(
} else if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
#endif
- } else if (ch == 0x180E || ch == 0x202F) {
+ } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
+ || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
return 1;
} else {
return ((SPACE_BITS >> GetCategory(ch)) & 1);
@@ -1865,7 +2004,7 @@ Tcl_UniCharCaseMatch(
* characters. */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- Tcl_UniChar ch1, p;
+ Tcl_UniChar ch1 = 0, p;
while (1) {
p = *uniPattern;
@@ -2216,6 +2355,103 @@ TclUniCharMatch(
}
/*
+ *---------------------------------------------------------------------------
+ *
+ * TclUtfToUCS4 --
+ *
+ * Extract the 4-byte codepoint from the leading bytes of the
+ * Modified UTF-8 string "src". This is a utility routine to
+ * contain the surrogate gymnastics in one place.
+ *
+ * The caller must ensure that the source buffer is long enough that this
+ * routine does not run off the end and dereference non-existent memory
+ * looking for trail bytes. If the source buffer is known to be '\0'
+ * terminated, this cannot happen. Otherwise, the caller should call
+ * TclUCS4Complete() before calling this routine to ensure that
+ * enough bytes remain in the string.
+ *
+ * Results:
+ * *ucs4Ptr is filled with the UCS4 code point, and the return value is
+ * the number of bytes from the UTF-8 string that were consumed.
+ *
+ * Side effects:
+ * None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUtfToUCS4(
+ const char *src, /* The UTF-8 string. */
+ int *ucs4Ptr) /* Filled with the UCS4 codepoint represented
+ * by the UTF-8 string. */
+{
+ Tcl_UniChar ch = 0;
+ int len = Tcl_UtfToUniChar(src, &ch);
+
+#if TCL_UTF_MAX <= 4
+ if ((ch & ~0x3FF) == 0xD800) {
+ Tcl_UniChar low = ch;
+ int len2 = Tcl_UtfToUniChar(src+len, &low);
+ if ((low & ~0x3FF) == 0xDC00) {
+ *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
+ return len + len2;
+ }
+ }
+#endif
+ *ucs4Ptr = (int)ch;
+ return len;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * TclUCS4ToUtf --
+ *
+ * Store the given Unicode character as a sequence of UTF-8 bytes in the
+ * provided buffer. Might output 6 bytes, if the code point > 0xFFFF.
+ *
+ * Results:
+ * The return value is the number of bytes in the buffer that were
+ * consumed. If ch == -1, this function outputs 0 bytes (empty string),
+ * since TclGetUCS4 returns -1 for out-of-range indices.
+ *
+ * Side effects:
+ * None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUCS4ToUtf(
+ int ch, /* Unicode character to be stored in the
+ * buffer. */
+ char *buf) /* Buffer in which the UTF-8 representation of
+ * the Unicode character is stored. Buffer must be
+ * large enough to hold the UTF-8 character(s)
+ * (at most 6 bytes). */
+{
+#if TCL_UTF_MAX <= 4
+ if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
+ /* Spit out a 4-byte UTF-8 character or 2 x 3-byte UTF-8 characters, depending on Tcl
+ * version and/or TCL_UTF_MAX build value */
+ int len = Tcl_UniCharToUtf(0xD800 | ((ch - 0x10000) >> 10), buf);
+ return len + Tcl_UniCharToUtf(0xDC00 | (ch & 0x7FF), buf + len);
+ }
+#endif
+ if ((ch & ~0x7FF) == 0xD800) {
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
+ }
+ if (ch == -1) {
+ return 0;
+ }
+ return Tcl_UniCharToUtf(ch, buf);
+}
+
+/*
* Local Variables:
* mode: c
* c-basic-offset: 4