summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- generic/tclBinary.c 16
-rw-r--r-- generic/tclCmdMZ.c 9
-rw-r--r-- generic/tclInt.h 2
-rw-r--r-- generic/tclParse.c 23
-rw-r--r-- generic/tclUtf.c 68
5 files changed, 77 insertions, 41 deletions
diff --git a/generic/tclBinary.c b/generic/tclBinary.c
index 6306159..52ef457 100644
--- a/generic/tclBinary.c
+++ b/generic/tclBinary.c
@@ -1222,11 +1222,11 @@ BinaryFormatCmd(
badField:
{
- Tcl_UniChar ch = 0;
- char buf[TCL_UTF_MAX + 1] = "";
+ int ch;
+ char buf[8] = "";
- TclUtfToUniChar(errorString, &ch);
- buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
+ TclUtfToUCS4(errorString, &ch);
+ buf[TclUCS4ToUtf(ch, buf)] = '\0';
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"bad field specifier \"%s\"", buf));
return TCL_ERROR;
@@ -1592,11 +1592,11 @@ BinaryScanCmd(
badField:
{
- Tcl_UniChar ch = 0;
- char buf[TCL_UTF_MAX + 1] = "";
+ int ch;
+ char buf[8] = "";
- TclUtfToUniChar(errorString, &ch);
- buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
+ TclUtfToUCS4(errorString, &ch);
+ buf[TclUCS4ToUtf(ch, buf)] = '\0';
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"bad field specifier \"%s\"", buf));
return TCL_ERROR;
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 162a5a6..011164b 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1413,14 +1413,9 @@ StringIndexCmd(
Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1));
} else {
- char buf[TCL_UTF_MAX] = "";
+ char buf[8] = "";
- length = Tcl_UniCharToUtf(ch, buf);
-#if TCL_UTF_MAX > 3
- if ((ch >= 0xD800) && (length < 3)) {
- length += Tcl_UniCharToUtf(-1, buf + length);
- }
-#endif
+ length = TclUCS4ToUtf(ch, buf);
Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length));
}
}
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 5c46470..6f024a6 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3184,6 +3184,8 @@ MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes,
const char *trim, int numTrim);
MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct);
MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
+MODULE_SCOPE int TclUCS4ToUtf(int, char *);
+
/*
* Bytes F0-F4 are start-bytes for 4-byte sequences.
* Byte 0xED can be the start-byte of an upper surrogate. In that case,
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 7beaeea..23a07cf 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -843,7 +843,6 @@ TclParseBackslash(
* written there. */
{
register const char *p = src+1;
- Tcl_UniChar unichar = 0;
int result;
int count;
char buf[TCL_UTF_MAX] = "";
@@ -943,7 +942,7 @@ TclParseBackslash(
* No hexdigits -> This is just "U".
*/
result = 'U';
- } else if ((result | 0x7FF) == 0xDFFF) {
+ } else if ((result & ~0x7FF) == 0xD800) {
/* Upper or lower surrogate, not allowed in this syntax. */
result = 0xFFFD;
}
@@ -991,16 +990,15 @@ TclParseBackslash(
* #217987] test subst-3.2
*/
- if (Tcl_UtfCharComplete(p, numBytes - 1)) {
- count = TclUtfToUniChar(p, &unichar) + 1; /* +1 for '\' */
+ if (TclUCS4Complete(p, numBytes - 1)) {
+ count = TclUtfToUCS4(p, &result) + 1; /* +1 for '\' */
} else {
- char utfBytes[TCL_UTF_MAX];
+ char utfBytes[8];
- memcpy(utfBytes, p, (size_t) (numBytes - 1));
+ memcpy(utfBytes, p, numBytes - 1);
utfBytes[numBytes - 1] = '\0';
- count = TclUtfToUniChar(utfBytes, &unichar) + 1;
+ count = TclUtfToUCS4(utfBytes, &result) + 1;
}
- result = unichar;
break;
}
@@ -1008,13 +1006,12 @@ TclParseBackslash(
if (readPtr != NULL) {
*readPtr = count;
}
- count = Tcl_UniCharToUtf(result, dst);
-#if TCL_UTF_MAX > 3
- if ((result >= 0xD800) && (count < 3)) {
- count += Tcl_UniCharToUtf(-1, dst + count);
+#if TCL_UTF_MAX < 4
+ if (result > 0xFFFF) {
+ result = 0xFFFD;
}
#endif
- return count;
+ return TclUCS4ToUtf(result, dst);
}
/*
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 03a7ca9..a14ce71 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2354,7 +2354,7 @@ TclUniCharMatch(
* routine does not run off the end and dereference non-existent memory
* looking for trail bytes. If the source buffer is known to be '\0'
* terminated, this cannot happen. Otherwise, the caller should call
- * Tcl_UtfCharComplete() before calling this routine to ensure that
+ * TclUCS4Complete() before calling this routine to ensure that
* enough bytes remain in the string.
*
* Results:
@@ -2373,26 +2373,68 @@ TclUtfToUCS4(
int *ucs4Ptr) /* Filled with the UCS4 codepoint represented
* by the UTF-8 string. */
{
- int len, fullchar;
Tcl_UniChar ch = 0;
+ int len = Tcl_UtfToUniChar(src, &ch);
- len = TclUtfToUniChar(src, &ch);
- fullchar = ch;
-
-#if TCL_UTF_MAX == 4
- /* 4-byte UTF-8 is supported; decode surrogates */
-
- if ((ch >= 0xD800) && len < 3) {
- len += Tcl_UtfToUniChar(src + len, &ch);
- fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
+#if TCL_UTF_MAX <= 4
+ if ((ch & ~0x3FF) == 0xD800) {
+ Tcl_UniChar low = ch;
+ int len2 = Tcl_UtfToUniChar(src+len, &low);
+ if ((low & ~0x3FF) == 0xDC00) {
+ *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
+ return len + len2;
+ }
}
#endif
-
- *ucs4Ptr = fullchar;
+ *ucs4Ptr = (int)ch;
return len;
}
/*
+ *---------------------------------------------------------------------------
+ *
+ * TclUCS4ToUtf --
+ *
+ * Store the given Unicode character as a sequence of UTF-8 bytes in the
+ * provided buffer. Might output 6 bytes, if the code point > 0xFFFF.
+ *
+ * Results:
+ * The return value is the number of bytes that were stored in the
+ * buffer.
+ *
+ * Side effects:
+ * None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUCS4ToUtf(
+ int ch, /* Unicode character to be stored in the
+ * buffer. */
+ char *buf) /* Buffer in which the UTF-8 representation of
+ * the Unicode character is stored. Buffer must be
+ * large enough to hold the UTF-8 character(s)
+ * (at most 6 bytes). */
+{
+#if TCL_UTF_MAX <= 4
+ if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
+ /* ch is 0x10000..0x10FFFF: emit a surrogate pair (one 4-byte or 2 x 3-byte UTF-8 characters, depending on
+ * Tcl version and/or TCL_UTF_MAX build value). 0xDC00 already has bit 0x400 set, so "& 0x7FF" acts as "& 0x3FF". */
+ int len = Tcl_UniCharToUtf(0xD800 | ((ch - 0x10000) >> 10), buf);
+ return len + Tcl_UniCharToUtf(0xDC00 | (ch & 0x7FF), buf + len);
+ }
+#endif
+ if ((ch & ~0x7FF) == 0xD800) { /* Lone surrogate (0xD800..0xDFFF): store its 3-byte form directly. */
+ buf[2] = (char) ((ch | 0x80) & 0xBF);
+ buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ return 3;
+ }
+ return Tcl_UniCharToUtf(ch, buf); /* Everything else is delegated unchanged. */
+}
+
+/*
* Local Variables:
* mode: c
* c-basic-offset: 4