summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--generic/tclCmdMZ.c34
-rw-r--r--generic/tclInt.h1
-rw-r--r--generic/tclUtf.c55
3 files changed, 61 insertions, 29 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index d344678..23370a8 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1081,23 +1081,10 @@ Tcl_SplitObjCmd(
Tcl_InitHashTable(&charReuseTable, TCL_ONE_WORD_KEYS);
for ( ; stringPtr < end; stringPtr += len) {
- int fullchar;
- len = TclUtfToUniChar(stringPtr, &ch);
- fullchar = ch;
-
-#if TCL_UTF_MAX == 4
- if ((ch >= 0xD800) && (len < 3)) {
- len += TclUtfToUniChar(stringPtr + len, &ch);
- fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
- }
-#endif
+ int ucs4;
- /*
- * Assume Tcl_UniChar is an integral type...
- */
-
- hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(fullchar),
- &isNew);
+ len = TclUtfToUCS4(stringPtr, &ucs4);
+ hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(ucs4), &isNew);
if (isNew) {
TclNewStringObj(objPtr, stringPtr, len);
@@ -1466,7 +1453,6 @@ StringIsCmd(
Tcl_Obj *const objv[]) /* Argument objects. */
{
const char *string1, *end, *stop;
- Tcl_UniChar ch = 0;
int (*chcomp)(int) = NULL; /* The UniChar comparison function. */
int i, failat = 0, result = 1, strict = 0, index, length1, length2;
Tcl_Obj *objPtr, *failVarObj = NULL;
@@ -1797,16 +1783,10 @@ StringIsCmd(
}
end = string1 + length1;
for (; string1 < end; string1 += length2, failat++) {
- int fullchar;
- length2 = TclUtfToUniChar(string1, &ch);
- fullchar = ch;
-#if TCL_UTF_MAX == 4
- if ((ch >= 0xD800) && (length2 < 3)) {
- length2 += TclUtfToUniChar(string1 + length2, &ch);
- fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
- }
-#endif
- if (!chcomp(fullchar)) {
+ int ucs4;
+
+ length2 = TclUtfToUCS4(string1, &ucs4);
+ if (!chcomp(ucs4)) {
result = 0;
break;
}
diff --git a/generic/tclInt.h b/generic/tclInt.h
index c30a257..74b2cc9 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3186,6 +3186,7 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes,
MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes,
const char *trim, int numTrim);
MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct);
+MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData);
MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr);
MODULE_SCOPE int TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr,
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index c58f5a9..0db06bd 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -278,8 +278,8 @@ Tcl_UniCharToUtfDString(
* If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done:
* For any UTF-8 string containing a character outside of the BMP, the
* first call to this function will fill *chPtr with the high surrogate
- * and generate a return value of 0. Calling Tcl_UtfToUniChar again
- * will produce the low surrogate and a return value of 4. Because *chPtr
+ * and generate a return value of 1. Calling Tcl_UtfToUniChar again
+ * will produce the low surrogate and a return value of 3. Because *chPtr
* is used to remember whether the high surrogate is already produced, it
* is recommended to initialize the variable it points to as 0 before
* the first call to Tcl_UtfToUniChar is done.
@@ -2156,6 +2156,57 @@ TclUniCharMatch(
}
/*
+ *---------------------------------------------------------------------------
+ *
+ * TclUtfToUCS4 --
+ *
+ *      Extract the 4-byte codepoint from the leading bytes of the
+ *      Modified UTF-8 string "src". This is a utility routine to
+ *      contain the surrogate gymnastics in one place.
+ *
+ *      When TCL_UTF_MAX == 4, a character outside the BMP is delivered by
+ *      the underlying UniChar decoder as a surrogate pair over two
+ *      successive calls; this routine performs both calls and combines
+ *      the two halves into a single code point.
+ *
+ *      The caller must ensure that the source buffer is long enough that this
+ *      routine does not run off the end and dereference non-existent memory
+ *      looking for trail bytes. If the source buffer is known to be '\0'
+ *      terminated, this cannot happen. Otherwise, the caller should call
+ *      Tcl_UtfCharComplete() before calling this routine to ensure that
+ *      enough bytes remain in the string.
+ *
+ * Results:
+ *      *ucs4Ptr is filled with the UCS4 code point, and the return value is
+ *      the number of bytes from the UTF-8 string that were consumed.
+ *
+ * Side effects:
+ *      None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUtfToUCS4(
+    const char *src,            /* The UTF-8 string. */
+    int *ucs4Ptr)               /* Filled with the UCS4 codepoint represented
+                                 * by the UTF-8 string. */
+{
+    int len, fullchar;
+    Tcl_UniChar ch = 0;         /* Must start at 0: the decoder uses it to
+                                 * remember a pending high surrogate. */
+
+    len = TclUtfToUniChar(src, &ch);
+    fullchar = ch;
+
+#if TCL_UTF_MAX == 4
+    /*
+     * 4-byte UTF-8 is supported; decode surrogates. A value >= 0xD800 that
+     * was produced from fewer than 3 consumed bytes is the high half of a
+     * surrogate pair; a second call yields the low half, and the two are
+     * combined into one code point above 0xFFFF.
+     */
+
+    if ((ch >= 0xD800) && (len < 3)) {
+        len += Tcl_UtfToUniChar(src + len, &ch);
+        fullchar = (((fullchar & 0x3FF) << 10) | (ch & 0x3FF)) + 0x10000;
+    }
+#endif
+
+    *ucs4Ptr = fullchar;
+    return len;
+}
+
+/*
* Local Variables:
* mode: c
* c-basic-offset: 4