summaryrefslogtreecommitdiffstats
path: root/generic
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2020-05-04 12:31:51 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2020-05-04 12:31:51 (GMT)
commit7759441602f14b3772186ea0d87f61ffdafc8404 (patch)
treec8aac29d738caa3756201b63f7a4b9a81f27ad47 /generic
parent66197cab8b6c43b474a6dceae32fc95f4eed37b9 (diff)
downloadtcl-7759441602f14b3772186ea0d87f61ffdafc8404.zip
tcl-7759441602f14b3772186ea0d87f61ffdafc8404.tar.gz
tcl-7759441602f14b3772186ea0d87f61ffdafc8404.tar.bz2
New internal function TclGetUCS4() only available when TCL_UTF_MAX=4. This fixes all "knownBug" testcases related to tip389.
Diffstat (limited to 'generic')
-rw-r--r--generic/tclCmdMZ.c2
-rw-r--r--generic/tclExecute.c12
-rw-r--r--generic/tclInt.h7
-rw-r--r--generic/tclStringObj.c78
-rw-r--r--generic/tclUtf.c6
5 files changed, 91 insertions, 14 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 011164b..7516208 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1401,7 +1401,7 @@ StringIndexCmd(
}
if ((index >= 0) && (index < length)) {
- Tcl_UniChar ch = Tcl_GetUniChar(objv[1], index);
+ int ch = TclGetUCS4(objv[1], index);
/*
* If we have a ByteArray object, we're careful to generate a new
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index f38e752..eeb69de 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -5571,16 +5571,10 @@ TEBCresume(
objResultPtr = Tcl_NewStringObj((const char *)
valuePtr->bytes+index, 1);
} else {
- char buf[4] = "";
- Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index);
+ char buf[8] = "";
+ int ch = TclGetUCS4(valuePtr, index);
- /*
- * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1)
- * but creating the object as a string seems to be faster in
- * practical use.
- */
-
- length = Tcl_UniCharToUtf(ch, buf);
+ length = TclUCS4ToUtf(ch, buf);
objResultPtr = Tcl_NewStringObj(buf, length);
}
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 6f024a6..8983659 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3183,8 +3183,13 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes,
MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes,
const char *trim, int numTrim);
MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct);
-MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
+MODULE_SCOPE int TclUtfToUCS4(const char *, int *);
MODULE_SCOPE int TclUCS4ToUtf(int, char *);
+#if TCL_UTF_MAX == 4
+ MODULE_SCOPE int TclGetUCS4(Tcl_Obj *, int);
+#else
+ #define TclGetUCS4 Tcl_GetUniChar
+#endif
/*
* Bytes F0-F4 are start-bytes for 4-byte sequences.
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 3ce8281..656d6ce 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -519,10 +519,10 @@ TclCheckEmptyString (
/*
*----------------------------------------------------------------------
*
- * Tcl_GetUniChar --
+ * Tcl_GetUniChar/TclGetUCS4 --
*
* Get the index'th Unicode character from the String object. If index
- * is out of range, the result = 0xFFFD;
+ * is out of range, the result = 0xFFFD (Tcl_GetUniChar) resp. -1 (TclGetUCS4)
*
* Results:
* Returns the index'th Unicode character in the Object.
@@ -587,6 +587,80 @@ Tcl_GetUniChar(
}
return stringPtr->unicode[index];
}
+
+#if TCL_UTF_MAX == 4
+int
+TclGetUCS4(
+ Tcl_Obj *objPtr, /* The object to get the Unicode charater
+ * from. */
+ int index) /* Get the index'th Unicode character. */
+{
+ String *stringPtr;
+ int ch, length;
+
+ if (index < 0) {
+ return -1;
+ }
+
+ /*
+ * Optimize the case where we're really dealing with a bytearray object
+ * we don't need to convert to a string to perform the indexing operation.
+ */
+
+ if (TclIsPureByteArray(objPtr)) {
+ unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length);
+ if (index >= length) {
+ return -1;
+ }
+
+ return (int) bytes[index];
+ }
+
+ /*
+ * OK, need to work with the object as a string.
+ */
+
+ SetStringFromAny(NULL, objPtr);
+ stringPtr = GET_STRING(objPtr);
+
+ if (stringPtr->hasUnicode == 0) {
+ /*
+ * If numChars is unknown, compute it.
+ */
+
+ if (stringPtr->numChars == -1) {
+ TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length);
+ }
+ if (stringPtr->numChars == objPtr->length) {
+ return (Tcl_UniChar) objPtr->bytes[index];
+ }
+ FillUnicodeRep(objPtr);
+ stringPtr = GET_STRING(objPtr);
+ }
+
+ if (index >= stringPtr->numChars) {
+ return -1;
+ }
+ ch = stringPtr->unicode[index];
+#if TCL_UTF_MAX <= 4
+ /* See: bug [11ae2be95dac9417] */
+ if ((ch & 0xF800) == 0xD800) {
+ if (ch & 0x400) {
+ if ((index > 0)
+ && ((stringPtr->unicode[index-1] & 0xFC00) == 0xD800)) {
+ ch = -1; /* low surrogate preceded by high surrogate */
+ }
+ } else if ((++index < stringPtr->numChars)
+ && ((stringPtr->unicode[index] & 0xFC00) == 0xDC00)) {
+ /* high surrogate followed by low surrogate */
+ ch = (((ch & 0x3FF) << 10) |
+ (stringPtr->unicode[index] & 0x3FF)) + 0x10000;
+ }
+ }
+#endif
+ return ch;
+}
+#endif
/*
*----------------------------------------------------------------------
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index ab3c577..9714204 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -2383,7 +2383,8 @@ TclUtfToUCS4(
*
* Results:
* The return values is the number of bytes in the buffer that were
- * consumed.
+ * consumed. If ch == -1, this function outputs 0 bytes (empty string),
+ * since TclGetUCS4 returns -1 for out-of-range indices.
*
* Side effects:
* None.
@@ -2414,6 +2415,9 @@ TclUCS4ToUtf(
buf[0] = (char) ((ch >> 12) | 0xE0);
return 3;
}
+ if (ch == -1) {
+ return 0;
+ }
return Tcl_UniCharToUtf(ch, buf);
}