summaryrefslogtreecommitdiffstats
path: root/generic
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2022-03-03 13:05:38 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2022-03-03 13:05:38 (GMT)
commitb488f3edf6ee202281aca13745c0d4212310f654 (patch)
tree068d3d40fef160ee211303889016729e228b15ec /generic
parent9a1c1f5e11679feeaafd9c788631fc98faf6945e (diff)
downloadtcl-b488f3edf6ee202281aca13745c0d4212310f654.zip
tcl-b488f3edf6ee202281aca13745c0d4212310f654.tar.gz
tcl-b488f3edf6ee202281aca13745c0d4212310f654.tar.bz2
TIP #619 implementation. tests not working yet
Diffstat (limited to 'generic')
-rw-r--r--generic/tcl.h7
-rw-r--r--generic/tclCmdMZ.c2
-rw-r--r--generic/tclDecls.h8
-rw-r--r--generic/tclEncoding.c21
-rw-r--r--generic/tclParse.c7
-rw-r--r--generic/tclUtf.c18
6 files changed, 48 insertions, 15 deletions
diff --git a/generic/tcl.h b/generic/tcl.h
index 6b69929..8778203 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -835,6 +835,13 @@ typedef struct Tcl_DString {
#define TCL_INDEX_NULL_OK 4
/*
+ * Flags that may be passed to Tcl_UniCharToUtf.
+ * TCL_COMBINE Combine surrogates
+ */
+
+#define TCL_COMBINE 0x200000
+
+/*
*----------------------------------------------------------------------------
* Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv.
* WARNING: these bit choices must not conflict with the bit choices for
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 85174ec..b50eacb 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1432,9 +1432,11 @@ StringIndexCmd(
char buf[4] = "";
end = Tcl_UniCharToUtf(ch, buf);
+#if TCL_UTF_MAX < 4
if ((ch >= 0xD800) && (end < 3)) {
end += Tcl_UniCharToUtf(-1, buf + end);
}
+#endif
Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, end));
}
}
diff --git a/generic/tclDecls.h b/generic/tclDecls.h
index 9205401..d073edd 100644
--- a/generic/tclDecls.h
+++ b/generic/tclDecls.h
@@ -3919,6 +3919,14 @@ extern const TclStubs *tclStubsPtr;
# define Tcl_UtfToUniChar Tcl_UtfToChar16
# undef Tcl_UniCharLen
# define Tcl_UniCharLen Tcl_Char16Len
+# undef Tcl_UniCharToUtf
+# if defined(USE_TCL_STUBS)
+# define Tcl_UniCharToUtf(c, p) \
+ (tclStubsPtr->tcl_UniCharToUtf((c)|TCL_COMBINE, (p)))
+# else
+# define Tcl_UniCharToUtf(c, p) \
+ ((Tcl_UniCharToUtf)((c)|TCL_COMBINE, (p)))
+# endif
#endif
#if defined(USE_TCL_STUBS)
# define Tcl_WCharToUtfDString (sizeof(wchar_t) != sizeof(short) \
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 3a6385f..765f98b 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2228,7 +2228,6 @@ UtfToUtfProc(
}
dst += Tcl_UniCharToUtf(ch, dst);
} else {
- int low;
const char *saveSrc = src;
size_t len = TclUtfToUCS4(src, &ch);
if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)
@@ -2246,13 +2245,20 @@ UtfToUtfProc(
*dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
ch = (ch & 0x0CFF) | 0xDC00;
}
- goto cesu8;
+#if TCL_UTF_MAX < 4
+ cesu8:
+#endif
+ *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
+ *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
+ *dst++ = (char) ((ch | 0x80) & 0xBF);
+ continue;
+#if TCL_UTF_MAX < 4
} else if ((ch | 0x7FF) == 0xDFFF) {
/*
* A surrogate character is detected, handle especially.
*/
- low = ch;
+ int low = ch;
len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
@@ -2261,15 +2267,12 @@ UtfToUtfProc(
src = saveSrc;
break;
}
- cesu8:
- *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
- *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
- *dst++ = (char) ((ch | 0x80) & 0xBF);
- continue;
+ goto cesu8;
}
src += len;
dst += Tcl_UniCharToUtf(ch, dst);
ch = low;
+#endif
} else if (!Tcl_UniCharIsUnicode(ch)) {
if (flags & TCL_ENCODING_STOPONERROR) {
result = TCL_CONVERT_UNKNOWN;
@@ -2578,7 +2581,7 @@ Utf16ToUtfProc(
if (ch && ch < 0x80) {
*dst++ = (ch & 0xFF);
} else {
- dst += Tcl_UniCharToUtf(ch, dst);
+ dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
}
src += sizeof(unsigned short);
}
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 614401f..fdd1478 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -869,6 +869,7 @@ TclParseBackslash(
* No hexdigits -> This is just "u".
*/
result = 'u';
+#if TCL_UTF_MAX < 4
} else if (((result & 0xFC00) == 0xD800) && (count == 6)
&& (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) {
/* If high surrogate is immediately followed by a low surrogate
@@ -879,6 +880,7 @@ TclParseBackslash(
result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000;
count += count2 + 2;
}
+#endif
}
break;
case 'U':
@@ -888,9 +890,6 @@ TclParseBackslash(
* No hexdigits -> This is just "U".
*/
result = 'U';
- } else if ((result | 0x7FF) == 0xDFFF) {
- /* Upper or lower surrogate, not allowed in this syntax. */
- result = 0xFFFD;
}
break;
case '\n':
@@ -954,10 +953,12 @@ TclParseBackslash(
*readPtr = count;
}
count = Tcl_UniCharToUtf(result, dst);
+#if TCL_UTF_MAX < 4
if ((result >= 0xD800) && (count < 3)) {
/* Special case for handling high surrogates. */
count += Tcl_UniCharToUtf(-1, dst + count);
}
+#endif
return count;
}
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index e353b7f..a04e41c 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -208,15 +208,23 @@ Invalid(
*---------------------------------------------------------------------------
*/
+#undef Tcl_UniCharToUtf
int
Tcl_UniCharToUtf(
int ch, /* The Tcl_UniChar to be stored in the
- * buffer. */
+ * buffer. Can be or'ed with flag TCL_COMBINE */
char *buf) /* Buffer in which the UTF-8 representation of
* the Tcl_UniChar is stored. Buffer must be
* large enough to hold the UTF-8 character
* (at most 4 bytes). */
{
+#if TCL_UTF_MAX > 3
+ int flags = ch;
+#endif
+
+ if (ch >= TCL_COMBINE) {
+ ch &= (TCL_COMBINE - 1);
+ }
if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
buf[0] = (char) ch;
return 1;
@@ -228,7 +236,11 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
- if ((ch & 0xF800) == 0xD800) {
+ if (
+#if TCL_UTF_MAX > 3
+ (flags & TCL_COMBINE) &&
+#endif
+ ((ch & 0xF800) == 0xD800)) {
if (ch & 0x0400) {
/* Low surrogate */
if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) {
@@ -377,7 +389,7 @@ Tcl_Char16ToUtfDString(
/* Special case for handling high surrogates. */
p += Tcl_UniCharToUtf(-1, p);
}
- len = Tcl_UniCharToUtf(*w, p);
+ len = Tcl_UniCharToUtf(*w | TCL_COMBINE, p);
p += len;
if ((*w >= 0xD800) && (len < 3)) {
len = 0; /* Indication that high surrogate was found */