summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- generic/tcl.h 7
-rw-r--r-- generic/tclEncoding.c 119
-rw-r--r-- tests/cmdAH.test 3
3 files changed, 99 insertions, 30 deletions
diff --git a/generic/tcl.h b/generic/tcl.h
index b7d31aa..3fc53db 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -2140,6 +2140,7 @@ typedef struct Tcl_EncodingType {
/* Reserve top byte for profile values (disjoint) */
#define TCL_ENCODING_PROFILE_TCL8 0x01000000
#define TCL_ENCODING_PROFILE_STRICT 0x02000000
+#define TCL_ENCODING_PROFILE_REPLACE 0x03000000
#define TCL_ENCODING_PROFILE_MASK 0xFF000000
#define TCL_ENCODING_PROFILE_GET(flags_) ((flags_) & TCL_ENCODING_PROFILE_MASK)
#define TCL_ENCODING_PROFILE_SET(flags_, profile_) \
@@ -2151,13 +2152,9 @@ typedef struct Tcl_EncodingType {
#if TCL_MAJOR_VERSION < 9
#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8
#else
-#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 /* STRICT? TODO */
+#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 /* STRICT? REPLACE? TODO */
#endif
-#define TCL_ENCODING_EXTERNAL_FLAG_MASK \
- (TCL_ENCODING_START|TCL_ENCODING_END|TCL_ENCODING_STOPONERROR)
-
-
/*
* The following definitions are the error codes returned by the conversion
* routines:
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 85c2b6a..bb1f32f 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -193,8 +193,12 @@ Tcl_Encoding tclIdentityEncoding = NULL;
static struct TclEncodingProfiles {
const char *name;
int value;
-} encodingProfiles[] = {{"tcl8", TCL_ENCODING_PROFILE_TCL8},
- {"strict", TCL_ENCODING_PROFILE_STRICT}};
+} encodingProfiles[] = {
+ {"tcl8", TCL_ENCODING_PROFILE_TCL8},
+ {"strict", TCL_ENCODING_PROFILE_STRICT},
+ {"replace", TCL_ENCODING_PROFILE_REPLACE},
+};
+#define UNICODE_REPLACE_CHAR 0xFFFD
/*
* The following variable is used in the sparse matrix code for a
@@ -2336,7 +2340,7 @@ UtfToUtfProc(
void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
- int flags, /* Conversion control flags. */
+ int flags, /* TCL_ENCODING_* conversion control flags. */
TCL_UNUSED(Tcl_EncodingState *),
char *dst, /* Output buffer in which converted string is
* stored. */
@@ -2376,6 +2380,8 @@ UtfToUtfProc(
dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
+ int profile = TCL_ENCODING_PROFILE_GET(flags);
+
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
/*
* If there is more string to follow, this will ensure that the
@@ -2389,34 +2395,51 @@ UtfToUtfProc(
result = TCL_CONVERT_NOSPACE;
break;
}
- if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) {
+ /*
+ * TCL_ENCODING_MODIFIED is set when the target encoding is Tcl's
+ * internal UTF-8 modified version.
+ */
+ if (UCHAR(*src) < 0x80
+ && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) {
/*
- * Copy 7bit characters, but skip null-bytes when we are in input
- * mode, so that they get converted to 0xC080.
+ * Copy 7bit characters, but skip null-bytes when target encoding
+ * is Tcl's "modified" UTF-8. These need to be converted to
+ * \xC0\x80 as is done in a later branch.
*/
*dst++ = *src++;
- } else if ((UCHAR(*src) == 0xC0) &&
- (src + 1 < srcEnd) &&
- (UCHAR(src[1]) == 0x80) &&
- (!(flags & TCL_ENCODING_MODIFIED)
- || (STRICT_PROFILE(flags)))) {
+ }
+ else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd)
+ && (UCHAR(src[1]) == 0x80)
+ && (!(flags & TCL_ENCODING_MODIFIED)
+ || (profile == TCL_ENCODING_PROFILE_STRICT))) {
/*
- * If in input mode, and -strict or -failindex is specified: This is an error.
+ * \xC0\x80 and either strict profile or target is "real" UTF-8
+ * - Strict profile - error
+ * - Non-strict, real UTF-8 - output \x00
*/
if (flags & TCL_ENCODING_MODIFIED) {
+ /*
+ * TODO - should above check not be against STRICT?
+ * That would probably break a convertto command that goes
+ * from the internal UTF8 to the real UTF8. On the other
+ * hand this means, a strict UTF8->UTF8 transform is not
+ * possible using this function.
+ */
result = TCL_CONVERT_SYNTAX;
break;
}
/*
- * Convert 0xC080 to real nulls when we are in output mode, with or without '-strict'.
+ * Convert 0xC080 to real nulls when we are in output mode,
+ * irrespective of the profile.
*/
*dst++ = 0;
src += 2;
}
else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
/*
+ * Incomplete byte sequence.
* Always check before using TclUtfToUCS4. Not doing so can
* cause it to run beyond the end of the buffer! If we happen on such an
* incomplete char its bytes are made to represent themselves
@@ -2424,17 +2447,39 @@ UtfToUtfProc(
*/
if (flags & TCL_ENCODING_MODIFIED) {
- if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) {
- result = TCL_CONVERT_MULTIBYTE;
+ /* Incomplete bytes for modified UTF-8 target */
+ if (profile == TCL_ENCODING_PROFILE_STRICT) {
+ result = (flags & TCL_ENCODING_CHAR_LIMIT)
+ ? TCL_CONVERT_MULTIBYTE
+ : TCL_CONVERT_SYNTAX;
break;
}
- if (STRICT_PROFILE(flags)) {
- result = TCL_CONVERT_SYNTAX;
- break;
+ if (profile == TCL_ENCODING_PROFILE_REPLACE) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ /* TCL_ENCODING_PROFILE_TCL8 */
+ ch = UCHAR(*src);
}
- ch = UCHAR(*src++);
- } else {
+ ++src;
+ }
+ else {
+ /*
+ * Incomplete bytes for real UTF-8 target.
+ * TODO - no profile check here because the pre-profile code did
+ * not have any checks. Why? Is it because on
+ * output a valid internal utf-8 stream is assumed?
+ */
char chbuf[2];
+ /*
+ * TODO - this code seems broken to me.
+ * - it does not check profiles
+ * - generates invalid output for real UTF-8 target
+ * (consider \xC2)
+ * A possible explanation is this behavior matches the
+ * Tcl8 decoding behavior of mapping invalid bytes to the same
+ * code point value. Still, at least strictness checks should
+ * be made.
+ */
chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
TclUtfToUCS4(chbuf, &ch);
}
@@ -2444,11 +2489,31 @@ UtfToUtfProc(
int low;
const char *saveSrc = src;
size_t len = TclUtfToUCS4(src, &ch);
+
+ /*
+ * Valid single char encodings were already handled earlier.
+ * So len==1 means an invalid byte that is magically transformed
+ * to a code point unless it resulted from the special
+ * \xC0\x80 sequence. Tests io-75.*
+ * TODO - the check below could probably be simplified by removing
+ * the MODIFIED expression, given the checks already made above.
+ */
+#if 0
if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)
- && STRICT_PROFILE(flags)) {
+ && (profile == TCL_ENCODING_PROFILE_STRICT)) {
result = TCL_CONVERT_SYNTAX;
break;
}
+#else
+ if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)) {
+ if (profile == TCL_ENCODING_PROFILE_STRICT) {
+ result = TCL_CONVERT_SYNTAX;
+ break;
+ } else if (profile == TCL_ENCODING_PROFILE_REPLACE) {
+ ch = UNICODE_REPLACE_CHAR;
+ }
+ }
+#endif
src += len;
if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) {
if (ch > 0xFFFF) {
@@ -2464,13 +2529,14 @@ UtfToUtfProc(
/*
* A surrogate character is detected; handle it specially.
*/
+ /* TODO - what about REPLACE profile? */
low = ch;
len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
- if (STOPONERROR) {
+ if (profile == TCL_ENCODING_PROFILE_STRICT) {
result = TCL_CONVERT_UNKNOWN;
src = saveSrc;
break;
@@ -2484,12 +2550,14 @@ UtfToUtfProc(
src += len;
dst += Tcl_UniCharToUtf(ch, dst);
ch = low;
- } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch & ~0x7FF) == 0xD800))) {
+ } else if ((profile == TCL_ENCODING_PROFILE_STRICT) &&
+ !(flags & TCL_ENCODING_MODIFIED) &&
+ (((ch & ~0x7FF) == 0xD800))) {
result = TCL_CONVERT_UNKNOWN;
src = saveSrc;
break;
- } else if (STRICT_PROFILE(flags) &&
- (flags & TCL_ENCODING_MODIFIED) &&
+ } else if ((profile == TCL_ENCODING_PROFILE_STRICT) &&
+ (flags & TCL_ENCODING_MODIFIED) &&
((ch & ~0x7FF) == 0xD800)) {
result = TCL_CONVERT_SYNTAX;
src = saveSrc;
@@ -4216,6 +4284,7 @@ int TclEncodingExternalFlagsToInternal(int flags)
switch (profile) {
case TCL_ENCODING_PROFILE_TCL8:
case TCL_ENCODING_PROFILE_STRICT:
+ case TCL_ENCODING_PROFILE_REPLACE:
break;
case 0: /* Unspecified by caller */
default:
diff --git a/tests/cmdAH.test b/tests/cmdAH.test
index c4053a2..52e7ac3 100644
--- a/tests/cmdAH.test
+++ b/tests/cmdAH.test
@@ -228,6 +228,9 @@ set encInvalidBytes {
utf-8 A\xed\xb0\x80B default A\uDC00B -1 Low-surrogate
utf-8 A\xed\xb0\x80B tcl8 A\uDC00B -1 Low-surrogate
utf-8 A\xed\xb0\x80B strict A 1 Low-surrogate
+ utf-8 \xed\xa0\x80\xed\xb0\x80 default \U00010000 -1 High-low-surrogate
+ utf-8 \xed\xa0\x80\xed\xb0\x80 tcl8 \U00010000 -1 High-low-surrogate
+ utf-8 \xed\xa0\x80\xed\xb0\x80 strict \U00010000 0 High-low-surrogate
utf-32le \x00\xD8\x00\x00 default \uD800 -1 {High-surrogate}
utf-32le \x00\xD8\x00\x00 tcl8 \uD800 -1 {High-surrogate}