diff options
-rw-r--r-- | generic/tcl.h | 7 | ||||
-rw-r--r-- | generic/tclEncoding.c | 119 | ||||
-rw-r--r-- | tests/cmdAH.test | 3 |
3 files changed, 99 insertions, 30 deletions
diff --git a/generic/tcl.h b/generic/tcl.h index b7d31aa..3fc53db 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2140,6 +2140,7 @@ typedef struct Tcl_EncodingType { /* Reserve top byte for profile values (disjoint) */ #define TCL_ENCODING_PROFILE_TCL8 0x01000000 #define TCL_ENCODING_PROFILE_STRICT 0x02000000 +#define TCL_ENCODING_PROFILE_REPLACE 0x03000000 #define TCL_ENCODING_PROFILE_MASK 0xFF000000 #define TCL_ENCODING_PROFILE_GET(flags_) ((flags_) & TCL_ENCODING_PROFILE_MASK) #define TCL_ENCODING_PROFILE_SET(flags_, profile_) \ @@ -2151,13 +2152,9 @@ typedef struct Tcl_EncodingType { #if TCL_MAJOR_VERSION < 9 #define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 #else -#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 /* STRICT? TODO */ +#define TCL_ENCODING_PROFILE_DEFAULT TCL_ENCODING_PROFILE_TCL8 /* STRICT? REPLACE? TODO */ #endif -#define TCL_ENCODING_EXTERNAL_FLAG_MASK \ - (TCL_ENCODING_START|TCL_ENCODING_END|TCL_ENCODING_STOPONERROR) - - /* * The following definitions are the error codes returned by the conversion * routines: diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 85c2b6a..bb1f32f 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -193,8 +193,12 @@ Tcl_Encoding tclIdentityEncoding = NULL; static struct TclEncodingProfiles { const char *name; int value; -} encodingProfiles[] = {{"tcl8", TCL_ENCODING_PROFILE_TCL8}, - {"strict", TCL_ENCODING_PROFILE_STRICT}}; +} encodingProfiles[] = { + {"tcl8", TCL_ENCODING_PROFILE_TCL8}, + {"strict", TCL_ENCODING_PROFILE_STRICT}, + {"replace", TCL_ENCODING_PROFILE_REPLACE}, +}; +#define UNICODE_REPLACE_CHAR 0xFFFD /* * The following variable is used in the sparse matrix code for a @@ -2336,7 +2340,7 @@ UtfToUtfProc( void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ - int flags, /* Conversion control flags. */ + int flags, /* TCL_ENCODING_* conversion control flags. */ TCL_UNUSED(Tcl_EncodingState *), char *dst, /* Output buffer in which converted string is * stored. */ @@ -2376,6 +2380,8 @@ UtfToUtfProc( dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { + int profile = TCL_ENCODING_PROFILE_GET(flags); + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the @@ -2389,34 +2395,51 @@ UtfToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) { + /* + * TCL_ENCODING_MODIFIED is set when the target encoding is Tcl's + * internal UTF-8 modified version. + */ + if (UCHAR(*src) < 0x80 + && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) { /* - * Copy 7bit characters, but skip null-bytes when we are in input - * mode, so that they get converted to 0xC080. + * Copy 7bit characters, but skip null-bytes when target encoding + * is Tcl's "modified" UTF-8. These need to be converted to + * \xC0\x80 as is done in a later branch. */ *dst++ = *src++; - } else if ((UCHAR(*src) == 0xC0) && - (src + 1 < srcEnd) && - (UCHAR(src[1]) == 0x80) && - (!(flags & TCL_ENCODING_MODIFIED) - || (STRICT_PROFILE(flags)))) { + } + else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) + && (UCHAR(src[1]) == 0x80) + && (!(flags & TCL_ENCODING_MODIFIED) + || (profile == TCL_ENCODING_PROFILE_STRICT))) { /* - * If in input mode, and -strict or -failindex is specified: This is an error. + * \xC0\x80 and either strict profile or target is "real" UTF-8 + * - Strict profile - error + * - Non-strict, real UTF-8 - output \x00 */ if (flags & TCL_ENCODING_MODIFIED) { + /* + * TODO - should above check not be against STRICT? + * That would probably break a convertto command that goes + * from the internal UTF8 to the real UTF8. On the other + * hand this means, a strict UTF8->UTF8 transform is not + * possible using this function. + */ result = TCL_CONVERT_SYNTAX; break; } /* - * Convert 0xC080 to real nulls when we are in output mode, with or without '-strict'. + * Convert 0xC080 to real nulls when we are in output mode, + * irrespective of the profile. */ *dst++ = 0; src += 2; } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* + * Incomplete byte sequence. * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an * incomplete char its bytes are made to represent themselves @@ -2424,17 +2447,39 @@ UtfToUtfProc( */ if (flags & TCL_ENCODING_MODIFIED) { - if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) { - result = TCL_CONVERT_MULTIBYTE; + /* Incomplete bytes for modified UTF-8 target */ + if (profile == TCL_ENCODING_PROFILE_STRICT) { + result = (flags & TCL_ENCODING_CHAR_LIMIT) + ? TCL_CONVERT_MULTIBYTE + : TCL_CONVERT_SYNTAX; break; } - if (STRICT_PROFILE(flags)) { - result = TCL_CONVERT_SYNTAX; - break; + if (profile == TCL_ENCODING_PROFILE_REPLACE) { + ch = UNICODE_REPLACE_CHAR; + } else { + /* TCL_ENCODING_PROFILE_TCL8 */ + ch = UCHAR(*src); } - ch = UCHAR(*src++); - } else { + ++src; + } + else { + /* + * Incomplete bytes for real UTF-8 target. + * TODO - no profile check here because did not have any + * checks in the pre-profile code. Why? Is it because on + * output a valid internal utf-8 stream is assumed? + */ char chbuf[2]; + /* + * TODO - this code seems broken to me. + * - it does not check profiles + * - generates invalid output for real UTF-8 target + * (consider \xC2) + * A possible explanation is this behavior matches the + * Tcl8 decoding behavior of mapping invalid bytes to the same + * code point value. Still, at least strictness checks should + * be made. + */ chbuf[0] = UCHAR(*src++); chbuf[1] = 0; TclUtfToUCS4(chbuf, &ch); } @@ -2444,11 +2489,31 @@ UtfToUtfProc( int low; const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); + + /* + * Valid single char encodings were already handled earlier. + * So len==1 means an invalid byte that is magically transformed + * to a code point unless it resulted from the special + * \xC0\x80 sequence. Tests io-75.* + * TODO - below check could be simplified to remove the MODIFIED + * expression I think given the checks already made above. May be. + */ +#if 0 if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED) - && STRICT_PROFILE(flags)) { + && (profile == TCL_ENCODING_PROFILE_STRICT)) { result = TCL_CONVERT_SYNTAX; break; } +#else + if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)) { + if (profile == TCL_ENCODING_PROFILE_STRICT) { + result = TCL_CONVERT_SYNTAX; + break; + } else if (profile == TCL_ENCODING_PROFILE_REPLACE) { + ch = UNICODE_REPLACE_CHAR; + } + } +#endif src += len; if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) { if (ch > 0xFFFF) { @@ -2464,13 +2529,14 @@ UtfToUtfProc( /* * A surrogate character is detected, handle especially. */ + /* TODO - what about REPLACE profile? */ low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { - if (STOPONERROR) { + if (profile == TCL_ENCODING_PROFILE_STRICT) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; @@ -2484,12 +2550,14 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; - } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch & ~0x7FF) == 0xD800))) { + } else if ((profile == TCL_ENCODING_PROFILE_STRICT) && + !(flags & TCL_ENCODING_MODIFIED) && + (((ch & ~0x7FF) == 0xD800))) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; - } else if (STRICT_PROFILE(flags) && - (flags & TCL_ENCODING_MODIFIED) && + } else if ((profile == TCL_ENCODING_PROFILE_STRICT) && + (flags & TCL_ENCODING_MODIFIED) && ((ch & ~0x7FF) == 0xD800)) { result = TCL_CONVERT_SYNTAX; src = saveSrc; @@ -4216,6 +4284,7 @@ int TclEncodingExternalFlagsToInternal(int flags) switch (profile) { case TCL_ENCODING_PROFILE_TCL8: case TCL_ENCODING_PROFILE_STRICT: + case TCL_ENCODING_PROFILE_REPLACE: break; case 0: /* Unspecified by caller */ default: diff --git a/tests/cmdAH.test b/tests/cmdAH.test index c4053a2..52e7ac3 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -228,6 +228,9 @@ set encInvalidBytes { utf-8 A\xed\xb0\x80B default A\uDC00B -1 Low-surrogate utf-8 A\xed\xb0\x80B tcl8 A\uDC00B -1 Low-surrogate utf-8 A\xed\xb0\x80B strict A 1 Low-surrogate + utf-8 \xed\xa0\x80\xed\xb0\x80 default \U00010000 -1 High-low-surrogate + utf-8 \xed\xa0\x80\xed\xb0\x80 tcl8 \U00010000 -1 High-low-surrogate + utf-8 \xed\xa0\x80\xed\xb0\x80 strict \U00010000 0 High-low-surrogate utf-32le \x00\xD8\x00\x00 default \uD800 -1 {High-surrogate} utf-32le \x00\xD8\x00\x00 tcl8 \uD800 -1 {High-surrogate} |