diff options
author | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-12 06:15:59 (GMT) |
---|---|---|
committer | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-12 06:15:59 (GMT) |
commit | 486c0943f4f861dd0bf46b6d7298f42253b6551b (patch) | |
tree | 3789853c2696ebe5056e497bbb5e889cb6b2b668 /generic/tclEncoding.c | |
parent | 3a88e21d0e30e5ad85ac8435d4c551bf751489e9 (diff) | |
download | tcl-486c0943f4f861dd0bf46b6d7298f42253b6551b.zip tcl-486c0943f4f861dd0bf46b6d7298f42253b6551b.tar.gz tcl-486c0943f4f861dd0bf46b6d7298f42253b6551b.tar.bz2 |
Minor readability changes
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 101 |
1 files changed, 64 insertions, 37 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index bb1f32f..d2f3551 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -198,7 +198,20 @@ static struct TclEncodingProfiles { {"strict", TCL_ENCODING_PROFILE_STRICT}, {"replace", TCL_ENCODING_PROFILE_REPLACE}, }; +#define PROFILE_STRICT(flags_) \ + ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \ + || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT)) + +#define PROFILE_REPLACE(flags_) \ + ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE) \ + || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ + && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_REPLACE)) + #define UNICODE_REPLACE_CHAR 0xFFFD +#define SURROGATE(c_) (((c_) & ~0x7FF) == 0xD800) +#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800) +#define LOW_SURROGATE(c_) (((c_) & ~0x3FF) == 0xDC00) /* * The following variable is used in the sparse matrix code for a @@ -243,6 +256,7 @@ static Tcl_EncodingConvertProc UtfToUtfProc; static Tcl_EncodingConvertProc Iso88591FromUtfProc; static Tcl_EncodingConvertProc Iso88591ToUtfProc; + /* * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field * of the internalrep. This should help the lifetime of encodings be more useful. @@ -2328,13 +2342,6 @@ BinaryProc( *------------------------------------------------------------------------- */ -#define STRICT_PROFILE(flags_) \ - ((TCL_ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_STRICT) \ - || (TCL_ENCODING_PROFILE_GET(flags_) == 0 \ - && TCL_ENCODING_PROFILE_DEFAULT == TCL_ENCODING_PROFILE_STRICT)) - -#define STOPONERROR STRICT_PROFILE(flags) - static int UtfToUtfProc( void *clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */ @@ -2412,7 +2419,7 @@ UtfToUtfProc( else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) - || (profile == TCL_ENCODING_PROFILE_STRICT))) { + || PROFILE_STRICT(profile))) { /* * \xC0\x80 and either strict profile or target is "real" UTF-8 * - Strict profile - error @@ -2448,13 +2455,13 @@ UtfToUtfProc( if (flags & TCL_ENCODING_MODIFIED) { /* Incomplete bytes for modified UTF-8 target */ - if (profile == TCL_ENCODING_PROFILE_STRICT) { + if (PROFILE_STRICT(profile)) { result = (flags & TCL_ENCODING_CHAR_LIMIT) ? TCL_CONVERT_MULTIBYTE : TCL_CONVERT_SYNTAX; break; } - if (profile == TCL_ENCODING_PROFILE_REPLACE) { + if (PROFILE_REPLACE(profile)) { ch = UNICODE_REPLACE_CHAR; } else { /* TCL_ENCODING_PROFILE_TCL8 */ @@ -2506,10 +2513,10 @@ UtfToUtfProc( } #else if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED)) { - if (profile == TCL_ENCODING_PROFILE_STRICT) { + if (PROFILE_STRICT(profile)) { result = TCL_CONVERT_SYNTAX; break; - } else if (profile == TCL_ENCODING_PROFILE_REPLACE) { + } else if (PROFILE_REPLACE(profile)) { ch = UNICODE_REPLACE_CHAR; } } @@ -2534,9 +2541,9 @@ UtfToUtfProc( low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; - if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { + if ((!LOW_SURROGATE(low)) || (ch & 0x400)) { - if (profile == TCL_ENCODING_PROFILE_STRICT) { + if (PROFILE_STRICT(profile)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; @@ -2550,15 +2557,15 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; - } else if ((profile == TCL_ENCODING_PROFILE_STRICT) && - !(flags & TCL_ENCODING_MODIFIED) && - (((ch & ~0x7FF) == 0xD800))) { + } else if (PROFILE_STRICT(profile) && + (!(flags & TCL_ENCODING_MODIFIED)) && + SURROGATE(ch)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; - } else if ((profile == TCL_ENCODING_PROFILE_STRICT) && + } else if (PROFILE_STRICT(profile) && (flags & TCL_ENCODING_MODIFIED) && - ((ch & ~0x7FF) == 0xD800)) { + SURROGATE(ch)) { result = TCL_CONVERT_SYNTAX; src = saveSrc; break; @@ -2649,12 +2656,15 @@ Utf32ToUtfProc( } else { ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF); } - if ((unsigned)ch > 0x10FFFF - || (STRICT_PROFILE(flags) && ((ch & ~0x7FF) == 0xD800))) { - if (STOPONERROR) { + + if ((unsigned)ch > 0x10FFFF || SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } /* @@ -2666,7 +2676,7 @@ Utf32ToUtfProc( *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); - if ((ch & ~0x3FF) == 0xD800) { + if (HIGH_SURROGATE(ch)) { /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } @@ -2750,11 +2760,14 @@ UtfToUtf32Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -2952,11 +2965,14 @@ UtfToUtf16Proc( break; } len = TclUtfToUCS4(src, &ch); - if ((ch & ~0x7FF) == 0xD800) { - if (STOPONERROR) { + if (SURROGATE(ch)) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } + if (PROFILE_REPLACE(flags)) { + ch = UNICODE_REPLACE_CHAR; + } } src += len; if (flags & TCL_ENCODING_LE) { @@ -3059,6 +3075,9 @@ UtfToUcs2Proc( result = TCL_CONVERT_NOSPACE; break; } + /* TODO - there were no STRICT or NOCOMPLAIN checks here (why?) + * so no profile checks either for now. */ + #if TCL_UTF_MAX < 4 src += (len = TclUtfToUniChar(src, &ch)); if ((ch >= 0xD800) && (len < 3)) { @@ -3163,23 +3182,30 @@ TableToUtfProc( if (prefixBytes[byte]) { src++; if (src >= srcEnd) { + /* + * TODO - this is broken. For consistency with other + * decoders, an error should be raised only if strict. + * However, doing that check cause a whole bunch of test + * failures. Need to verify if those tests are in fact + * correct. + */ src--; result = TCL_CONVERT_MULTIBYTE; break; } - ch = toUnicode[byte][*((unsigned char *) src)]; + ch = toUnicode[byte][*((unsigned char *)src)]; } else { ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; break; } if (prefixBytes[byte]) { src--; } - ch = (Tcl_UniChar) byte; + ch = (Tcl_UniChar)byte; } /* @@ -3288,11 +3314,11 @@ TableFromUtfProc( word = fromUnicode[(ch >> 8)][ch & 0xFF]; if ((word == 0) && (ch != 0)) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } - word = dataPtr->fallback; + word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */ } if (prefixBytes[(word >> 8)] != 0) { if (dst + 1 > dstEnd) { @@ -3476,7 +3502,7 @@ Iso88591FromUtfProc( || ((ch >= 0xD800) && (len < 3)) #endif ) { - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } @@ -3489,7 +3515,7 @@ Iso88591FromUtfProc( * Plunge on, using '?' as a fallback character. */ - ch = (Tcl_UniChar) '?'; + ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */ } if (dst > dstEnd) { @@ -3703,9 +3729,10 @@ EscapeToUtfProc( if ((checked == dataPtr->numSubTables + 2) || (flags & TCL_ENCODING_END)) { - if (!STOPONERROR) { + if (!PROFILE_STRICT(flags)) { /* - * Skip the unknown escape sequence. + * Skip the unknown escape sequence. TODO - bug? + * May be replace with UNICODE_REPLACE_CHAR? */ src += longest; @@ -3878,7 +3905,7 @@ EscapeFromUtfProc( if (word == 0) { state = oldState; - if (STOPONERROR) { + if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_UNKNOWN; break; } |