diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-02-14 22:06:32 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2023-02-14 22:06:32 (GMT) |
commit | fb260e838200ee3007a853db81e2e9f775ebb783 (patch) | |
tree | 4bfe1e1a8a62a40a77ae74f373e4c10742218f0e | |
parent | 0c306c8f81e856f8992e427ec5a1b9611a7c0434 (diff) | |
parent | 5040a69c28a92e0318ca56498bdf30f8690470ae (diff) | |
download | tcl-fb260e838200ee3007a853db81e2e9f775ebb783.zip tcl-fb260e838200ee3007a853db81e2e9f775ebb783.tar.gz tcl-fb260e838200ee3007a853db81e2e9f775ebb783.tar.bz2 |
Merge 8.7
-rw-r--r-- | generic/tclEncoding.c | 39 | ||||
-rw-r--r-- | tests/encoding.test | 38 |
2 files changed, 60 insertions, 17 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 6990346..9d896e2 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -525,7 +525,8 @@ FillEncodingFileMap(void) /* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and * TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ -#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ +#define ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ +#define ENCODING_INPUT 0x400 /* For UTF-8/CESU-8 encoding, means external -> internal */ void TclInitEncodingSubsystem(void) @@ -567,7 +568,7 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = INT2PTR(TCL_ENCODING_UTF); + type.clientData = INT2PTR(ENCODING_UTF); Tcl_CreateEncoding(&type); type.clientData = INT2PTR(TCL_ENCODING_NOCOMPLAIN); type.encodingName = "cesu-8"; @@ -1180,7 +1181,7 @@ Tcl_ExternalToUtfDStringEx( flags |= TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + flags |= ENCODING_INPUT; } while (1) { @@ -1297,7 +1298,7 @@ Tcl_ExternalToUtf( dstLen--; } if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; + flags |= ENCODING_INPUT; } do { Tcl_EncodingState savedState = *statePtr; @@ -1390,7 +1391,7 @@ Tcl_UtfToExternalDStringEx( const char *src, /* Source string in UTF-8. */ size_t srcLen, /* Source string length in bytes, or < 0 for * strlen(). */ - int flags, /* Conversion control flags. */ + int flags, /* Conversion control flags. */ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the * converted string is stored. */ { @@ -2300,7 +2301,7 @@ UtfToUtfProc( dstStart = dst; flags |= PTR2INT(clientData); - dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); + dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { @@ -2316,7 +2317,7 @@ UtfToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & TCL_ENCODING_MODIFIED))) { + if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) { /* * Copy 7bit characters, but skip null-bytes when we are in input * mode, so that they get converted to 0xC080. @@ -2324,11 +2325,13 @@ UtfToUtfProc( *dst++ = *src++; } else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) - && (UCHAR(src[1]) == 0x80) && (!(flags & TCL_ENCODING_MODIFIED) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) || (flags & ENCODING_FAILINDEX))) { + && (UCHAR(src[1]) == 0x80) && (flags & ENCODING_UTF) && (!(flags & ENCODING_INPUT) + || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) + || (flags & ENCODING_FAILINDEX))) { /* * If in input mode, and -strict or -failindex is specified: This is an error. */ - if (flags & TCL_ENCODING_MODIFIED) { + if (flags & ENCODING_INPUT) { result = TCL_CONVERT_SYNTAX; break; } @@ -2346,7 +2349,7 @@ UtfToUtfProc( * unless the user has explicitly asked to be told. */ - if (flags & TCL_ENCODING_MODIFIED) { + if (flags & ENCODING_INPUT) { if ((STOPONERROR) && (flags & TCL_ENCODING_CHAR_LIMIT)) { result = TCL_CONVERT_MULTIBYTE; break; @@ -2365,13 +2368,13 @@ UtfToUtfProc( } else { const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); - if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_MODIFIED) + if ((len < 2) && (ch != 0) && (flags & ENCODING_INPUT) && (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) { result = TCL_CONVERT_SYNTAX; break; } src += len; - if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) { + if (!(flags & ENCODING_UTF) && (ch > 0x3FF)) { if (ch > 0xFFFF) { /* CESU-8 6-byte sequence for chars > U+FFFF */ ch -= 0x10000; @@ -2393,6 +2396,11 @@ UtfToUtfProc( * A surrogate character is detected, handle especially. */ + if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) && (flags & ENCODING_UTF)) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } int low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; @@ -2409,12 +2417,12 @@ UtfToUtfProc( dst += Tcl_UniCharToUtf(ch, dst); ch = low; #endif - } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch & ~0x7FF) == 0xD800))) { + } else if (STOPONERROR && !(flags & ENCODING_INPUT) && (((ch & ~0x7FF) == 0xD800))) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT) - && (flags & TCL_ENCODING_MODIFIED) && ((ch & ~0x7FF) == 0xD800)) { + && (flags & ENCODING_INPUT) && ((ch & ~0x7FF) == 0xD800)) { result = TCL_CONVERT_SYNTAX; src = saveSrc; break; @@ -3040,7 +3048,8 @@ TableToUtfProc( ch = pageZero[byte]; } if ((ch == 0) && (byte != 0)) { - if (STOPONERROR) { + if ((flags & ENCODING_FAILINDEX) + || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)) { result = TCL_CONVERT_SYNTAX; break; } diff --git a/tests/encoding.test b/tests/encoding.test index 3fba2f3..aef5028 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -453,6 +453,24 @@ test encoding-15.24 {UtfToUtfProc CESU-8 bug [048dd20b4171c8da]} { binary scan $y H* z list [string length $y] $z } {2 cfbf} +test encoding-15.25 {UtfToUtfProc CESU-8} { + encoding convertfrom cesu-8 \x00 +} \x00 +test encoding-15.26 {UtfToUtfProc CESU-8} { + encoding convertfrom cesu-8 \xC0\x80 +} \x00 +test encoding-15.27 {UtfToUtfProc -strict CESU-8} { + encoding convertfrom -strict cesu-8 \xC0\x80 +} \x00 +test encoding-15.28 {UtfToUtfProc -strict CESU-8} { + encoding convertfrom -strict cesu-8 \xC0\x80 +} \x00 +test encoding-15.29 {UtfToUtfProc CESU-8} { + encoding convertto cesu-8 \x00 +} \xC0\x80 +test encoding-15.30 {UtfToUtfProc -strict CESU-8} { + encoding convertto -strict cesu-8 \x00 +} \xC0\x80 test encoding-16.1 {Utf16ToUtfProc} -body { set val [encoding convertfrom utf-16 NN] @@ -585,8 +603,21 @@ test encoding-18.6 {TableToUtfProc on invalid input with -nocomplain} -body { list [catch {encoding convertto -nocomplain jis0208 \\} res] $res } -result {0 !)} -test encoding-19.1 {TableFromUtfProc} { -} {} +test encoding-19.1 {TableFromUtfProc} -body { + encoding convertfrom ascii AÁ +} -result AÁ +test encoding-19.2 {TableFromUtfProc} -body { + encoding convertfrom -nocomplain ascii AÁ +} -result AÁ +test encoding-19.3 {TableFromUtfProc} -body { + encoding convertfrom -strict ascii AÁ +} -returnCodes 1 -result {unexpected byte sequence starting at index 1: '\xC1'} +test encoding-19.4 {TableFromUtfProc} -body { + list [encoding convertfrom -failindex idx ascii AÁ] [set idx] +} -result {A 1} +test encoding-19.4 {TableFromUtfProc} -body { + list [encoding convertfrom -failindex idx -strict ascii AÁ] [set idx] +} -result {A 1} test encoding-20.1 {TableFreefProc} { } {} @@ -805,6 +836,9 @@ test encoding-24.39 {Try to generate invalid utf-8 with -strict} -body { test encoding-24.40 {Try to generate invalid utf-8 with -nocomplain} -body { encoding convertto -nocomplain utf-8 \uD800 } -result \xED\xA0\x80 +test encoding-24.41 {Parse invalid utf-8 with -strict} -body { + encoding convertfrom -strict utf-8 \xED\xA0\x80\xED\xB0\x80 +} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xED'} file delete [file join [temporaryDirectory] iso2022.txt] |