diff options
author | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-23 11:22:22 (GMT) |
---|---|---|
committer | apnadkarni <apnmbx-wits@yahoo.com> | 2023-02-23 11:22:22 (GMT) |
commit | f20343c1afd3e673f08c064a73ff721c7e160203 (patch) | |
tree | 427fc351f8fc984f078632fb85f33e51ade56cf0 | |
parent | d1a70b49c30854038c296c5d448d96d4c263ed0b (diff) | |
parent | 1d76ffb03b359c7f557943523fd9b0c49a312554 (diff) | |
download | tcl-tip-656-pre-capi.zip tcl-tip-656-pre-capi.tar.gz tcl-tip-656-pre-capi.tar.bz2 |
Merge 8.7 (tip-656-pre-capi)
-rw-r--r-- | generic/tclEncoding.c | 68 |
-rw-r--r-- | tests/cmdAH.test | 14 |
-rw-r--r-- | tests/encoding.test | 19 |
3 files changed, 61 insertions, 40 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index a877468..daab3a9 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -264,8 +264,13 @@ static Tcl_EncodingConvertProc Iso88591ToUtfProc; */ static const Tcl_ObjType encodingType = { - "encoding", FreeEncodingInternalRep, DupEncodingInternalRep, NULL, NULL + "encoding", + FreeEncodingInternalRep, + DupEncodingInternalRep, + NULL, + NULL }; + #define EncodingSetInternalRep(objPtr, encoding) \ do { \ Tcl_ObjInternalRep ir; \ @@ -488,7 +493,7 @@ FillEncodingFileMap(void) map = Tcl_NewDictObj(); Tcl_IncrRefCount(map); - for (i = numDirs-1; i >= 0; i--) { + for (i = numDirs-1; i != TCL_INDEX_NONE; i--) { /* * Iterate backwards through the search path so as we overwrite * entries found, we favor files earlier on the search path. @@ -1209,7 +1214,7 @@ Tcl_ExternalToUtfDString( * Tcl_ExternalToUtfDStringEx -- * * Convert a source buffer from the specified encoding into UTF-8. -* The parameter flags controls the behavior, if any of the bytes in + * The parameter flags controls the behavior, if any of the bytes in * the source buffer are invalid or cannot be represented in utf-8. * Possible flags values: * target encoding. 
It should be composed by OR-ing the following: @@ -1482,8 +1487,9 @@ Tcl_UtfToExternalDStringEx( char *dst; Tcl_EncodingState state; const Encoding *encodingPtr; - int dstLen, result, soFar, srcRead, dstWrote, dstChars; + int result, soFar, srcRead, dstWrote, dstChars; const char *srcStart = src; + int dstLen; Tcl_DStringInit(dstPtr); dst = Tcl_DStringValue(dstPtr); @@ -2594,8 +2600,8 @@ Utf32ToUtfProc( { const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; - int result, extra, numChars, charLimit = INT_MAX; - int ch = 0; + int result, numChars, charLimit = INT_MAX; + int ch = 0, bytesLeft = srcLen % 4; flags |= PTR2INT(clientData); if (flags & TCL_ENCODING_CHAR_LIMIT) { @@ -2606,11 +2612,10 @@ Utf32ToUtfProc( /* * Check alignment with utf-32 (4 == sizeof(UTF-32)) */ - extra = srcLen % 4; - if (extra != 0) { - /* We have a truncated code unit */ + if (bytesLeft != 0) { + /* We have a truncated code unit */ result = TCL_CONVERT_MULTIBYTE; - srcLen &= -4; + srcLen -= bytesLeft; } /* @@ -2648,7 +2653,7 @@ Utf32ToUtfProc( /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */ dst += Tcl_UniCharToUtf(-1, dst); } - + if ((unsigned)ch > 0x10FFFF || SURROGATE(ch)) { if (PROFILE_STRICT(flags)) { result = TCL_CONVERT_SYNTAX; @@ -2679,16 +2684,22 @@ Utf32ToUtfProc( } /* * If we had a truncated code unit at the end AND this is the last - * fragment AND profile is "replace", stick FFFD in its place. + * fragment AND profile is not "strict", stick FFFD in its place. 
*/ - if (extra && (flags & TCL_ENCODING_END) && PROFILE_REPLACE(flags)) { - src += extra; /* Go past truncated code unit */ + if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; } else { - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - result = TCL_OK; - } + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src += bytesLeft; /* Go past truncated code unit */ + } + } } *srcReadPtr = src - srcStart; @@ -2837,7 +2848,7 @@ Utf16ToUtfProc( { const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; - int result, extra, numChars, charLimit = INT_MAX; + int result, numChars, charLimit = INT_MAX; unsigned short ch = 0; flags |= PTR2INT(clientData); @@ -2850,8 +2861,7 @@ Utf16ToUtfProc( * Check alignment with utf-16 (2 == sizeof(UTF-16)) */ - extra = srcLen % 2; - if (extra != 0) { + if ((srcLen % 2) != 0) { result = TCL_CONVERT_MULTIBYTE; srcLen--; } @@ -2909,16 +2919,22 @@ Utf16ToUtfProc( } /* * If we had a truncated code unit at the end AND this is the last - * fragment AND profile is "replace", stick FFFD in its place. + * fragment AND profile is not "strict", stick FFFD in its place. 
*/ - if (extra && (flags & TCL_ENCODING_END) && PROFILE_REPLACE(flags)) { - ++src;/* Go past the truncated code unit */ + if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; } else { - dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); - result = TCL_OK; - } + if (PROFILE_STRICT(flags)) { + result = TCL_CONVERT_SYNTAX; + } else { + /* PROFILE_REPLACE or PROFILE_TCL8 */ + result = TCL_OK; + dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); + numChars++; + src++; /* Go past truncated code unit */ + } + } } *srcReadPtr = src - srcStart; diff --git a/tests/cmdAH.test b/tests/cmdAH.test index d76607c..f8eba4e 100644 --- a/tests/cmdAH.test +++ b/tests/cmdAH.test @@ -703,7 +703,7 @@ lappend encInvalidBytes {*}{ # happen when the sequence is at the end (including by itself) Thus {solo tail} # in some cases. lappend encInvalidBytes {*}{ - utf-16le 41 tcl8 {} -1 {solo tail} {Truncated} + utf-16le 41 tcl8 \uFFFD -1 {solo tail} {Truncated} utf-16le 41 replace \uFFFD -1 {solo tail} {Truncated} utf-16le 41 strict {} 0 {solo tail} {Truncated} utf-16le 00D8 tcl8 \uD800 -1 {} {Missing low surrogate} @@ -719,13 +719,13 @@ lappend encInvalidBytes {*}{ # happen when the sequence is at the end (including by itself) Thus {solo tail} # in some cases. 
lappend encInvalidBytes {*}{ - utf-32le 41 tcl8 {} -1 {solo tail} {Truncated} + utf-32le 41 tcl8 \uFFFD -1 {solo tail} {Truncated} utf-32le 41 replace \uFFFD -1 {solo} {Truncated} utf-32le 41 strict {} 0 {solo tail} {Truncated} - utf-32le 4100 tcl8 {} -1 {solo tail} {Truncated} + utf-32le 4100 tcl8 \uFFFD -1 {solo tail} {Truncated} utf-32le 4100 replace \uFFFD -1 {solo} {Truncated} utf-32le 4100 strict {} 0 {solo tail} {Truncated} - utf-32le 410000 tcl8 {} -1 {solo tail} {Truncated} + utf-32le 410000 tcl8 \uFFFD -1 {solo tail} {Truncated} utf-32le 410000 replace \uFFFD -1 {solo} {Truncated} utf-32le 410000 strict {} 0 {solo tail} {Truncated} utf-32le 00D80000 tcl8 \uD800 -1 {} {High-surrogate} @@ -744,9 +744,9 @@ lappend encInvalidBytes {*}{ utf-32le FFFFFFFF replace \UFFFD -1 {} {Out of range} utf-32le FFFFFFFF strict {} 0 {} {Out of range} - utf-32be 41 tcl8 {} -1 {solo tail} {Truncated} - utf-32be 0041 tcl8 {} -1 {solo tail} {Truncated} - utf-32be 000041 tcl8 {} -1 {solo tail} {Truncated} + utf-32be 41 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32be 0041 tcl8 \uFFFD -1 {solo tail} {Truncated} + utf-32be 000041 tcl8 \uFFFD -1 {solo tail} {Truncated} utf-32be 0000D800 tcl8 \uD800 -1 {} {High-surrogate} utf-32be 0000D800 replace \uFFFD -1 {} {High-surrogate} utf-32be 0000D800 strict {} 0 {} {High-surrogate} diff --git a/tests/encoding.test b/tests/encoding.test index 0abd193..87da880 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -534,7 +534,7 @@ test encoding-16.17 {Utf32ToUtfProc} -body { list [encoding convertfrom -profile strict -failindex idx utf-32le \x41\x00\x00\x00\x00\xD8\x00\x00\x42\x00\x00\x00] [set idx] } -result {A 4} -test encoding-16.9 { +test encoding-16.18 { Utf16ToUtfProc, Tcl_UniCharToUtf, surrogate pairs in utf-16 } -body { apply [list {} { @@ -553,10 +553,15 @@ test encoding-16.9 { return done } [namespace current]] } -result done - - - - +test encoding-16.19 {UnicodeToUtfProc, bug [d19fe0a5b]} -body { + encoding convertfrom 
utf-16 "\x41\x41\x41" +} -result \u4141\uFFFD +test encoding-16.20 {UnicodeToUtfProc, bug [d19fe0a5b]} -constraints deprecated -body { + encoding convertfrom utf-16 "\xD8\xD8" +} -result \uD8D8 +test encoding-16.21 {UnicodeToUtfProc, bug [d19fe0a5b]} -body { + encoding convertfrom utf-32 "\x00\x00\x00\x00\x41\x41" +} -result \x00\uFFFD test encoding-17.1 {UtfToUtf16Proc} -body { encoding convertto utf-16 "\U460DC" @@ -783,10 +788,10 @@ test encoding-24.19 {Parse valid or invalid utf-8} -constraints deprecated -body } -result ZX\xED\xA0\x80 test encoding-24.20 {Parse with -profile tcl8 but without providing encoding} -body { encoding convertfrom -profile tcl8 "\x20" -} -result {wrong # args: should be "::tcl::encoding::convertfrom ??-profile profile? ?-failindex var? ?encoding?? data"} -returnCodes error +} -result {wrong # args: should be "::tcl::encoding::convertfrom ? ?-profile profile? ?-failindex var? encoding ? data"} -returnCodes error test encoding-24.21 {Parse with -profile tcl8 but without providing encoding} -body { string length [encoding convertto -profile tcl8 "\x20"] -} -result {wrong # args: should be "::tcl::encoding::convertto ??-profile profile? ?-failindex var? ?encoding?? data"} -returnCodes error +} -result {wrong # args: should be "::tcl::encoding::convertto ? ?-profile profile? ?-failindex var? encoding ? data"} -returnCodes error test encoding-24.22 {Syntax error, two encodings} -body { encoding convertfrom iso8859-1 utf-8 "ZX\uD800" } -result {bad option "iso8859-1": must be -profile or -failindex} -returnCodes error |