diff options
| author | apnadkarni <apnmbx-wits@yahoo.com> | 2024-08-22 10:16:49 (GMT) |
|---|---|---|
| committer | apnadkarni <apnmbx-wits@yahoo.com> | 2024-08-22 10:16:49 (GMT) |
| commit | 11d48cf9a94ffd0f61f698c1c81fca711e1a65b3 (patch) | |
| tree | 9d238f83e8d81e6d0f66a24a4dd482c06473605f | |
| parent | 5e67db1121363ac6be972bb2d779ad5c88c4a273 (diff) | |
| parent | a64b3e4f6f9fc4141aa2211d311c5877006c7e08 (diff) | |
| download | tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.zip tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.tar.gz tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.tar.bz2 | |
Merge trunk
| -rw-r--r-- | generic/tclEncoding.c | 164 | ||||
| -rw-r--r-- | generic/tclIORChan.c | 20 | ||||
| -rw-r--r-- | generic/tclIORTrans.c | 17 | ||||
| -rw-r--r-- | tests/utfext.test | 86 |
4 files changed, 232 insertions, 55 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 95acfa9..d235911 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2424,7 +2424,11 @@ UtfToUtfProc( const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* TCL_ENCODING_* conversion control flags. */ - TCL_UNUSED(Tcl_EncodingState *), + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ char *dst, /* Output buffer in which converted string is * stored. */ int dstLen, /* The maximum length of output buffer in @@ -2447,6 +2451,10 @@ UtfToUtfProc( int ch; int profile; + if (flags & TCL_ENCODING_START) { + /* *statePtr will hold high surrogate in a split surrogate pair */ + *statePtr = 0; + } result = TCL_OK; srcStart = src; @@ -2463,6 +2471,42 @@ UtfToUtfProc( flags |= PTR2INT(clientData); dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6); + /* + * Macro to output an isolated high surrogate when it is not followed + * by a low surrogate. NOT to be called for strict profile since + * that should raise an error. + */ +#define OUTPUT_ISOLATEDSURROGATE \ + do { \ + Tcl_UniChar high; \ + if (PROFILE_REPLACE(profile)) { \ + high = UNICODE_REPLACE_CHAR; \ + } else { \ + high = (Tcl_UniChar)(ptrdiff_t) *statePtr; \ + } \ + assert(!(flags & ENCODING_UTF)); /* Must be CESU-8 */ \ + assert(HIGH_SURROGATE(high)); \ + assert(!PROFILE_STRICT(profile)); \ + dst += Tcl_UniCharToUtf(high, dst); \ + *statePtr = 0; /* Reset state */ \ + } while (0) + + /* + * Macro to check for isolated surrogate and either break with + * an error if profile is strict, or output an appropriate + * character for replace and tcl8 profiles and continue. + */ +#define CHECK_ISOLATEDSURROGATE \ + if (*statePtr) { \ + if (PROFILE_STRICT(profile)) { \ + result = TCL_CONVERT_SYNTAX; \ + break; \ + } \ + OUTPUT_ISOLATEDSURROGATE; \ + continue; /* Rerun loop so length checks etc. repeated */ \ + } else \ + (void) 0 + profile = ENCODING_PROFILE_GET(flags); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { @@ -2481,6 +2525,8 @@ UtfToUtfProc( } if (UCHAR(*src) < 0x80 && !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) { + + CHECK_ISOLATEDSURROGATE; /* * Copy 7bit characters, but skip null-bytes when we are in input * mode, so that they get converted to \xC0\x80. @@ -2490,6 +2536,8 @@ UtfToUtfProc( (UCHAR(src[1]) == 0x80) && (!(flags & ENCODING_INPUT) || !PROFILE_TCL8(profile))) { /* Special sequence \xC0\x80 */ + + CHECK_ISOLATEDSURROGATE; if (!PROFILE_TCL8(profile) && (flags & ENCODING_INPUT)) { if (PROFILE_REPLACE(profile)) { dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst); @@ -2510,12 +2558,12 @@ UtfToUtfProc( } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { /* - * Incomplete byte sequence. - * Always check before using Tcl_UtfToUniChar. Not doing so can cause - * it to run beyond the end of the buffer! If we happen on such an - * incomplete char its bytes are made to represent themselves unless - * the user has explicitly asked to be told. - */ + * Incomplete byte sequence not because there are insufficient + * bytes in source buffer (have already checked that above) but + * because the UTF-8 sequence is truncated. + */ + + CHECK_ISOLATEDSURROGATE; if (flags & ENCODING_INPUT) { /* Incomplete bytes for modified UTF-8 target */ @@ -2537,7 +2585,12 @@ UtfToUtfProc( } dst += Tcl_UniCharToUtf(ch, dst); } else { + /* Have a complete character */ size_t len = TclUtfToUniChar(src, &ch); + + Tcl_UniChar savedSurrogate = (Tcl_UniChar) (ptrdiff_t)*statePtr; + *statePtr = 0; /* Reset surrogate */ + if (flags & ENCODING_INPUT) { if (((len < 2) && (ch != 0)) || ((ch > 0xFFFF) && !(flags & ENCODING_UTF))) { @@ -2554,6 +2607,8 @@ UtfToUtfProc( src += len; if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) { + assert(savedSurrogate == 0); /* Since this flag combo + will never set *statePtr */ if (ch > 0xFFFF) { /* CESU-8 6-byte sequence for chars > U+FFFF */ ch -= 0x10000; @@ -2567,19 +2622,98 @@ UtfToUtfProc( *dst++ = (char)((ch | 0x80) & 0xBF); continue; } else if (SURROGATE(ch)) { - if (PROFILE_STRICT(profile)) { - result = (flags & ENCODING_INPUT) + if ((flags & ENCODING_UTF)) { + /* UTF-8, not CESU-8, so surrogates should not appear */ + if (PROFILE_STRICT(profile)) { + result = (flags & ENCODING_INPUT) ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN; - src = saveSrc; - break; - } else if (PROFILE_REPLACE(profile)) { - ch = UNICODE_REPLACE_CHAR; - } - } + src = saveSrc; + break; + } else if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + } else { + /* PROFILE_TCL8 - output as is */ + } + } else { + /* CESU-8 */ + if (LOW_SURROGATE(ch)) { + if (savedSurrogate) { + assert(HIGH_SURROGATE(savedSurrogate)); + ch = 0x10000 + ((savedSurrogate - 0xd800) << 10) + (ch - 0xdc00); + } else { + /* Isolated low surrogate */ + if (PROFILE_STRICT(profile)) { + result = (flags & ENCODING_INPUT) + ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } else if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + } else { + /* Tcl8 profile. Output low surrogate as is */ + } + } + } else { + assert(HIGH_SURROGATE(ch)); + /* Save the high surrogate */ + *statePtr = (Tcl_EncodingState) (ptrdiff_t) ch; + if (savedSurrogate) { + assert(HIGH_SURROGATE(savedSurrogate)); + if (PROFILE_STRICT(profile)) { + result = (flags & ENCODING_INPUT) + ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } else if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + } else { + /* Output the isolated high surrogate */ + ch = savedSurrogate; + } + } else { + /* High surrogate saved in *statePtr. Do not output anything just yet. */ + --numChars; /* Cancel the increment at end of loop */ + continue; + } + } + } + } else { + /* Normal character */ + CHECK_ISOLATEDSURROGATE; + } + dst += Tcl_UniCharToUtf(ch, dst); } } + /* Check if an high surrogate left over */ + if (*statePtr) { + assert(!(flags & ENCODING_UTF)); /* CESU-8, Not UTF-8 */ + if (!(flags & TCL_ENCODING_END)) { + /* More data coming */ + } else { + /* No more data coming */ + if (PROFILE_STRICT(profile)) { + result = (flags & ENCODING_INPUT) + ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN; + } else { + if (PROFILE_REPLACE(profile)) { + ch = UNICODE_REPLACE_CHAR; + } else { + ch = (Tcl_UniChar) (ptrdiff_t) *statePtr; + } + if (dst < dstEnd) { + dst += Tcl_UniCharToUtf(ch, dst); + ++numChars; + } else { + /* No room in destination */ + result = TCL_CONVERT_NOSPACE; + } + } + } + + } + *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; diff --git a/generic/tclIORChan.c b/generic/tclIORChan.c index c8449aa..859366f 100644 --- a/generic/tclIORChan.c +++ b/generic/tclIORChan.c @@ -522,9 +522,10 @@ TclChanCreateObjCmd( * Actually: rCreate MODE CMDPREFIX * [0] [1] [2] */ - -#define MODE (1) -#define CMD (2) + enum ArgIndices { + MODE = 1, + CMD = 2 + }; /* * Number of arguments... @@ -739,9 +740,6 @@ TclChanCreateObjCmd( Tcl_DecrRefCount(rcPtr->cmd); Tcl_Free(rcPtr); return TCL_ERROR; - -#undef MODE -#undef CMD } /* @@ -826,9 +824,10 @@ TclChanPostEventObjCmd( * * where EVENTSPEC = {read write ...} (Abbreviations allowed as well). */ - -#define CHAN (1) -#define EVENT (2) + enum ArgIndices { + CHAN = 1, + EVENT = 2 + }; const char *chanId; /* Tcl level channel handle */ Tcl_Channel chan; /* Channel associated to the handle */ @@ -980,9 +979,6 @@ TclChanPostEventObjCmd( Tcl_ResetResult(interp); return TCL_OK; - -#undef CHAN -#undef EVENT } /* diff --git a/generic/tclIORTrans.c b/generic/tclIORTrans.c index dce1a1c..d2853e2 100644 --- a/generic/tclIORTrans.c +++ b/generic/tclIORTrans.c @@ -529,9 +529,10 @@ TclChanPushObjCmd( * Actually: rPush CHANNEL CMDPREFIX * [0] [1] [2] */ - -#define CHAN (1) -#define CMD (2) + enum ArgIndices { + CHAN = 1, + CMD = 2 + }; /* * Number of arguments... @@ -714,9 +715,6 @@ TclChanPushObjCmd( Tcl_EventuallyFree(rtPtr, FreeReflectedTransform); return TCL_ERROR; - -#undef CHAN -#undef CMD } /* @@ -751,8 +749,9 @@ TclChanPopObjCmd( * Actually: rPop CHANNEL * [0] [1] */ - -#define CHAN (1) + enum ArgIndices { + CHAN = 1 + }; const char *chanId; /* Tcl level channel handle */ Tcl_Channel chan; /* Channel associated to the handle */ @@ -786,8 +785,6 @@ TclChanPopObjCmd( Tcl_UnstackChannel(interp, chan); return TCL_OK; - -#undef CHAN } /* diff --git a/tests/utfext.test b/tests/utfext.test index 0c5601c..4b15a8d 100644 --- a/tests/utfext.test +++ b/tests/utfext.test @@ -37,16 +37,11 @@ namespace eval utftest { # 4 external fragmentation index - where to split field 2 for fragmentation # tests. -1 to skip # - # cesu-8 tests disabled because of bug [304d30677a] - TODO - # cesu-8 { - # {bmp {41 c3a9 42} {41 c3a9 42} 2 2} - # {nonbmp {41 f09f9880 42} {41 eda0bd edb080 42} 3 3} - # {null {41 c080 42} {41 00 42} 2 -1} - # } - + # THE HEX DEFINITIONS SHOULD SEPARATE EACH CHARACTER BY WHITESPACE + # (assumed by the charlimit tests) lappend utfExtMap {*}{ ascii { - {basic 414243 414243 -1 -1} + {basic {41 42 43} {41 42 43} -1 -1} } utf-8 { {bmp {41 c3a9 42} {41 c3a9 42} 2 2} @@ -55,6 +50,13 @@ namespace eval utftest { {nonbmp-frag-3 {41 f09f9880 42} {41 f09f9880 42} 4 4} {null {41 c080 42} {41 00 42} 2 -1} } + cesu-8 { + {bmp {41 c3a9 42} {41 c3a9 42} 2 2} + {nonbmp-frag-surr-low {41 f09f9880 42} {41 eda0bd edb880 42} 2 2} + {nonbmp-split-surr {41 f09f9880 42} {41 eda0bd edb880 42} 3 -1} + {nonbmp-frag-surr-high {41 f09f9880 42} {41 eda0bd edb880 42} 4 6} + {null {41 c080 42} {41 00 42} 2 -1} + } utf-16le { {bmp {41 c3a9 42} {4100 e900 4200} 2 3} {nonbmp {41 f09f9880 42} {4100 3dd8 00de 4200} 4 3} @@ -78,12 +80,12 @@ namespace eval utftest { {null {41 c080 42} {00000041 00000000 00000042} 2 3} } iso8859-1 { - {basic {41 c3a9 42} 41e942 2 -1} - {null {41 c080 42} 410042 2 -1} + {basic {41 c3a9 42} {41 e9 42} 2 -1} + {null {41 c080 42} {41 00 42} 2 -1} } iso8859-3 { - {basic {41 c4a0 42} 41d542 2 -1} - {null {41 c080 42} 410042 2 -1} + {basic {41 c4a0 42} {41 d5 42} 2 -1} + {null {41 c080 42} {41 00 42} 2 -1} } shiftjis { {basic {41 e4b98e 42} {41 8cc1 42} 3 2} @@ -153,7 +155,7 @@ namespace eval utftest { } } - proc testfragment {direction enc comment hexin hexout fragindex} { + proc testfragment {direction enc comment hexin hexout fragindex args} { if {$fragindex < 0} { # Single byte encodings so no question of fragmentation @@ -167,26 +169,62 @@ namespace eval utftest { set cmd Tcl_UtfToExternal } + set status1 multibyte; # Return status to expect after first call + while {[llength $args] > 1} { + set opt [lpop args 0] + switch $opt { + -status1 { set status1 [lpop args 0]} + default { + error "Unknown option \"$opt\"" + } + } + } + set in [binary decode hex $hexin] set infrag [string range $in 0 $fragindex-1] set out [binary decode hex $hexout] set dstlen 40 ;# Should be enough for all encoding tests - set expected_result {} - append expected_result multibyte $fragindex - test $cmd-$enc-$id "$cmd - $enc - $hexin - frag" -constraints testencoding -body { set frag1Result [testencoding $cmd $enc [string range $in 0 $fragindex-1] {start} 0 $dstlen frag1Read frag1Written] lassign $frag1Result frag1Status frag1State frag1Decoded set frag2Result [testencoding $cmd $enc [string range $in $frag1Read end] {end} $frag1State $dstlen frag2Read frag2Written] lassign $frag2Result frag2Status frag2State frag2Decoded set decoded [string cat [string range $frag1Decoded 0 $frag1Written-1] [string range $frag2Decoded 0 $frag2Written-1]] - list $frag1Status [expr {$frag1Read < $fragindex}] \ + list $frag1Status [expr {$frag1Read <= $fragindex}] \ $frag2Status [expr {$frag1Read+$frag2Read}] \ [expr {$frag1Written+$frag2Written}] $decoded - } -result [list multibyte 1 ok [string length $in] [string length $out] $out] + } -result [list $status1 1 ok [string length $in] [string length $out] $out] + } + + proc testcharlimit {direction enc comment hexin hexout} { + set id $comment-[join $hexin ""]-charlimit + + if {$direction eq "toutf"} { + set cmd Tcl_ExternalToUtf + } else { + set cmd Tcl_UtfToExternal + } + + set maxchars [llength $hexout] + set in [binary decode hex $hexin] + set out [binary decode hex $hexout] + set dstlen 40 ;# Should be enough for all encoding tests + + for {set nchars 0} {$nchars <= $maxchars} {incr nchars} { + set expected_bytes [binary decode hex [lrange $hexout 0 $nchars-1]] + set expected_nwritten [string length $expected_bytes] + test $cmd-$enc-$id-$nchars "$cmd - $enc - $hexin - nchars $nchars" -constraints testencoding -body { + set charlimit $nchars + lassign [testencoding $cmd $enc $in \ + {start end charlimit} 0 $dstlen nread nwritten charlimit] \ + status state buf + list $status $nwritten [string range $buf 0 $nwritten-1] + } -result [list [expr {$nchars == $maxchars ? "ok" : "nospace"}] $expected_nwritten $expected_bytes] + } } + # # Basic tests foreach {enc testcases} $utfExtMap { @@ -205,11 +243,23 @@ namespace eval utftest { # should have no effect in other direction testutf fromutf $enc $comment $utfhex $hex$encnuls -flags {start end noterminate} + # Fragments testfragment toutf $enc $comment $hex $utfhex $externalfragindex testfragment fromutf $enc $comment $utfhex $hex $internalfragindex + + # Char limits - note no fromutf as Tcl_UtfToExternal does not support it + if {![string match utf-16* $enc] && $enc ne "cesu-8"} { + # TODO - utf16 hangs + testcharlimit toutf $enc $comment $hex $utfhex + } } } + # Special cases - cesu2 high and low surrogates in separate fragments + # This will (correctly) return "ok", not "multibyte" after first frag + testfragment toutf cesu-8 nonbmp-split-surr \ + {41 eda0bd edb880 42} {41 f09f9880 42} 4 -status1 ok + # Bug regression tests test Tcl_UtfToExternal-bug-183a1adcc0 {buffer overflow} -body { testencoding Tcl_UtfToExternal utf-16 A {start end} {} 1 |
