summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorapnadkarni <apnmbx-wits@yahoo.com>2024-08-22 10:16:49 (GMT)
committerapnadkarni <apnmbx-wits@yahoo.com>2024-08-22 10:16:49 (GMT)
commit11d48cf9a94ffd0f61f698c1c81fca711e1a65b3 (patch)
tree9d238f83e8d81e6d0f66a24a4dd482c06473605f
parent5e67db1121363ac6be972bb2d779ad5c88c4a273 (diff)
parenta64b3e4f6f9fc4141aa2211d311c5877006c7e08 (diff)
downloadtcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.zip
tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.tar.gz
tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.tar.bz2
Merge trunk
-rw-r--r--generic/tclEncoding.c164
-rw-r--r--generic/tclIORChan.c20
-rw-r--r--generic/tclIORTrans.c17
-rw-r--r--tests/utfext.test86
4 files changed, 232 insertions, 55 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 95acfa9..d235911 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2424,7 +2424,11 @@ UtfToUtfProc(
const char *src, /* Source string in UTF-8. */
int srcLen, /* Source string length in bytes. */
int flags, /* TCL_ENCODING_* conversion control flags. */
- TCL_UNUSED(Tcl_EncodingState *),
+ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
+ * information used during a piecewise
+ * conversion. Contents of statePtr are
+ * initialized and/or reset by conversion
+ * routine under control of flags argument. */
char *dst, /* Output buffer in which converted string is
* stored. */
int dstLen, /* The maximum length of output buffer in
@@ -2447,6 +2451,10 @@ UtfToUtfProc(
int ch;
int profile;
+ if (flags & TCL_ENCODING_START) {
+ /* *statePtr will hold high surrogate in a split surrogate pair */
+ *statePtr = 0;
+ }
result = TCL_OK;
srcStart = src;
@@ -2463,6 +2471,42 @@ UtfToUtfProc(
flags |= PTR2INT(clientData);
dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6);
+ /*
+ * Macro to output an isolated high surrogate when it is not followed
+ * by a low surrogate. NOT to be called for strict profile since
+ * that should raise an error.
+ */
+#define OUTPUT_ISOLATEDSURROGATE \
+ do { \
+ Tcl_UniChar high; \
+ if (PROFILE_REPLACE(profile)) { \
+ high = UNICODE_REPLACE_CHAR; \
+ } else { \
+ high = (Tcl_UniChar)(ptrdiff_t) *statePtr; \
+ } \
+ assert(!(flags & ENCODING_UTF)); /* Must be CESU-8 */ \
+ assert(HIGH_SURROGATE(high)); \
+ assert(!PROFILE_STRICT(profile)); \
+ dst += Tcl_UniCharToUtf(high, dst); \
+ *statePtr = 0; /* Reset state */ \
+ } while (0)
+
+ /*
+ * Macro to check for isolated surrogate and either break with
+ * an error if profile is strict, or output an appropriate
+ * character for replace and tcl8 profiles and continue.
+ */
+#define CHECK_ISOLATEDSURROGATE \
+ if (*statePtr) { \
+ if (PROFILE_STRICT(profile)) { \
+ result = TCL_CONVERT_SYNTAX; \
+ break; \
+ } \
+ OUTPUT_ISOLATEDSURROGATE; \
+ continue; /* Rerun loop so length checks etc. repeated */ \
+ } else \
+ (void) 0
+
profile = ENCODING_PROFILE_GET(flags);
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
@@ -2481,6 +2525,8 @@ UtfToUtfProc(
}
if (UCHAR(*src) < 0x80
&& !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) {
+
+ CHECK_ISOLATEDSURROGATE;
/*
* Copy 7bit characters, but skip null-bytes when we are in input
* mode, so that they get converted to \xC0\x80.
@@ -2490,6 +2536,8 @@ UtfToUtfProc(
(UCHAR(src[1]) == 0x80) &&
(!(flags & ENCODING_INPUT) || !PROFILE_TCL8(profile))) {
/* Special sequence \xC0\x80 */
+
+ CHECK_ISOLATEDSURROGATE;
if (!PROFILE_TCL8(profile) && (flags & ENCODING_INPUT)) {
if (PROFILE_REPLACE(profile)) {
dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
@@ -2510,12 +2558,12 @@ UtfToUtfProc(
} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
/*
- * Incomplete byte sequence.
- * Always check before using Tcl_UtfToUniChar. Not doing so can cause
- * it to run beyond the end of the buffer! If we happen on such an
- * incomplete char its bytes are made to represent themselves unless
- * the user has explicitly asked to be told.
- */
+ * Incomplete byte sequence not because there are insufficient
+ * bytes in source buffer (have already checked that above) but
+ * because the UTF-8 sequence is truncated.
+ */
+
+ CHECK_ISOLATEDSURROGATE;
if (flags & ENCODING_INPUT) {
/* Incomplete bytes for modified UTF-8 target */
@@ -2537,7 +2585,12 @@ UtfToUtfProc(
}
dst += Tcl_UniCharToUtf(ch, dst);
} else {
+ /* Have a complete character */
size_t len = TclUtfToUniChar(src, &ch);
+
+ Tcl_UniChar savedSurrogate = (Tcl_UniChar) (ptrdiff_t)*statePtr;
+ *statePtr = 0; /* Reset surrogate */
+
if (flags & ENCODING_INPUT) {
if (((len < 2) && (ch != 0))
|| ((ch > 0xFFFF) && !(flags & ENCODING_UTF))) {
@@ -2554,6 +2607,8 @@ UtfToUtfProc(
src += len;
if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT)
&& (ch > 0x3FF)) {
+ assert(savedSurrogate == 0); /* Since this flag combo
+ will never set *statePtr */
if (ch > 0xFFFF) {
/* CESU-8 6-byte sequence for chars > U+FFFF */
ch -= 0x10000;
@@ -2567,19 +2622,98 @@ UtfToUtfProc(
*dst++ = (char)((ch | 0x80) & 0xBF);
continue;
} else if (SURROGATE(ch)) {
- if (PROFILE_STRICT(profile)) {
- result = (flags & ENCODING_INPUT)
+ if ((flags & ENCODING_UTF)) {
+ /* UTF-8, not CESU-8, so surrogates should not appear */
+ if (PROFILE_STRICT(profile)) {
+ result = (flags & ENCODING_INPUT)
? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN;
- src = saveSrc;
- break;
- } else if (PROFILE_REPLACE(profile)) {
- ch = UNICODE_REPLACE_CHAR;
- }
- }
+ src = saveSrc;
+ break;
+ } else if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ /* PROFILE_TCL8 - output as is */
+ }
+ } else {
+ /* CESU-8 */
+ if (LOW_SURROGATE(ch)) {
+ if (savedSurrogate) {
+ assert(HIGH_SURROGATE(savedSurrogate));
+ ch = 0x10000 + ((savedSurrogate - 0xd800) << 10) + (ch - 0xdc00);
+ } else {
+ /* Isolated low surrogate */
+ if (PROFILE_STRICT(profile)) {
+ result = (flags & ENCODING_INPUT)
+ ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ } else if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ /* Tcl8 profile. Output low surrogate as is */
+ }
+ }
+ } else {
+ assert(HIGH_SURROGATE(ch));
+ /* Save the high surrogate */
+ *statePtr = (Tcl_EncodingState) (ptrdiff_t) ch;
+ if (savedSurrogate) {
+ assert(HIGH_SURROGATE(savedSurrogate));
+ if (PROFILE_STRICT(profile)) {
+ result = (flags & ENCODING_INPUT)
+ ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ } else if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ /* Output the isolated high surrogate */
+ ch = savedSurrogate;
+ }
+ } else {
+ /* High surrogate saved in *statePtr. Do not output anything just yet. */
+ --numChars; /* Cancel the increment at end of loop */
+ continue;
+ }
+ }
+ }
+ } else {
+ /* Normal character */
+ CHECK_ISOLATEDSURROGATE;
+ }
+
dst += Tcl_UniCharToUtf(ch, dst);
}
}
+ /* Check if an high surrogate left over */
+ if (*statePtr) {
+ assert(!(flags & ENCODING_UTF)); /* CESU-8, Not UTF-8 */
+ if (!(flags & TCL_ENCODING_END)) {
+ /* More data coming */
+ } else {
+ /* No more data coming */
+ if (PROFILE_STRICT(profile)) {
+ result = (flags & ENCODING_INPUT)
+ ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN;
+ } else {
+ if (PROFILE_REPLACE(profile)) {
+ ch = UNICODE_REPLACE_CHAR;
+ } else {
+ ch = (Tcl_UniChar) (ptrdiff_t) *statePtr;
+ }
+ if (dst < dstEnd) {
+ dst += Tcl_UniCharToUtf(ch, dst);
+ ++numChars;
+ } else {
+ /* No room in destination */
+ result = TCL_CONVERT_NOSPACE;
+ }
+ }
+ }
+
+ }
+
*srcReadPtr = src - srcStart;
*dstWrotePtr = dst - dstStart;
*dstCharsPtr = numChars;
diff --git a/generic/tclIORChan.c b/generic/tclIORChan.c
index c8449aa..859366f 100644
--- a/generic/tclIORChan.c
+++ b/generic/tclIORChan.c
@@ -522,9 +522,10 @@ TclChanCreateObjCmd(
* Actually: rCreate MODE CMDPREFIX
* [0] [1] [2]
*/
-
-#define MODE (1)
-#define CMD (2)
+ enum ArgIndices {
+ MODE = 1,
+ CMD = 2
+ };
/*
* Number of arguments...
@@ -739,9 +740,6 @@ TclChanCreateObjCmd(
Tcl_DecrRefCount(rcPtr->cmd);
Tcl_Free(rcPtr);
return TCL_ERROR;
-
-#undef MODE
-#undef CMD
}
/*
@@ -826,9 +824,10 @@ TclChanPostEventObjCmd(
*
* where EVENTSPEC = {read write ...} (Abbreviations allowed as well).
*/
-
-#define CHAN (1)
-#define EVENT (2)
+ enum ArgIndices {
+ CHAN = 1,
+ EVENT = 2
+ };
const char *chanId; /* Tcl level channel handle */
Tcl_Channel chan; /* Channel associated to the handle */
@@ -980,9 +979,6 @@ TclChanPostEventObjCmd(
Tcl_ResetResult(interp);
return TCL_OK;
-
-#undef CHAN
-#undef EVENT
}
/*
diff --git a/generic/tclIORTrans.c b/generic/tclIORTrans.c
index dce1a1c..d2853e2 100644
--- a/generic/tclIORTrans.c
+++ b/generic/tclIORTrans.c
@@ -529,9 +529,10 @@ TclChanPushObjCmd(
* Actually: rPush CHANNEL CMDPREFIX
* [0] [1] [2]
*/
-
-#define CHAN (1)
-#define CMD (2)
+ enum ArgIndices {
+ CHAN = 1,
+ CMD = 2
+ };
/*
* Number of arguments...
@@ -714,9 +715,6 @@ TclChanPushObjCmd(
Tcl_EventuallyFree(rtPtr, FreeReflectedTransform);
return TCL_ERROR;
-
-#undef CHAN
-#undef CMD
}
/*
@@ -751,8 +749,9 @@ TclChanPopObjCmd(
* Actually: rPop CHANNEL
* [0] [1]
*/
-
-#define CHAN (1)
+ enum ArgIndices {
+ CHAN = 1
+ };
const char *chanId; /* Tcl level channel handle */
Tcl_Channel chan; /* Channel associated to the handle */
@@ -786,8 +785,6 @@ TclChanPopObjCmd(
Tcl_UnstackChannel(interp, chan);
return TCL_OK;
-
-#undef CHAN
}
/*
diff --git a/tests/utfext.test b/tests/utfext.test
index 0c5601c..4b15a8d 100644
--- a/tests/utfext.test
+++ b/tests/utfext.test
@@ -37,16 +37,11 @@ namespace eval utftest {
# 4 external fragmentation index - where to split field 2 for fragmentation
# tests. -1 to skip
#
- # cesu-8 tests disabled because of bug [304d30677a] - TODO
- # cesu-8 {
- # {bmp {41 c3a9 42} {41 c3a9 42} 2 2}
- # {nonbmp {41 f09f9880 42} {41 eda0bd edb080 42} 3 3}
- # {null {41 c080 42} {41 00 42} 2 -1}
- # }
-
+ # THE HEX DEFINITIONS SHOULD SEPARATE EACH CHARACTER BY WHITESPACE
+ # (assumed by the charlimit tests)
lappend utfExtMap {*}{
ascii {
- {basic 414243 414243 -1 -1}
+ {basic {41 42 43} {41 42 43} -1 -1}
}
utf-8 {
{bmp {41 c3a9 42} {41 c3a9 42} 2 2}
@@ -55,6 +50,13 @@ namespace eval utftest {
{nonbmp-frag-3 {41 f09f9880 42} {41 f09f9880 42} 4 4}
{null {41 c080 42} {41 00 42} 2 -1}
}
+ cesu-8 {
+ {bmp {41 c3a9 42} {41 c3a9 42} 2 2}
+ {nonbmp-frag-surr-low {41 f09f9880 42} {41 eda0bd edb880 42} 2 2}
+ {nonbmp-split-surr {41 f09f9880 42} {41 eda0bd edb880 42} 3 -1}
+ {nonbmp-frag-surr-high {41 f09f9880 42} {41 eda0bd edb880 42} 4 6}
+ {null {41 c080 42} {41 00 42} 2 -1}
+ }
utf-16le {
{bmp {41 c3a9 42} {4100 e900 4200} 2 3}
{nonbmp {41 f09f9880 42} {4100 3dd8 00de 4200} 4 3}
@@ -78,12 +80,12 @@ namespace eval utftest {
{null {41 c080 42} {00000041 00000000 00000042} 2 3}
}
iso8859-1 {
- {basic {41 c3a9 42} 41e942 2 -1}
- {null {41 c080 42} 410042 2 -1}
+ {basic {41 c3a9 42} {41 e9 42} 2 -1}
+ {null {41 c080 42} {41 00 42} 2 -1}
}
iso8859-3 {
- {basic {41 c4a0 42} 41d542 2 -1}
- {null {41 c080 42} 410042 2 -1}
+ {basic {41 c4a0 42} {41 d5 42} 2 -1}
+ {null {41 c080 42} {41 00 42} 2 -1}
}
shiftjis {
{basic {41 e4b98e 42} {41 8cc1 42} 3 2}
@@ -153,7 +155,7 @@ namespace eval utftest {
}
}
- proc testfragment {direction enc comment hexin hexout fragindex} {
+ proc testfragment {direction enc comment hexin hexout fragindex args} {
if {$fragindex < 0} {
# Single byte encodings so no question of fragmentation
@@ -167,26 +169,62 @@ namespace eval utftest {
set cmd Tcl_UtfToExternal
}
+ set status1 multibyte; # Return status to expect after first call
+ while {[llength $args] > 1} {
+ set opt [lpop args 0]
+ switch $opt {
+ -status1 { set status1 [lpop args 0]}
+ default {
+ error "Unknown option \"$opt\""
+ }
+ }
+ }
+
set in [binary decode hex $hexin]
set infrag [string range $in 0 $fragindex-1]
set out [binary decode hex $hexout]
set dstlen 40 ;# Should be enough for all encoding tests
- set expected_result {}
- append expected_result multibyte $fragindex
-
test $cmd-$enc-$id "$cmd - $enc - $hexin - frag" -constraints testencoding -body {
set frag1Result [testencoding $cmd $enc [string range $in 0 $fragindex-1] {start} 0 $dstlen frag1Read frag1Written]
lassign $frag1Result frag1Status frag1State frag1Decoded
set frag2Result [testencoding $cmd $enc [string range $in $frag1Read end] {end} $frag1State $dstlen frag2Read frag2Written]
lassign $frag2Result frag2Status frag2State frag2Decoded
set decoded [string cat [string range $frag1Decoded 0 $frag1Written-1] [string range $frag2Decoded 0 $frag2Written-1]]
- list $frag1Status [expr {$frag1Read < $fragindex}] \
+ list $frag1Status [expr {$frag1Read <= $fragindex}] \
$frag2Status [expr {$frag1Read+$frag2Read}] \
[expr {$frag1Written+$frag2Written}] $decoded
- } -result [list multibyte 1 ok [string length $in] [string length $out] $out]
+ } -result [list $status1 1 ok [string length $in] [string length $out] $out]
+ }
+
+ proc testcharlimit {direction enc comment hexin hexout} {
+ set id $comment-[join $hexin ""]-charlimit
+
+ if {$direction eq "toutf"} {
+ set cmd Tcl_ExternalToUtf
+ } else {
+ set cmd Tcl_UtfToExternal
+ }
+
+ set maxchars [llength $hexout]
+ set in [binary decode hex $hexin]
+ set out [binary decode hex $hexout]
+ set dstlen 40 ;# Should be enough for all encoding tests
+
+ for {set nchars 0} {$nchars <= $maxchars} {incr nchars} {
+ set expected_bytes [binary decode hex [lrange $hexout 0 $nchars-1]]
+ set expected_nwritten [string length $expected_bytes]
+ test $cmd-$enc-$id-$nchars "$cmd - $enc - $hexin - nchars $nchars" -constraints testencoding -body {
+ set charlimit $nchars
+ lassign [testencoding $cmd $enc $in \
+ {start end charlimit} 0 $dstlen nread nwritten charlimit] \
+ status state buf
+ list $status $nwritten [string range $buf 0 $nwritten-1]
+ } -result [list [expr {$nchars == $maxchars ? "ok" : "nospace"}] $expected_nwritten $expected_bytes]
+ }
}
+
#
# Basic tests
foreach {enc testcases} $utfExtMap {
@@ -205,11 +243,23 @@ namespace eval utftest {
# should have no effect in other direction
testutf fromutf $enc $comment $utfhex $hex$encnuls -flags {start end noterminate}
+ # Fragments
testfragment toutf $enc $comment $hex $utfhex $externalfragindex
testfragment fromutf $enc $comment $utfhex $hex $internalfragindex
+
+ # Char limits - note no fromutf as Tcl_UtfToExternal does not support it
+ if {![string match utf-16* $enc] && $enc ne "cesu-8"} {
+ # TODO - utf16 hangs
+ testcharlimit toutf $enc $comment $hex $utfhex
+ }
}
}
+ # Special cases - cesu2 high and low surrogates in separate fragments
+ # This will (correctly) return "ok", not "multibyte" after first frag
+ testfragment toutf cesu-8 nonbmp-split-surr \
+ {41 eda0bd edb880 42} {41 f09f9880 42} 4 -status1 ok
+
# Bug regression tests
test Tcl_UtfToExternal-bug-183a1adcc0 {buffer overflow} -body {
testencoding Tcl_UtfToExternal utf-16 A {start end} {} 1