Merge trunk

author: apnadkarni <apnmbx-wits@yahoo.com> 2024-08-22 10:16:49 (GMT)
committer: apnadkarni <apnmbx-wits@yahoo.com> 2024-08-22 10:16:49 (GMT)
commit: 11d48cf9a94ffd0f61f698c1c81fca711e1a65b3 (patch)
tree: 9d238f83e8d81e6d0f66a24a4dd482c06473605f
parent: 5e67db1121363ac6be972bb2d779ad5c88c4a273 (diff)
parent: a64b3e4f6f9fc4141aa2211d311c5877006c7e08 (diff)
download: tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.zip
tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.tar.gz
tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.tar.bz2
4 files changed, 232 insertions, 55 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 95acfa9..d235911 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2424,7 +2424,11 @@ UtfToUtfProc(
     const char *src,		/* Source string in UTF-8. */
     int srcLen,			/* Source string length in bytes. */
     int flags,			/* TCL_ENCODING_* conversion control flags. */
-    TCL_UNUSED(Tcl_EncodingState *),
+    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
+				 * information used during a piecewise
+				 * conversion. Contents of statePtr are
+				 * initialized and/or reset by conversion
+				 * routine under control of flags argument. */
     char *dst,			/* Output buffer in which converted string is
 				 * stored. */
     int dstLen,			/* The maximum length of output buffer in
@@ -2447,6 +2451,10 @@ UtfToUtfProc(
     int ch;
     int profile;
 
+    if (flags & TCL_ENCODING_START) {
+        /* *statePtr will hold high surrogate in a split surrogate pair */
+    	*statePtr = 0;
+    }
     result = TCL_OK;
 
     srcStart = src;
@@ -2463,6 +2471,42 @@ UtfToUtfProc(
     flags |= PTR2INT(clientData);
     dstEnd = dst + dstLen - ((flags & ENCODING_UTF) ? TCL_UTF_MAX : 6);
 
+    /*
+     * Macro to output an isolated high surrogate when it is not followed
+     * by a low surrogate. NOT to be called for strict profile since
+     * that should raise an error.
+     */
+#define OUTPUT_ISOLATEDSURROGATE                                \
+    do {                                                        \
+        Tcl_UniChar high;                                       \
+        if (PROFILE_REPLACE(profile)) {                         \
+            high = UNICODE_REPLACE_CHAR;                        \
+        } else {                                                \
+            high = (Tcl_UniChar)(ptrdiff_t) *statePtr;          \
+        }                                                       \
+        assert(!(flags & ENCODING_UTF)); /* Must be CESU-8 */   \
+        assert(HIGH_SURROGATE(high));                           \
+        assert(!PROFILE_STRICT(profile));                       \
+        dst += Tcl_UniCharToUtf(high, dst);                     \
+        *statePtr = 0; /* Reset state */                        \
+    } while (0) 
+
+    /*
+     * Macro to check for isolated surrogate and either break with
+     * an error if profile is strict, or output an appropriate
+     * character for replace and tcl8 profiles and continue.
+     */
+#define CHECK_ISOLATEDSURROGATE                                         \
+    if (*statePtr) {                                                    \
+        if (PROFILE_STRICT(profile)) {                                  \
+            result = TCL_CONVERT_SYNTAX;                                \
+            break;                                                      \
+        }                                                               \
+        OUTPUT_ISOLATEDSURROGATE;                                       \
+        continue; /* Rerun loop so length checks etc. repeated */       \
+    } else                                                              \
+        (void) 0
+
     profile = ENCODING_PROFILE_GET(flags);
     for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
 
@@ -2481,6 +2525,8 @@ UtfToUtfProc(
 	}
 	if (UCHAR(*src) < 0x80
 		&& !((UCHAR(*src) == 0) && (flags & ENCODING_INPUT))) {
+
+            CHECK_ISOLATEDSURROGATE;
 	    /*
 	     * Copy 7bit characters, but skip null-bytes when we are in input
 	     * mode, so that they get converted to \xC0\x80.
@@ -2490,6 +2536,8 @@ UtfToUtfProc(
 		 (UCHAR(src[1]) == 0x80) &&
 		 (!(flags & ENCODING_INPUT) || !PROFILE_TCL8(profile))) {
 	    /* Special sequence \xC0\x80 */
+
+            CHECK_ISOLATEDSURROGATE;
 	    if (!PROFILE_TCL8(profile) && (flags & ENCODING_INPUT)) {
 		if (PROFILE_REPLACE(profile)) {
 		    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
@@ -2510,12 +2558,12 @@ UtfToUtfProc(
 
 	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
 	    /*
-	     * Incomplete byte sequence.
-	     * Always check before using Tcl_UtfToUniChar. Not doing so can cause
-	     * it to run beyond the end of the buffer! If we happen on such an
-	     * incomplete char its bytes are made to represent themselves unless
-	     * the user has explicitly asked to be told.
-	     */
+             * Incomplete byte sequence not because there are insufficient
+             * bytes in source buffer (have already checked that above) but
+             * because the UTF-8 sequence is truncated.
+             */
+
+            CHECK_ISOLATEDSURROGATE;
 
 	    if (flags & ENCODING_INPUT) {
 		/* Incomplete bytes for modified UTF-8 target */
@@ -2537,7 +2585,12 @@ UtfToUtfProc(
 	    }
 	    dst += Tcl_UniCharToUtf(ch, dst);
 	} else {
+            /* Have a complete character */
 	    size_t len = TclUtfToUniChar(src, &ch);
+
+            Tcl_UniChar savedSurrogate = (Tcl_UniChar) (ptrdiff_t)*statePtr;
+            *statePtr = 0; /* Reset surrogate */
+
 	    if (flags & ENCODING_INPUT) {
 		if (((len < 2) && (ch != 0))
 			|| ((ch > 0xFFFF) && !(flags & ENCODING_UTF))) {
@@ -2554,6 +2607,8 @@ UtfToUtfProc(
 	    src += len;
 	    if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT)
 		    && (ch > 0x3FF)) {
+                assert(savedSurrogate == 0); /* Since this flag combo
+                                                will never set *statePtr */
 		if (ch > 0xFFFF) {
 		    /* CESU-8 6-byte sequence for chars > U+FFFF */
 		    ch -= 0x10000;
@@ -2567,19 +2622,98 @@ UtfToUtfProc(
 		*dst++ = (char)((ch | 0x80) & 0xBF);
 		continue;
 	    } else if (SURROGATE(ch)) {
-		if (PROFILE_STRICT(profile)) {
-		    result = (flags & ENCODING_INPUT)
+                if ((flags & ENCODING_UTF)) {
+                    /* UTF-8, not CESU-8, so surrogates should not appear */
+                    if (PROFILE_STRICT(profile)) {
+                        result = (flags & ENCODING_INPUT)
 			    ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN;
-		    src = saveSrc;
-		    break;
-		} else if (PROFILE_REPLACE(profile)) {
-		    ch = UNICODE_REPLACE_CHAR;
-		}
-	    }
+                        src = saveSrc;
+                        break;
+                    } else if (PROFILE_REPLACE(profile)) {
+                        ch = UNICODE_REPLACE_CHAR;
+                    } else {
+                        /* PROFILE_TCL8 - output as is */
+                    }
+                } else {
+                    /* CESU-8 */
+                    if (LOW_SURROGATE(ch)) {
+                        if (savedSurrogate) {
+                            assert(HIGH_SURROGATE(savedSurrogate));
+                            ch = 0x10000 + ((savedSurrogate - 0xd800) << 10) + (ch - 0xdc00);
+                        } else {
+                            /* Isolated low surrogate */
+                            if (PROFILE_STRICT(profile)) {
+                                result = (flags & ENCODING_INPUT)
+                                    ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN;
+                                src = saveSrc;
+                                break;
+                            } else if (PROFILE_REPLACE(profile)) {
+                                ch = UNICODE_REPLACE_CHAR;
+                            } else {
+                                /* Tcl8 profile. Output low surrogate as is */
+                            }
+                        }
+                    } else {
+                        assert(HIGH_SURROGATE(ch));
+                        /* Save the high surrogate */
+                        *statePtr = (Tcl_EncodingState) (ptrdiff_t) ch;
+                        if (savedSurrogate) {
+                            assert(HIGH_SURROGATE(savedSurrogate));
+                            if (PROFILE_STRICT(profile)) {
+                                result = (flags & ENCODING_INPUT)
+                                    ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN;
+                                src = saveSrc;
+                                break;
+                            } else if (PROFILE_REPLACE(profile)) {
+                                ch = UNICODE_REPLACE_CHAR;
+                            } else {
+                                /* Output the isolated high surrogate */
+                                ch = savedSurrogate;
+                            }
+                        } else {
+                            /* High surrogate saved in *statePtr. Do not output anything just yet. */
+                            --numChars; /* Cancel the increment at end of loop */
+                            continue;
+                        }
+                    }
+                }
+	    } else {
+                /* Normal character */
+                CHECK_ISOLATEDSURROGATE;
+            }
+
 	    dst += Tcl_UniCharToUtf(ch, dst);
 	}
     }
 
+    /* Check if an high surrogate left over */
+    if (*statePtr) {
+        assert(!(flags & ENCODING_UTF)); /* CESU-8, Not UTF-8 */
+        if (!(flags & TCL_ENCODING_END)) {
+            /* More data coming */
+        } else {
+            /* No more data coming */
+            if (PROFILE_STRICT(profile)) {
+                result = (flags & ENCODING_INPUT)
+                    ? TCL_CONVERT_SYNTAX : TCL_CONVERT_UNKNOWN;
+            } else {
+                if (PROFILE_REPLACE(profile)) {
+                    ch = UNICODE_REPLACE_CHAR;
+                } else {
+                    ch = (Tcl_UniChar) (ptrdiff_t) *statePtr;
+                }
+                if (dst < dstEnd) {
+                    dst += Tcl_UniCharToUtf(ch, dst);
+                    ++numChars;
+                } else {
+                    /* No room in destination */
+                    result = TCL_CONVERT_NOSPACE;
+                }
+            }
+        }
+
+    }
+
     *srcReadPtr = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
diff --git a/generic/tclIORChan.c b/generic/tclIORChan.c
index c8449aa..859366f 100644
--- a/generic/tclIORChan.c
+++ b/generic/tclIORChan.c
@@ -522,9 +522,10 @@ TclChanCreateObjCmd(
      * Actually: rCreate MODE CMDPREFIX
      *           [0]     [1]  [2]
      */
-
-#define MODE	(1)
-#define CMD	(2)
+    enum ArgIndices {
+	MODE = 1,
+	CMD = 2
+    };
 
     /*
      * Number of arguments...
@@ -739,9 +740,6 @@ TclChanCreateObjCmd(
     Tcl_DecrRefCount(rcPtr->cmd);
     Tcl_Free(rcPtr);
     return TCL_ERROR;
-
-#undef MODE
-#undef CMD
 }
 
 /*
@@ -826,9 +824,10 @@ TclChanPostEventObjCmd(
      *
      * where EVENTSPEC = {read write ...} (Abbreviations allowed as well).
      */
-
-#define CHAN	(1)
-#define EVENT	(2)
+    enum ArgIndices {
+	CHAN = 1,
+	EVENT = 2
+    };
 
     const char *chanId;		/* Tcl level channel handle */
     Tcl_Channel chan;		/* Channel associated to the handle */
@@ -980,9 +979,6 @@ TclChanPostEventObjCmd(
 
     Tcl_ResetResult(interp);
     return TCL_OK;
-
-#undef CHAN
-#undef EVENT
 }
 
 /*
diff --git a/generic/tclIORTrans.c b/generic/tclIORTrans.c
index dce1a1c..d2853e2 100644
--- a/generic/tclIORTrans.c
+++ b/generic/tclIORTrans.c
@@ -529,9 +529,10 @@ TclChanPushObjCmd(
      * Actually: rPush CHANNEL CMDPREFIX
      *           [0]   [1]     [2]
      */
-
-#define CHAN	(1)
-#define CMD	(2)
+    enum ArgIndices {
+	CHAN = 1,
+	CMD = 2
+    };
 
     /*
      * Number of arguments...
@@ -714,9 +715,6 @@ TclChanPushObjCmd(
 
     Tcl_EventuallyFree(rtPtr, FreeReflectedTransform);
     return TCL_ERROR;
-
-#undef CHAN
-#undef CMD
 }
 
 /*
@@ -751,8 +749,9 @@ TclChanPopObjCmd(
      * Actually: rPop CHANNEL
      *           [0]  [1]
      */
-
-#define CHAN	(1)
+    enum ArgIndices {
+	CHAN = 1
+    };
 
     const char *chanId;		/* Tcl level channel handle */
     Tcl_Channel chan;		/* Channel associated to the handle */
@@ -786,8 +785,6 @@ TclChanPopObjCmd(
 
     Tcl_UnstackChannel(interp, chan);
     return TCL_OK;
-
-#undef CHAN
 }
 
 /*
diff --git a/tests/utfext.test b/tests/utfext.test
index 0c5601c..4b15a8d 100644
--- a/tests/utfext.test
+++ b/tests/utfext.test
@@ -37,16 +37,11 @@ namespace eval utftest {
     # 4 external fragmentation index - where to split field 2 for fragmentation
     #   tests. -1 to skip
     #
-    # cesu-8 tests disabled because of bug [304d30677a] - TODO
-    # cesu-8 {
-        # {bmp    {41 c3a9 42}     {41 c3a9 42}      2  2}
-        # {nonbmp {41 f09f9880 42} {41 eda0bd edb080 42} 3 3}
-        # {null   {41 c080 42} {41 00 42}    2 -1}
-    # }
-
+    # THE HEX DEFINITIONS SHOULD SEPARATE EACH CHARACTER BY WHITESPACE
+    # (assumed by the charlimit tests)
     lappend utfExtMap {*}{
         ascii {
-            {basic 414243 414243 -1 -1}
+            {basic {41 42 43} {41 42 43} -1 -1}
         }
         utf-8 {
             {bmp    {41 c3a9 42}     {41 c3a9 42}      2  2}
@@ -55,6 +50,13 @@ namespace eval utftest {
             {nonbmp-frag-3 {41 f09f9880 42} {41 f09f9880 42}  4  4}
             {null   {41 c080 42} {41 00 42}    2 -1}
         }
+        cesu-8 {
+            {bmp    {41 c3a9 42}     {41 c3a9 42}      2  2}
+            {nonbmp-frag-surr-low {41 f09f9880 42} {41 eda0bd edb880 42} 2 2}
+            {nonbmp-split-surr {41 f09f9880 42} {41 eda0bd edb880 42} 3 -1}
+            {nonbmp-frag-surr-high {41 f09f9880 42} {41 eda0bd edb880 42} 4 6}
+            {null   {41 c080 42} {41 00 42}    2 -1}
+        }
         utf-16le {
             {bmp    {41 c3a9 42}     {4100 e900 4200}     2  3}
             {nonbmp {41 f09f9880 42} {4100 3dd8 00de 4200} 4  3}
@@ -78,12 +80,12 @@ namespace eval utftest {
             {null   {41 c080 42} {00000041 00000000 00000042}  2 3}
         }
         iso8859-1 {
-            {basic {41 c3a9 42} 41e942 2 -1}
-            {null  {41 c080 42} 410042 2 -1}
+            {basic {41 c3a9 42} {41 e9 42} 2 -1}
+            {null  {41 c080 42} {41 00 42} 2 -1}
         }
         iso8859-3 {
-            {basic {41 c4a0 42} 41d542 2 -1}
-            {null  {41 c080 42} 410042 2 -1}
+            {basic {41 c4a0 42} {41 d5 42} 2 -1}
+            {null  {41 c080 42} {41 00 42} 2 -1}
         }
         shiftjis {
             {basic {41 e4b98e 42} {41 8cc1 42} 3 2}
@@ -153,7 +155,7 @@ namespace eval utftest {
         }
     }
 
-    proc testfragment {direction enc comment hexin hexout fragindex} {
+    proc testfragment {direction enc comment hexin hexout fragindex args} {
 
         if {$fragindex < 0} {
             # Single byte encodings so no question of fragmentation
@@ -167,26 +169,62 @@ namespace eval utftest {
             set cmd Tcl_UtfToExternal
         }
 
+        set status1 multibyte; # Return status to expect after first call
+        while {[llength $args] > 1} {
+            set opt [lpop args 0]
+            switch $opt {
+                -status1  { set status1 [lpop args 0]}
+                default {
+                    error "Unknown option \"$opt\""
+                }
+            }
+        }
+
         set in [binary decode hex $hexin]
         set infrag [string range $in 0 $fragindex-1]
         set out [binary decode hex $hexout]
         set dstlen 40 ;# Should be enough for all encoding tests
 
-        set expected_result {}
-        append expected_result multibyte $fragindex
-
         test $cmd-$enc-$id "$cmd - $enc - $hexin - frag" -constraints testencoding -body {
             set frag1Result [testencoding $cmd $enc [string range $in 0 $fragindex-1] {start} 0 $dstlen frag1Read frag1Written]
             lassign $frag1Result frag1Status frag1State frag1Decoded
             set frag2Result [testencoding $cmd $enc [string range $in $frag1Read end] {end} $frag1State $dstlen frag2Read frag2Written]
             lassign $frag2Result frag2Status frag2State frag2Decoded
             set decoded [string cat [string range $frag1Decoded 0 $frag1Written-1] [string range $frag2Decoded 0 $frag2Written-1]]
-            list $frag1Status [expr {$frag1Read < $fragindex}] \
+            list $frag1Status [expr {$frag1Read <= $fragindex}] \
                 $frag2Status [expr {$frag1Read+$frag2Read}] \
                 [expr {$frag1Written+$frag2Written}] $decoded
-        } -result [list multibyte 1 ok [string length $in] [string length $out] $out]
+        } -result [list $status1 1 ok [string length $in] [string length $out] $out]
+    }
+
+    proc testcharlimit {direction enc comment hexin hexout} {
+        set id $comment-[join $hexin ""]-charlimit
+
+        if {$direction eq "toutf"} {
+            set cmd Tcl_ExternalToUtf
+        } else {
+            set cmd Tcl_UtfToExternal
+        }
+
+        set maxchars [llength $hexout]
+        set in [binary decode hex $hexin]
+        set out [binary decode hex $hexout]
+        set dstlen 40 ;# Should be enough for all encoding tests
+
+        for {set nchars 0} {$nchars <= $maxchars} {incr nchars} {
+            set expected_bytes [binary decode hex [lrange $hexout 0 $nchars-1]]
+            set expected_nwritten [string length $expected_bytes]
+            test $cmd-$enc-$id-$nchars "$cmd - $enc - $hexin - nchars $nchars" -constraints testencoding -body {
+                set charlimit $nchars
+                lassign [testencoding $cmd $enc $in \
+                             {start end charlimit} 0 $dstlen nread nwritten charlimit] \
+                    status state buf
+                list $status $nwritten [string range $buf 0 $nwritten-1]
+            } -result [list [expr {$nchars == $maxchars ? "ok" : "nospace"}] $expected_nwritten $expected_bytes]
+        }
     }
 
+
     #
     # Basic tests
     foreach {enc testcases} $utfExtMap {
@@ -205,11 +243,23 @@ namespace eval utftest {
             # should have no effect in other direction
             testutf fromutf $enc $comment $utfhex $hex$encnuls -flags {start end noterminate}
 
+            # Fragments
             testfragment toutf $enc $comment $hex $utfhex $externalfragindex
             testfragment fromutf $enc $comment $utfhex $hex $internalfragindex
+
+            # Char limits - note no fromutf as Tcl_UtfToExternal does not support it
+            if {![string match utf-16* $enc] && $enc ne "cesu-8"} {
+                # TODO - utf16 hangs
+                testcharlimit toutf $enc $comment $hex $utfhex
+            }
         }
     }
 
+    # Special cases - cesu2 high and low surrogates in separate fragments
+    # This will (correctly) return "ok", not "multibyte" after first frag
+    testfragment toutf cesu-8 nonbmp-split-surr \
+        {41 eda0bd edb880 42} {41 f09f9880 42} 4 -status1 ok
+
     # Bug regression tests
     test Tcl_UtfToExternal-bug-183a1adcc0 {buffer overflow} -body {
         testencoding Tcl_UtfToExternal utf-16 A {start end} {} 1
author	apnadkarni <apnmbx-wits@yahoo.com>	2024-08-22 10:16:49 (GMT)
committer	apnadkarni <apnmbx-wits@yahoo.com>	2024-08-22 10:16:49 (GMT)
commit	11d48cf9a94ffd0f61f698c1c81fca711e1a65b3 (patch)
tree	9d238f83e8d81e6d0f66a24a4dd482c06473605f
parent	5e67db1121363ac6be972bb2d779ad5c88c4a273 (diff)
parent	a64b3e4f6f9fc4141aa2211d311c5877006c7e08 (diff)
download	tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.zip tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.tar.gz tcl-11d48cf9a94ffd0f61f698c1c81fca711e1a65b3.tar.bz2