3 files changed, 111 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index 3f1b105..9ace770 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,25 @@
+2006-04-03  Andreas Kupries <andreask@activestate.com>
+
+	* generic/tclIO.c (ReadChars): Added check and panic and
+	  commentary to a piece of code which relies on BUFFER_PADDING to
+	  create enough space at the beginning of each buffer forthe
+	  insertion of partial multi-byte data at the beginning of a
+	  buffer. To explain why this code is ok, and as precaution if
+	  someone twiddled the BUFFER_PADDING into uselessness.
+	
+	* generic/tclIO.c (ReadChars): [SF Tcl Bug 1462248]. Added code
+	  temporarily suppress the use of TCL_ENCODING_END set when eof
+	  was reached while the buffer we are converting is not truly the
+	  last buffer in the queue. together with the Utf bug below it was
+	  possible to completely bollox the buffer data structures,
+	  eventually crashing Tcl.
+
+	* generic/tclEncoding.c (UtfToUtfProc): Fixed problem where the
+	  function accessed memory beyond the end of the input
+	  buffer. When TCL_ENCODING_END is set and the last bytes of the
+	  buffer start a multi-byte sequence. This bug contributed to [SF
+	  Tcl Bug 1462248].
+
 2006-03-28  Jeff Hobbs  <jeffh@ActiveState.com>
 
 	* win/configure, win/tcl.m4: define MACHINE for gcc builds as well.
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 5fa799e..2a3698f 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -8,7 +8,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.9 2006/03/13 20:57:09 dgp Exp $
+ * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.10 2006/04/05 00:06:02 andreas_kupries Exp $
  */
 
 #include "tclInt.h"
@@ -2083,13 +2083,23 @@ UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
 	     */
 	    *dst++ = 0;
 	    src += 2;
+	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
+	    /* Always check before using Tcl_UtfToUniChar. Not doing
+	     * can so cause it run beyond the endof the buffer!  If we
+	     * * happen such an incomplete char its byts are made to *
+	     * represent themselves.
+	     */
+
+	    ch = (Tcl_UniChar) *src;
+	    src += 1;
+	    dst += Tcl_UniCharToUtf(ch, dst);
 	} else {
 	    src += Tcl_UtfToUniChar(src, &ch);
 	    dst += Tcl_UniCharToUtf(ch, dst);
 	}
     }
 
-    *srcReadPtr = src - srcStart;
+    *srcReadPtr  = src - srcStart;
     *dstWrotePtr = dst - dstStart;
     *dstCharsPtr = numChars;
     return result;
diff --git a/generic/tclIO.c b/generic/tclIO.c
index c7c2e3f..f82f648 100644
--- a/generic/tclIO.c
+++ b/generic/tclIO.c
@@ -10,7 +10,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclIO.c,v 1.61.2.19 2006/03/10 14:12:14 vasiljevic Exp $
+ * RCS: @(#) $Id: tclIO.c,v 1.61.2.20 2006/04/05 00:06:03 andreas_kupries Exp $
  */
 
 #include "tclInt.h"
@@ -4727,12 +4727,13 @@ ReadChars(statePtr, objPtr, charsToRead, offsetPtr, factorPtr)
     ChannelBuffer *bufPtr;
     char *src, *dst;
     Tcl_EncodingState oldState;
+    int encEndFlagSuppressed = 0;
 
     factor = *factorPtr;
     offset = *offsetPtr;
 
     bufPtr = statePtr->inQueueHead; 
-    src = bufPtr->buf + bufPtr->nextRemoved;
+    src    = bufPtr->buf + bufPtr->nextRemoved;
     srcLen = bufPtr->nextAdded - bufPtr->nextRemoved;
 
     toRead = charsToRead;
@@ -4776,6 +4777,54 @@ ReadChars(statePtr, objPtr, charsToRead, offsetPtr, factorPtr)
     }
     dst = objPtr->bytes + offset;
 
+    /*
+     * SF Tcl Bug 1462248
+     * The cause of the crash reported in the referenced bug is this:
+     *
+     * - ReadChars, called with a single buffer, with a incomplete
+     *   multi-byte character at the end (only the first byte of it).
+     * - Encoding translation fails, asks for more data
+     * - Data is read, and eof is reached, TCL_ENCODING_END (TEE) is set.
+     * - ReadChar is called again, converts the first buffer, but due
+     *   to TEE it does not check for incomplete multi-byte data, and the
+     *   character just after the end of the first buffer is a valid
+     *   completion of the multi-byte header in the actual buffer. The
+     *   conversion reads more characters from the buffer then present.
+     *   This causes nextRemoved to overshoot nextAdded and the next
+     *   reads compute a negative srcLen, cause further translations to
+     *   fail, causing copying of data into the next buffer using bad
+     *   arguments, causing the mecpy for to eventually fail.
+     *
+     * In the end it is a memory access bug spiraling out of control
+     * if the conditions are _just so_. And ultimate cause is that TEE
+     * is given to a conversion where it should not. TEE signals that
+     * this is the last buffer. Except in our case it is not.
+     *
+     * My solution is to suppress TEE if the first buffer is not the
+     * last. We will eventually need it given that EOF has been
+     * reached, but not right now. This is what the new flag
+     * "endEncSuppressFlag" is for.
+     *
+     * The bug in 'Tcl_Utf2UtfProc' where it read from memory behind
+     * the actual buffer has been fixed as well, and fixes the problem
+     * with the crash too, but this would still allow the generic
+     * layer to accidentially break a multi-byte sequence if the
+     * conditions are just right, because again the ExternalToUtf
+     * would be successful where it should not.
+     */
+
+    if ((statePtr->inputEncodingFlags & TCL_ENCODING_END) &&
+	(bufPtr->nextPtr != NULL)) {
+
+        /* TEE is set for a buffer which is not the last. Squash it
+	 * for now, and restore it later, before yielding control to
+	 * our caller.
+	 */
+
+        statePtr->inputEncodingFlags &= ~TCL_ENCODING_END;
+        encEndFlagSuppressed = 1;
+    }
+
     oldState = statePtr->inputEncodingState;
     if (statePtr->flags & INPUT_NEED_NL) {
 	/*
@@ -4801,12 +4850,21 @@ ReadChars(statePtr, objPtr, charsToRead, offsetPtr, factorPtr)
 	}
 	statePtr->inputEncodingFlags &= ~TCL_ENCODING_START;
 	*offsetPtr += 1;
+
+	if (encEndFlagSuppressed) {
+	    statePtr->inputEncodingFlags |= TCL_ENCODING_END;
+	}
         return 1;
     }
 
     Tcl_ExternalToUtf(NULL, statePtr->encoding, src, srcLen,
 	    statePtr->inputEncodingFlags, &statePtr->inputEncodingState, dst,
 	    dstNeeded + TCL_UTF_MAX, &srcRead, &dstWrote, &numChars);
+
+    if (encEndFlagSuppressed) {
+        statePtr->inputEncodingFlags |= TCL_ENCODING_END;
+    }
+
     if (srcRead == 0) {
 	/*
 	 * Not enough bytes in src buffer to make a complete char.  Copy
@@ -4837,6 +4895,23 @@ ReadChars(statePtr, objPtr, charsToRead, offsetPtr, factorPtr)
 	    }
 	    return -1;
 	}
+
+	/* Space is made at the beginning of the buffer to copy the
+	 * previous unused bytes there. Check first if the buffer we
+	 * are using actually has enough space at its beginning for
+	 * the data we are copying. Because if not we will write over the
+	 * buffer management information, especially the 'nextPtr'.
+	 *
+	 * Note that the BUFFER_PADDING (See AllocChannelBuffer) is
+	 * used to prevent exactly this situation. I.e. it should
+	 * never happen. Therefore it is ok to panic should it happen
+	 * despite the precautions.
+	 */
+
+	if (nextPtr->nextRemoved - srcLen < 0) {
+	    Tcl_Panic ("Buffer Underflow, BUFFER_PADDING not enough");
+	}
+
 	nextPtr->nextRemoved -= srcLen;
 	memcpy((VOID *) (nextPtr->buf + nextPtr->nextRemoved), (VOID *) src,
 		(size_t) srcLen);