From 607601abc11ec2e965fedc5d3cb1e6d83c3a4a10 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Fri, 28 Feb 2014 18:25:20 +0000
Subject: More ReadChars rewriting.  Test suite now passes. Note that this
 reform simplifies ReadChars a fair bit (at least in my eyes). Also it does
 away with the use of an INPUT_NEED_NL flag, using the same strategy for
 partial \r\n sequences as is used for incomplete multibyte chars.

---
 generic/tclIO.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 173 insertions(+), 21 deletions(-)

diff --git a/generic/tclIO.c b/generic/tclIO.c
index 20428b5..ec71991 100644
--- a/generic/tclIO.c
+++ b/generic/tclIO.c
@@ -5403,7 +5403,6 @@ ReadChars(
 	 * srcLen bytes, write no more than dstLimit bytes.
 	 */
 
-//fprintf(stdout, "Start %d %d\n", dstLimit, srcLen); fflush(stdout);
 	int code = Tcl_ExternalToUtf(NULL, statePtr->encoding, src, srcLen,
 		statePtr->inputEncodingFlags & (bufPtr->nextPtr
 		? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState,
@@ -5416,9 +5415,6 @@ ReadChars(
 	 * Capture the number of bytes actually consumed in dstRead.
 	 */
 
-//fprintf(stdout, "Key NS=%d MB=%d S=%d\n", TCL_CONVERT_NOSPACE,
-//TCL_CONVERT_MULTIBYTE, TCL_CONVERT_SYNTAX); fflush(stdout);
-//fprintf(stdout, "Decoded %d %d\n", dstDecoded,code); fflush(stdout);
 	dstWrote = dstRead = dstDecoded;
 	TranslateInputEOL(statePtr, dst, dst, &dstWrote, &dstRead);
 
@@ -5426,20 +5422,144 @@ ReadChars(
 
 	    /*
 	     * The encoding transformation produced bytes that the
-	     * translation transformation did not consume.  Start over
-	     * and impose new limits so that doesn't happen again.
+	     * translation transformation did not consume.  Why did
+	     * this happen?
 	     */
-//fprintf(stdout, "X! %d %d\n", dstRead, dstDecoded); fflush(stdout);
 
-	    dstLimit = dstRead + TCL_UTF_MAX;
-	    statePtr->flags = savedFlags;
-	    statePtr->inputEncodingFlags = savedIEFlags;
-	    statePtr->inputEncodingState = savedState;
-	    continue;
-	}
+	    if (statePtr->inEofChar && dst[dstRead] == statePtr->inEofChar) {
+		/*
+		 * 1) There's an eof char set on the channel, and
+		 *    we saw it and stopped translating at that point.
+		 *
+		 * NOTE the bizarre spec of TranslateInputEOL in this case.
+		 * Clearly the eof char had to be read in order to account
+		 * for the stopping, but the value of dstRead does not
+		 * include it.
+		 *
+		 * Also rather bizarre, our caller can only notice an
+		 * EOF condition if we return the value -1 as the number
+		 * of chars read.  This forces us to perform a 2-call
+		 * dance where the first call can read all the chars
+		 * up to the eof char, and the second call is solely
+		 * for consuming the encoded eof char then pointed at
+		 * by src so that we can return that magic -1 value.
+		 * This seems really wasteful, especially since
+		 * the first decoding pass of each call is likely to
+		 * decode many bytes beyond that eof char that's all we
+		 * care about.
+		 */
+
+		if (dstRead == 0) {
+		    /*
+		     * Curious choice in the eof char handling.  We leave
+		     * the eof char in the buffer.  So, no need to compute
+		     * a proper srcRead value.  At this point, there
+		     * are no chars before the eof char in the buffer.
+		     */
+		    return -1;
+		}
+
+		{
+		    /*
+		     * There are chars leading the buffer before the eof
+		     * char.  Adjust the dstLimit so we go back and read
+		     * only those and do not encounter the eof char this
+		     * time.
+		     */
+
+		    dstLimit = dstRead + TCL_UTF_MAX;
+		    statePtr->flags = savedFlags;
+		    statePtr->inputEncodingFlags = savedIEFlags;
+		    statePtr->inputEncodingState = savedState;
+		    continue;
+		}
+	    }
+
+	    /*
+	     * 2) The other way to read fewer bytes than are decoded
+	     *    is when the final byte is \r and we're in a CRLF
+	     *    translation mode so we cannot decide whether to
+	     *	  record \r or \n yet.
+	     */
+
+	    assert(dstRead + 1 == dstDecoded);
+	    assert(dst[dstRead] == '\r');
+	    assert(statePtr->inputTranslation == TCL_TRANSLATE_CRLF);
+
+	    if (dstWrote > 0) {
+		/*
+		 * There are chars we can read before we hit the bare cr.
+		 * Go back with a smaller dstLimit so we get them in the
+		 * next pass, compute a matching srcRead, and don't end
+		 * up back here in this call.
+		 */
+
+		dstLimit = dstRead + TCL_UTF_MAX;
+		statePtr->flags = savedFlags;
+		statePtr->inputEncodingFlags = savedIEFlags;
+		statePtr->inputEncodingState = savedState;
+		continue;
+	    }
+
+	    assert(dstWrote == 0);
+	    assert(dstRead == 0);
+	    assert(dstDecoded == 1);
+
+	    /*
+	     * We decoded only the bare cr, and we cannot read a
+	     * translated char from that alone.  We have to know what's
+	     * next.  So why do we only have the one decoded char?
+	     */
+
+	    if (code != TCL_OK) {
+		char buffer[TCL_UTF_MAX + 2];
+		int read, decoded, count;
+
+		/* 
+		 * Didn't get everything the buffer could offer
+		 */
+
+		statePtr->flags = savedFlags;
+		statePtr->inputEncodingFlags = savedIEFlags;
+		statePtr->inputEncodingState = savedState;
+
+		Tcl_ExternalToUtf(NULL, statePtr->encoding, src, srcLen,
+		statePtr->inputEncodingFlags & (bufPtr->nextPtr
+		? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState,
+		buffer, TCL_UTF_MAX + 2, &read, &decoded, &count);
+
+		if (count == 2) {
+		    if (buffer[1] == '\n') {
+			/* \r\n translate to \n */
+			dst[0] = '\n';
+			bufPtr->nextRemoved += read;
+		    } else {
+			dst[0] = '\r';
+			bufPtr->nextRemoved += srcRead;
+		    }
+
+		    dst[1] = '\0';
+		    statePtr->inputEncodingFlags &= ~TCL_ENCODING_START;
+
+		    *offsetPtr += 1;
+		    return 1;
+		}
 
-//fprintf(stdout, "check %d %d %d\n", dstWrote, dstRead, dstDecoded);
-//fflush(stdout);
+	    } else if (statePtr->flags & CHANNEL_EOF) {
+
+		/*
+		 * The bare \r is the only char and we will never read
+		 * a subsequent char to make the determination.
+		 */
+
+		dst[0] = '\r';
+		bufPtr->nextRemoved = bufPtr->nextAdded;
+		*offsetPtr += 1;
+		return 1;
+	    }
+
+	    /* FALL THROUGH - get more data (dstWrote == 0) */
+	}
 
 	/* 
 	 * The translation transformation can only reduce the number
@@ -5455,7 +5575,6 @@ ReadChars(
 	     * We read more chars than allowed.  Reset limits to
 	     * prevent that and try again.
 	     */
-//fprintf(stdout, "Y!\n"); fflush(stdout);
 
 	    dstLimit = Tcl_UtfAtIndex(dst, charsToRead + 1) - dst;
 	    statePtr->flags = savedFlags;
@@ -5466,12 +5585,46 @@ ReadChars(
 
 	if (dstWrote == 0) {
 
-//fprintf(stdout, "Z!\n"); fflush(stdout);
-	    /* 
-	     * Could not read anything.  Ask caller to get more data.
+	    /*
+	     * We were not able to read any chars.  Maybe there were
+	     * not enough src bytes to decode into a char.  Maybe
+	     * a lone \r could not be translated (crlf mode).  Need
+	     * to combine any unused src bytes we have in the first
+	     * buffer with subsequent bytes to try again.
 	     */
 
-	    return -1;
+	    ChannelBuffer *nextPtr = bufPtr->nextPtr;
+
+	    if (nextPtr == NULL) {
+		if (srcLen > 0) {
+		    SetFlag(statePtr, CHANNEL_NEED_MORE_DATA);
+		}
+		return -1;
+	    }
+
+	    /*
+	     * Space is made at the beginning of the buffer to copy the
+	     * previous unused bytes there. Check first if the buffer we
+	     * are using actually has enough space at its beginning for
+	     * the data we are copying.  Because if not we will write over
+	     * the buffer management information, especially the 'nextPtr'.
+	     *
+	     * Note that the BUFFER_PADDING (See AllocChannelBuffer) is
+	     * used to prevent exactly this situation. I.e. it should never
+	     * happen.  Therefore it is ok to panic should it happen despite
+	     * the precautions.
+	     */
+
+	    if (nextPtr->nextRemoved - srcLen < 0) {
+		Tcl_Panic("Buffer Underflow, BUFFER_PADDING not enough");
+	    }
+
+	    nextPtr->nextRemoved -= srcLen;
+	    memcpy(RemovePoint(nextPtr), src, (size_t) srcLen);
+	    RecycleBuffer(statePtr, bufPtr, 0);
+	    statePtr->inQueueHead = nextPtr;
+	    return ReadChars(statePtr, objPtr, charsToRead,
+		    offsetPtr, factorPtr);
 	}
 
 	statePtr->inputEncodingFlags &= ~TCL_ENCODING_START;
@@ -5481,7 +5634,6 @@ ReadChars(
 	    *factorPtr = dstWrote * UTF_EXPANSION_FACTOR / srcRead;
 	}
 	*offsetPtr += dstWrote;
-//fprintf(stdout, "OK: %d\n", numChars); fflush(stdout);
 	return numChars;
     }
 
-- 
cgit v0.12