From 607601abc11ec2e965fedc5d3cb1e6d83c3a4a10 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 28 Feb 2014 18:25:20 +0000 Subject: More ReadChars rewriting. Test suite now passes. Note that this reform simplifies ReadChars a fair bit (at least in my eyes). Also it does away with the use of an INPUT_NEED_NL flag, using the same strategy for partial \r\n sequences as is used for incomplete multibyte chars. --- generic/tclIO.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 173 insertions(+), 21 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index 20428b5..ec71991 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -5403,7 +5403,6 @@ ReadChars( * srcLen bytes, write no more than dstLimit bytes. */ -//fprintf(stdout, "Start %d %d\n", dstLimit, srcLen); fflush(stdout); int code = Tcl_ExternalToUtf(NULL, statePtr->encoding, src, srcLen, statePtr->inputEncodingFlags & (bufPtr->nextPtr ? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState, @@ -5416,9 +5415,6 @@ ReadChars( * Capture the number of bytes actually consumed in dstRead. */ -//fprintf(stdout, "Key NS=%d MB=%d S=%d\n", TCL_CONVERT_NOSPACE, -//TCL_CONVERT_MULTIBYTE, TCL_CONVERT_SYNTAX); fflush(stdout); -//fprintf(stdout, "Decoded %d %d\n", dstDecoded,code); fflush(stdout); dstWrote = dstRead = dstDecoded; TranslateInputEOL(statePtr, dst, dst, &dstWrote, &dstRead); @@ -5426,20 +5422,144 @@ ReadChars( /* * The encoding transformation produced bytes that the - * translation transformation did not consume. Start over - * and impose new limits so that doesn't happen again. + * translation transformation did not consume. Why did + * this happen? */ -//fprintf(stdout, "X! %d %d\n", dstRead, dstDecoded); fflush(stdout); - dstLimit = dstRead + TCL_UTF_MAX; - statePtr->flags = savedFlags; - statePtr->inputEncodingFlags = savedIEFlags; - statePtr->inputEncodingState = savedState; - continue; - } + if (statePtr->inEofChar && dst[dstRead] == statePtr->inEofChar) { + /* + * 1) There's an eof char set on the channel, and + * we saw it and stopped translating at that point. + * + * NOTE the bizarre spec of TranslateInputEOL in this case. + * Clearly the eof char had to be read in order to account + * for the stopping, but the value of dstRead does not + * include it. + * + * Also rather bizarre, our caller can only notice an + * EOF condition if we return the value -1 as the number + * of chars read. This forces us to perform a 2-call + * dance where the first call can read all the chars + * up to the eof char, and the second call is solely + * for consuming the encoded eof char then pointed at + * by src so that we can return that magic -1 value. + * This seems really wasteful, especially since + * the first decoding pass of each call is likely to + * decode many bytes beyond that eof char that's all we + * care about. + */ + + if (dstRead == 0) { + /* + * Curious choice in the eof char handling. We leave + * the eof char in the buffer. So, no need to compute + * a proper srcRead value. At this point, there + * are no chars before the eof char in the buffer. + */ + return -1; + } + + { + /* + * There are chars leading the buffer before the eof + * char. Adjust the dstLimit so we go back and read + * only those and do not encounter the eof char this + * time. + */ + + dstLimit = dstRead + TCL_UTF_MAX; + statePtr->flags = savedFlags; + statePtr->inputEncodingFlags = savedIEFlags; + statePtr->inputEncodingState = savedState; + continue; + } + } + + /* + * 2) The other way to read fewer bytes than are decoded + * is when the final byte is \r and we're in a CRLF + * translation mode so we cannot decide whether to + * record \r or \n yet. + */ + + assert(dstRead + 1 == dstDecoded); + assert(dst[dstRead] == '\r'); + assert(statePtr->inputTranslation == TCL_TRANSLATE_CRLF); + + if (dstWrote > 0) { + /* + * There are chars we can read before we hit the bare cr. + * Go back with a smaller dstLimit so we get them in the + * next pass, compute a matching srcRead, and don't end + * up back here in this call. + */ + + dstLimit = dstRead + TCL_UTF_MAX; + statePtr->flags = savedFlags; + statePtr->inputEncodingFlags = savedIEFlags; + statePtr->inputEncodingState = savedState; + continue; + } + + assert(dstWrote == 0); + assert(dstRead == 0); + assert(dstDecoded == 1); + + /* + * We decoded only the bare cr, and we cannot read a + * translated char from that alone. We have to know what's + * next. So why do we only have the one decoded char? + */ + + if (code != TCL_OK) { + char buffer[TCL_UTF_MAX + 2]; + int read, decoded, count; + + /* + * Didn't get everything the buffer could offer + */ + + statePtr->flags = savedFlags; + statePtr->inputEncodingFlags = savedIEFlags; + statePtr->inputEncodingState = savedState; + + Tcl_ExternalToUtf(NULL, statePtr->encoding, src, srcLen, + statePtr->inputEncodingFlags & (bufPtr->nextPtr + ? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState, + buffer, TCL_UTF_MAX + 2, &read, &decoded, &count); + + if (count == 2) { + if (buffer[1] == '\n') { + /* \r\n translate to \n */ + dst[0] = '\n'; + bufPtr->nextRemoved += read; + } else { + dst[0] = '\r'; + bufPtr->nextRemoved += srcRead; + } + + dst[1] = '\0'; + statePtr->inputEncodingFlags &= ~TCL_ENCODING_START; + + *offsetPtr += 1; + return 1; + } -//fprintf(stdout, "check %d %d %d\n", dstWrote, dstRead, dstDecoded); -//fflush(stdout); + } else if (statePtr->flags & CHANNEL_EOF) { + + /* + * The bare \r is the only char and we will never read + * a subsequent char to make the determination. + */ + + dst[0] = '\r'; + bufPtr->nextRemoved = bufPtr->nextAdded; + *offsetPtr += 1; + return 1; + } + + /* FALL THROUGH - get more data (dstWrote == 0) */ + } /* * The translation transformation can only reduce the number @@ -5455,7 +5575,6 @@ ReadChars( * We read more chars than allowed. Reset limits to * prevent that and try again. */ -//fprintf(stdout, "Y!\n"); fflush(stdout); dstLimit = Tcl_UtfAtIndex(dst, charsToRead + 1) - dst; statePtr->flags = savedFlags; @@ -5466,12 +5585,46 @@ ReadChars( if (dstWrote == 0) { -//fprintf(stdout, "Z!\n"); fflush(stdout); - /* - * Could not read anything. Ask caller to get more data. + /* + * We were not able to read any chars. Maybe there were + * not enough src bytes to decode into a char. Maybe + * a lone \r could not be translated (crlf mode). Need + * to combine any unused src bytes we have in the first + * buffer with subsequent bytes to try again. */ - return -1; + ChannelBuffer *nextPtr = bufPtr->nextPtr; + + if (nextPtr == NULL) { + if (srcLen > 0) { + SetFlag(statePtr, CHANNEL_NEED_MORE_DATA); + } + return -1; + } + + /* + * Space is made at the beginning of the buffer to copy the + * previous unused bytes there. Check first if the buffer we + * are using actually has enough space at its beginning for + * the data we are copying. Because if not we will write over + * the buffer management information, especially the 'nextPtr'. + * + * Note that the BUFFER_PADDING (See AllocChannelBuffer) is + * used to prevent exactly this situation. I.e. it should never + * happen. Therefore it is ok to panic should it happen despite + * the precautions. + */ + + if (nextPtr->nextRemoved - srcLen < 0) { + Tcl_Panic("Buffer Underflow, BUFFER_PADDING not enough"); + } + + nextPtr->nextRemoved -= srcLen; + memcpy(RemovePoint(nextPtr), src, (size_t) srcLen); + RecycleBuffer(statePtr, bufPtr, 0); + statePtr->inQueueHead = nextPtr; + return ReadChars(statePtr, objPtr, charsToRead, + offsetPtr, factorPtr); } statePtr->inputEncodingFlags &= ~TCL_ENCODING_START; @@ -5481,7 +5634,6 @@ ReadChars( *factorPtr = dstWrote * UTF_EXPANSION_FACTOR / srcRead; } *offsetPtr += dstWrote; -//fprintf(stdout, "OK: %d\n", numChars); fflush(stdout); return numChars; } -- cgit v0.12