summaryrefslogtreecommitdiffstats
path: root/generic
diff options
context:
space:
mode:
authordgp <dgp@users.sourceforge.net>2014-02-28 18:25:20 (GMT)
committerdgp <dgp@users.sourceforge.net>2014-02-28 18:25:20 (GMT)
commit607601abc11ec2e965fedc5d3cb1e6d83c3a4a10 (patch)
treef212e950a4a577e86f1c08c7c7222fb045f1f3b1 /generic
parent3df8548690a047e4fa9a445a253636a3e3a652df (diff)
downloadtcl-607601abc11ec2e965fedc5d3cb1e6d83c3a4a10.zip
tcl-607601abc11ec2e965fedc5d3cb1e6d83c3a4a10.tar.gz
tcl-607601abc11ec2e965fedc5d3cb1e6d83c3a4a10.tar.bz2
More ReadChars rewriting. Test suite now passes.
Note that this reform simplifies ReadChars a fair bit (at least in my eyes). Also it does away with the use of an INPUT_NEED_NL flag, using the same strategy for partial \r\n sequences as is used for incomplete multibyte chars.
Diffstat (limited to 'generic')
-rw-r--r--generic/tclIO.c194
1 files changed, 173 insertions, 21 deletions
diff --git a/generic/tclIO.c b/generic/tclIO.c
index 20428b5..ec71991 100644
--- a/generic/tclIO.c
+++ b/generic/tclIO.c
@@ -5403,7 +5403,6 @@ ReadChars(
* srcLen bytes, write no more than dstLimit bytes.
*/
-//fprintf(stdout, "Start %d %d\n", dstLimit, srcLen); fflush(stdout);
int code = Tcl_ExternalToUtf(NULL, statePtr->encoding, src, srcLen,
statePtr->inputEncodingFlags & (bufPtr->nextPtr
? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState,
@@ -5416,9 +5415,6 @@ ReadChars(
* Capture the number of bytes actually consumed in dstRead.
*/
-//fprintf(stdout, "Key NS=%d MB=%d S=%d\n", TCL_CONVERT_NOSPACE,
-//TCL_CONVERT_MULTIBYTE, TCL_CONVERT_SYNTAX); fflush(stdout);
-//fprintf(stdout, "Decoded %d %d\n", dstDecoded,code); fflush(stdout);
dstWrote = dstRead = dstDecoded;
TranslateInputEOL(statePtr, dst, dst, &dstWrote, &dstRead);
@@ -5426,20 +5422,144 @@ ReadChars(
/*
* The encoding transformation produced bytes that the
- * translation transformation did not consume. Start over
- * and impose new limits so that doesn't happen again.
+ * translation transformation did not consume. Why did
+ * this happen?
*/
-//fprintf(stdout, "X! %d %d\n", dstRead, dstDecoded); fflush(stdout);
- dstLimit = dstRead + TCL_UTF_MAX;
- statePtr->flags = savedFlags;
- statePtr->inputEncodingFlags = savedIEFlags;
- statePtr->inputEncodingState = savedState;
- continue;
- }
+ if (statePtr->inEofChar && dst[dstRead] == statePtr->inEofChar) {
+ /*
+ * 1) There's an eof char set on the channel, and
+ * we saw it and stopped translating at that point.
+ *
+ * NOTE the bizarre spec of TranslateInputEOL in this case.
+ * Clearly the eof char had to be read in order to account
+ * for the stopping, but the value of dstRead does not
+ * include it.
+ *
+ * Also rather bizarre, our caller can only notice an
+ * EOF condition if we return the value -1 as the number
+ * of chars read. This forces us to perform a 2-call
+ * dance where the first call can read all the chars
+ * up to the eof char, and the second call is solely
+ * for consuming the encoded eof char then pointed at
+ * by src so that we can return that magic -1 value.
+ * This seems really wasteful, especially since
+ * the first decoding pass of each call is likely to
+ * decode many bytes beyond that eof char that's all we
+ * care about.
+ */
+
+ if (dstRead == 0) {
+ /*
+ * Curious choice in the eof char handling. We leave
+ * the eof char in the buffer. So, no need to compute
+ * a proper srcRead value. At this point, there
+ * are no chars before the eof char in the buffer.
+ */
+ return -1;
+ }
+
+ {
+ /*
+ * There are chars leading the buffer before the eof
+ * char. Adjust the dstLimit so we go back and read
+ * only those and do not encounter the eof char this
+ * time.
+ */
+
+ dstLimit = dstRead + TCL_UTF_MAX;
+ statePtr->flags = savedFlags;
+ statePtr->inputEncodingFlags = savedIEFlags;
+ statePtr->inputEncodingState = savedState;
+ continue;
+ }
+ }
+
+ /*
+ * 2) The other way to read fewer bytes than are decoded
+ * is when the final byte is \r and we're in a CRLF
+ * translation mode so we cannot decide whether to
+ * record \r or \n yet.
+ */
+
+ assert(dstRead + 1 == dstDecoded);
+ assert(dst[dstRead] == '\r');
+ assert(statePtr->inputTranslation == TCL_TRANSLATE_CRLF);
+
+ if (dstWrote > 0) {
+ /*
+ * There are chars we can read before we hit the bare cr.
+ * Go back with a smaller dstLimit so we get them in the
+ * next pass, compute a matching srcRead, and don't end
+ * up back here in this call.
+ */
+
+ dstLimit = dstRead + TCL_UTF_MAX;
+ statePtr->flags = savedFlags;
+ statePtr->inputEncodingFlags = savedIEFlags;
+ statePtr->inputEncodingState = savedState;
+ continue;
+ }
+
+ assert(dstWrote == 0);
+ assert(dstRead == 0);
+ assert(dstDecoded == 1);
+
+ /*
+ * We decoded only the bare cr, and we cannot read a
+ * translated char from that alone. We have to know what's
+ * next. So why do we only have the one decoded char?
+ */
+
+ if (code != TCL_OK) {
+ char buffer[TCL_UTF_MAX + 2];
+ int read, decoded, count;
+
+ /*
+ * Didn't get everything the buffer could offer
+ */
+
+ statePtr->flags = savedFlags;
+ statePtr->inputEncodingFlags = savedIEFlags;
+ statePtr->inputEncodingState = savedState;
+
+ Tcl_ExternalToUtf(NULL, statePtr->encoding, src, srcLen,
+ statePtr->inputEncodingFlags & (bufPtr->nextPtr
+ ? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState,
+ buffer, TCL_UTF_MAX + 2, &read, &decoded, &count);
+
+ if (count == 2) {
+ if (buffer[1] == '\n') {
+ /* \r\n translate to \n */
+ dst[0] = '\n';
+ bufPtr->nextRemoved += read;
+ } else {
+ dst[0] = '\r';
+ bufPtr->nextRemoved += srcRead;
+ }
+
+ dst[1] = '\0';
+ statePtr->inputEncodingFlags &= ~TCL_ENCODING_START;
+
+ *offsetPtr += 1;
+ return 1;
+ }
-//fprintf(stdout, "check %d %d %d\n", dstWrote, dstRead, dstDecoded);
-//fflush(stdout);
+ } else if (statePtr->flags & CHANNEL_EOF) {
+
+ /*
+ * The bare \r is the only char and we will never read
+ * a subsequent char to make the determination.
+ */
+
+ dst[0] = '\r';
+ bufPtr->nextRemoved = bufPtr->nextAdded;
+ *offsetPtr += 1;
+ return 1;
+ }
+
+ /* FALL THROUGH - get more data (dstWrote == 0) */
+ }
/*
* The translation transformation can only reduce the number
@@ -5455,7 +5575,6 @@ ReadChars(
* We read more chars than allowed. Reset limits to
* prevent that and try again.
*/
-//fprintf(stdout, "Y!\n"); fflush(stdout);
dstLimit = Tcl_UtfAtIndex(dst, charsToRead + 1) - dst;
statePtr->flags = savedFlags;
@@ -5466,12 +5585,46 @@ ReadChars(
if (dstWrote == 0) {
-//fprintf(stdout, "Z!\n"); fflush(stdout);
- /*
- * Could not read anything. Ask caller to get more data.
+ /*
+ * We were not able to read any chars. Maybe there were
+ * not enough src bytes to decode into a char. Maybe
+ * a lone \r could not be translated (crlf mode). Need
+ * to combine any unused src bytes we have in the first
+ * buffer with subsequent bytes to try again.
*/
- return -1;
+ ChannelBuffer *nextPtr = bufPtr->nextPtr;
+
+ if (nextPtr == NULL) {
+ if (srcLen > 0) {
+ SetFlag(statePtr, CHANNEL_NEED_MORE_DATA);
+ }
+ return -1;
+ }
+
+ /*
+ * Space is made at the beginning of the buffer to copy the
+ * previous unused bytes there. Check first if the buffer we
+ * are using actually has enough space at its beginning for
+ * the data we are copying. Because if not we will write over
+ * the buffer management information, especially the 'nextPtr'.
+ *
+ * Note that the BUFFER_PADDING (See AllocChannelBuffer) is
+ * used to prevent exactly this situation. I.e. it should never
+ * happen. Therefore it is ok to panic should it happen despite
+ * the precautions.
+ */
+
+ if (nextPtr->nextRemoved - srcLen < 0) {
+ Tcl_Panic("Buffer Underflow, BUFFER_PADDING not enough");
+ }
+
+ nextPtr->nextRemoved -= srcLen;
+ memcpy(RemovePoint(nextPtr), src, (size_t) srcLen);
+ RecycleBuffer(statePtr, bufPtr, 0);
+ statePtr->inQueueHead = nextPtr;
+ return ReadChars(statePtr, objPtr, charsToRead,
+ offsetPtr, factorPtr);
}
statePtr->inputEncodingFlags &= ~TCL_ENCODING_START;
@@ -5481,7 +5634,6 @@ ReadChars(
*factorPtr = dstWrote * UTF_EXPANSION_FACTOR / srcRead;
}
*offsetPtr += dstWrote;
-//fprintf(stdout, "OK: %d\n", numChars); fflush(stdout);
return numChars;
}