From 55688af5e97888536930067e5e23a686661a0b1c Mon Sep 17 00:00:00 2001 From: dgp Date: Sat, 15 Nov 2014 21:08:57 +0000 Subject: Tcl_ExternalToUtf appends a terminating NUL to its encoded results. Perhaps this is a welcome convenience for some callers, but not for Tcl's I/O system, which has no need for that. Added a new flag value TCL_ENCODING_NO_TERMINATE that callers can use to suppress this behavior. This means buffers don't require so much padding, and a tiny bit of processing is saved. Update I/O callers to use the feature. --- generic/tcl.h | 17 +++++++++++++++++ generic/tclEncoding.c | 22 +++++++++++++++------- generic/tclIO.c | 44 +++++++++++++++++++++++--------------------- 3 files changed, 55 insertions(+), 28 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index fc477f2..95f2b3f 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2144,11 +2144,28 @@ typedef struct Tcl_EncodingType { * substituting one or more "close" characters in * the destination buffer and then continue to * convert the source. + * TCL_ENCODING_NO_TERMINATE - If set, Tcl_ExternalToUtf will not append a + * terminating NUL byte. Knowing that it will + * not need space to do so, it will fill all + * dstLen bytes with encoded UTF-8 content, as + * other circumstances permit. If clear, the + * default behavior is to reserve a byte in + * the dst space for NUL termination, and to + * append the NUL byte. + * TCL_ENCODING_CHAR_LIMIT - If set and dstCharsPtr is not NULL, then + * Tcl_ExternalToUtf takes the initial value + * of *dstCharsPtr is taken as a limit of the + * maximum number of chars to produce in the + * encoded UTF-8 content. Otherwise, the + * number of chars produced is controlled only + * by other limiting factors. */ #define TCL_ENCODING_START 0x01 #define TCL_ENCODING_END 0x02 #define TCL_ENCODING_STOPONERROR 0x04 +#define TCL_ENCODING_NO_TERMINATE 0x08 +#define TCL_ENCODING_CHAR_LIMIT 0x10 /* * The following definitions are the error codes returned by the conversion diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index d246cb2..0446816 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -1204,6 +1204,7 @@ Tcl_ExternalToUtf( { const Encoding *encodingPtr; int result, srcRead, dstWrote, dstChars; + int noTerminate = flags & TCL_ENCODING_NO_TERMINATE; Tcl_EncodingState state; if (encoding == NULL) { @@ -1230,17 +1231,24 @@ Tcl_ExternalToUtf( dstCharsPtr = &dstChars; } - /* - * If there are any null characters in the middle of the buffer, they will - * converted to the UTF-8 null character (\xC080). To get the actual \0 at - * the end of the destination buffer, we need to append it manually. - */ + if (!noTerminate) { + /* + * If there are any null characters in the middle of the buffer, + * they will converted to the UTF-8 null character (\xC080). To get + * the actual \0 at the end of the destination buffer, we need to + * append it manually. First make room for it... + */ - dstLen--; + dstLen--; + } result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); - dst[*dstWrotePtr] = '\0'; + if (!noTerminate) { + /* ...and then append it */ + + dst[*dstWrotePtr] = '\0'; + } return result; } diff --git a/generic/tclIO.c b/generic/tclIO.c index 2025742..b759c0e 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -4578,14 +4578,14 @@ Tcl_GetsObj( * Skip the raw bytes that make up the '\n'. */ - char tmp[1 + TCL_UTF_MAX]; + char tmp[TCL_UTF_MAX]; int rawRead; bufPtr = gs.bufPtr; Tcl_ExternalToUtf(NULL, gs.encoding, RemovePoint(bufPtr), - gs.rawRead, statePtr->inputEncodingFlags, - &gs.state, tmp, 1 + TCL_UTF_MAX, &rawRead, NULL, - NULL); + gs.rawRead, statePtr->inputEncodingFlags + | TCL_ENCODING_NO_TERMINATE, &gs.state, tmp, + TCL_UTF_MAX, &rawRead, NULL, NULL); bufPtr->nextRemoved += rawRead; gs.rawRead -= rawRead; gs.bytesWrote--; @@ -4686,8 +4686,9 @@ Tcl_GetsObj( } statePtr->inputEncodingState = gs.state; Tcl_ExternalToUtf(NULL, gs.encoding, RemovePoint(bufPtr), gs.rawRead, - statePtr->inputEncodingFlags, &statePtr->inputEncodingState, dst, - eol - dst + skip + TCL_UTF_MAX, &gs.rawRead, NULL, + statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE, + &statePtr->inputEncodingState, dst, + eol - dst + skip + TCL_UTF_MAX - 1, &gs.rawRead, NULL, &gs.charsWrote); bufPtr->nextRemoved += gs.rawRead; @@ -5219,9 +5220,9 @@ FilterInputBytes( } gsPtr->state = statePtr->inputEncodingState; result = Tcl_ExternalToUtf(NULL, gsPtr->encoding, raw, rawLen, - statePtr->inputEncodingFlags, &statePtr->inputEncodingState, - dst, spaceLeft+1, &gsPtr->rawRead, &gsPtr->bytesWrote, - &gsPtr->charsWrote); + statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE, + &statePtr->inputEncodingState, dst, spaceLeft, &gsPtr->rawRead, + &gsPtr->bytesWrote, &gsPtr->charsWrote); /* * Make sure that if we go through 'gets', that we reset the @@ -5975,7 +5976,7 @@ ReadChars( * a consistent set of results. This takes the shape of a loop. */ - dstLimit = dstNeeded + 1; + dstLimit = dstNeeded; while (1) { int dstDecoded, dstRead, dstWrote, srcRead, numChars; @@ -5985,9 +5986,10 @@ ReadChars( */ int code = Tcl_ExternalToUtf(NULL, encoding, src, srcLen, - statePtr->inputEncodingFlags & (bufPtr->nextPtr - ? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState, - dst, dstLimit, &srcRead, &dstDecoded, &numChars); + (statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE) + & (bufPtr->nextPtr ? ~0 : ~TCL_ENCODING_END), + &statePtr->inputEncodingState, dst, dstLimit, &srcRead, + &dstDecoded, &numChars); /* * Perform the translation transformation in place. Read no more @@ -6050,7 +6052,7 @@ ReadChars( * time. */ - dstLimit = dstRead + TCL_UTF_MAX; + dstLimit = dstRead - 1 + TCL_UTF_MAX; statePtr->flags = savedFlags; statePtr->inputEncodingFlags = savedIEFlags; statePtr->inputEncodingState = savedState; @@ -6076,7 +6078,7 @@ ReadChars( * up back here in this call. */ - dstLimit = dstRead + TCL_UTF_MAX; + dstLimit = dstRead - 1 + TCL_UTF_MAX; statePtr->flags = savedFlags; statePtr->inputEncodingFlags = savedIEFlags; statePtr->inputEncodingState = savedState; @@ -6093,7 +6095,7 @@ ReadChars( */ if (code != TCL_OK) { - char buffer[TCL_UTF_MAX + 2]; + char buffer[TCL_UTF_MAX + 1]; int read, decoded, count; /* @@ -6105,9 +6107,10 @@ ReadChars( statePtr->inputEncodingState = savedState; Tcl_ExternalToUtf(NULL, encoding, src, srcLen, - statePtr->inputEncodingFlags & (bufPtr->nextPtr - ? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState, - buffer, TCL_UTF_MAX + 2, &read, &decoded, &count); + (statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE) + & (bufPtr->nextPtr ? ~0 : ~TCL_ENCODING_END), + &statePtr->inputEncodingState, buffer, TCL_UTF_MAX + 1, + &read, &decoded, &count); if (count == 2) { if (buffer[1] == '\n') { @@ -6119,7 +6122,6 @@ ReadChars( bufPtr->nextRemoved += srcRead; } - dst[1] = '\0'; statePtr->inputEncodingFlags &= ~TCL_ENCODING_START; Tcl_SetObjLength(objPtr, numBytes + 1); @@ -6166,7 +6168,7 @@ ReadChars( * Tcl_ExternalToUtf() call! */ - dstLimit = Tcl_UtfAtIndex(dst, charsToRead) + TCL_UTF_MAX - dst; + dstLimit = Tcl_UtfAtIndex(dst, charsToRead) - 1 + TCL_UTF_MAX - dst; statePtr->flags = savedFlags; statePtr->inputEncodingFlags = savedIEFlags; statePtr->inputEncodingState = savedState; -- cgit v0.12