From cc8d3df1e0c6eae3db26683ca674c59a7e8a8080 Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Sat, 15 Nov 2014 21:08:57 +0000 Subject: Tcl_ExternalToUtf appends a terminating NUL to its encoded results. Perhaps this is a welcome convenience for some callers, but not for Tcl's I/O system, which has no need for that. Added a new flag value TCL_ENCODING_NO_TERMINATE that callers can use to suppress this behavior. This means buffers don't require so much padding, and a tiny bit of processing is saved. Update I/O callers to use the feature. --- generic/tcl.h | 17 +++++++++++++++++ generic/tclEncoding.c | 22 +++++++++++++++------- generic/tclIO.c | 44 +++++++++++++++++++++++--------------------- 3 files changed, 55 insertions(+), 28 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index fc477f2..95f2b3f 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2144,11 +2144,28 @@ typedef struct Tcl_EncodingType { * substituting one or more "close" characters in * the destination buffer and then continue to * convert the source. + * TCL_ENCODING_NO_TERMINATE - If set, Tcl_ExternalToUtf will not append a + * terminating NUL byte. Knowing that it will + * not need space to do so, it will fill all + * dstLen bytes with encoded UTF-8 content, as + * other circumstances permit. If clear, the + * default behavior is to reserve a byte in + * the dst space for NUL termination, and to + * append the NUL byte. + * TCL_ENCODING_CHAR_LIMIT - If set and dstCharsPtr is not NULL, then + * Tcl_ExternalToUtf takes the initial value + * of *dstCharsPtr is taken as a limit of the + * maximum number of chars to produce in the + * encoded UTF-8 content. Otherwise, the + * number of chars produced is controlled only + * by other limiting factors. */ #define TCL_ENCODING_START 0x01 #define TCL_ENCODING_END 0x02 #define TCL_ENCODING_STOPONERROR 0x04 +#define TCL_ENCODING_NO_TERMINATE 0x08 +#define TCL_ENCODING_CHAR_LIMIT 0x10 /* * The following definitions are the error codes returned by the conversion diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index d246cb2..0446816 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -1204,6 +1204,7 @@ Tcl_ExternalToUtf( { const Encoding *encodingPtr; int result, srcRead, dstWrote, dstChars; + int noTerminate = flags & TCL_ENCODING_NO_TERMINATE; Tcl_EncodingState state; if (encoding == NULL) { @@ -1230,17 +1231,24 @@ Tcl_ExternalToUtf( dstCharsPtr = &dstChars; } - /* - * If there are any null characters in the middle of the buffer, they will - * converted to the UTF-8 null character (\xC080). To get the actual \0 at - * the end of the destination buffer, we need to append it manually. - */ + if (!noTerminate) { + /* + * If there are any null characters in the middle of the buffer, + * they will converted to the UTF-8 null character (\xC080). To get + * the actual \0 at the end of the destination buffer, we need to + * append it manually. First make room for it... + */ - dstLen--; + dstLen--; + } result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); - dst[*dstWrotePtr] = '\0'; + if (!noTerminate) { + /* ...and then append it */ + + dst[*dstWrotePtr] = '\0'; + } return result; } diff --git a/generic/tclIO.c b/generic/tclIO.c index 2025742..b759c0e 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -4578,14 +4578,14 @@ Tcl_GetsObj( * Skip the raw bytes that make up the '\n'. */ - char tmp[1 + TCL_UTF_MAX]; + char tmp[TCL_UTF_MAX]; int rawRead; bufPtr = gs.bufPtr; Tcl_ExternalToUtf(NULL, gs.encoding, RemovePoint(bufPtr), - gs.rawRead, statePtr->inputEncodingFlags, - &gs.state, tmp, 1 + TCL_UTF_MAX, &rawRead, NULL, - NULL); + gs.rawRead, statePtr->inputEncodingFlags + | TCL_ENCODING_NO_TERMINATE, &gs.state, tmp, + TCL_UTF_MAX, &rawRead, NULL, NULL); bufPtr->nextRemoved += rawRead; gs.rawRead -= rawRead; gs.bytesWrote--; @@ -4686,8 +4686,9 @@ Tcl_GetsObj( } statePtr->inputEncodingState = gs.state; Tcl_ExternalToUtf(NULL, gs.encoding, RemovePoint(bufPtr), gs.rawRead, - statePtr->inputEncodingFlags, &statePtr->inputEncodingState, dst, - eol - dst + skip + TCL_UTF_MAX, &gs.rawRead, NULL, + statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE, + &statePtr->inputEncodingState, dst, + eol - dst + skip + TCL_UTF_MAX - 1, &gs.rawRead, NULL, &gs.charsWrote); bufPtr->nextRemoved += gs.rawRead; @@ -5219,9 +5220,9 @@ FilterInputBytes( } gsPtr->state = statePtr->inputEncodingState; result = Tcl_ExternalToUtf(NULL, gsPtr->encoding, raw, rawLen, - statePtr->inputEncodingFlags, &statePtr->inputEncodingState, - dst, spaceLeft+1, &gsPtr->rawRead, &gsPtr->bytesWrote, - &gsPtr->charsWrote); + statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE, + &statePtr->inputEncodingState, dst, spaceLeft, &gsPtr->rawRead, + &gsPtr->bytesWrote, &gsPtr->charsWrote); /* * Make sure that if we go through 'gets', that we reset the @@ -5975,7 +5976,7 @@ ReadChars( * a consistent set of results. This takes the shape of a loop. */ - dstLimit = dstNeeded + 1; + dstLimit = dstNeeded; while (1) { int dstDecoded, dstRead, dstWrote, srcRead, numChars; @@ -5985,9 +5986,10 @@ ReadChars( */ int code = Tcl_ExternalToUtf(NULL, encoding, src, srcLen, - statePtr->inputEncodingFlags & (bufPtr->nextPtr - ? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState, - dst, dstLimit, &srcRead, &dstDecoded, &numChars); + (statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE) + & (bufPtr->nextPtr ? ~0 : ~TCL_ENCODING_END), + &statePtr->inputEncodingState, dst, dstLimit, &srcRead, + &dstDecoded, &numChars); /* * Perform the translation transformation in place. Read no more @@ -6050,7 +6052,7 @@ ReadChars( * time. */ - dstLimit = dstRead + TCL_UTF_MAX; + dstLimit = dstRead - 1 + TCL_UTF_MAX; statePtr->flags = savedFlags; statePtr->inputEncodingFlags = savedIEFlags; statePtr->inputEncodingState = savedState; @@ -6076,7 +6078,7 @@ ReadChars( * up back here in this call. */ - dstLimit = dstRead + TCL_UTF_MAX; + dstLimit = dstRead - 1 + TCL_UTF_MAX; statePtr->flags = savedFlags; statePtr->inputEncodingFlags = savedIEFlags; statePtr->inputEncodingState = savedState; @@ -6093,7 +6095,7 @@ ReadChars( */ if (code != TCL_OK) { - char buffer[TCL_UTF_MAX + 2]; + char buffer[TCL_UTF_MAX + 1]; int read, decoded, count; /* @@ -6105,9 +6107,10 @@ ReadChars( statePtr->inputEncodingState = savedState; Tcl_ExternalToUtf(NULL, encoding, src, srcLen, - statePtr->inputEncodingFlags & (bufPtr->nextPtr - ? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState, - buffer, TCL_UTF_MAX + 2, &read, &decoded, &count); + (statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE) + & (bufPtr->nextPtr ? ~0 : ~TCL_ENCODING_END), + &statePtr->inputEncodingState, buffer, TCL_UTF_MAX + 1, + &read, &decoded, &count); if (count == 2) { if (buffer[1] == '\n') { @@ -6119,7 +6122,6 @@ ReadChars( bufPtr->nextRemoved += srcRead; } - dst[1] = '\0'; statePtr->inputEncodingFlags &= ~TCL_ENCODING_START; Tcl_SetObjLength(objPtr, numBytes + 1); @@ -6166,7 +6168,7 @@ ReadChars( * Tcl_ExternalToUtf() call! */ - dstLimit = Tcl_UtfAtIndex(dst, charsToRead) + TCL_UTF_MAX - dst; + dstLimit = Tcl_UtfAtIndex(dst, charsToRead) - 1 + TCL_UTF_MAX - dst; statePtr->flags = savedFlags; statePtr->inputEncodingFlags = savedIEFlags; statePtr->inputEncodingState = savedState; -- cgit v0.12 From ee0a84dae56b7570f8065a273a8408e201c8b449 Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Sat, 15 Nov 2014 21:13:52 +0000 Subject: With no padding needed for a terminating NUL, there no need for a distinction between the dstNeeded and dstLimit values. --- generic/tclIO.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index b759c0e..f8b9bfa 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -5929,7 +5929,7 @@ ReadChars( int savedIEFlags = statePtr->inputEncodingFlags; int savedFlags = statePtr->flags; char *dst, *src = RemovePoint(bufPtr); - int dstLimit, numBytes, srcLen = BytesLeft(bufPtr); + int numBytes, srcLen = BytesLeft(bufPtr); /* * One src byte can yield at most one character. So when the @@ -5948,14 +5948,14 @@ ReadChars( */ int factor = *factorPtr; - int dstNeeded = TCL_UTF_MAX - 1 + toRead * factor / UTF_EXPANSION_FACTOR; + int dstLimit = TCL_UTF_MAX - 1 + toRead * factor / UTF_EXPANSION_FACTOR; (void) TclGetStringFromObj(objPtr, &numBytes); - Tcl_AppendToObj(objPtr, NULL, dstNeeded); + Tcl_AppendToObj(objPtr, NULL, dstLimit); if (toRead == srcLen) { unsigned int size; dst = TclGetStringStorage(objPtr, &size) + numBytes; - dstNeeded = size - numBytes; + dstLimit = size - numBytes; } else { dst = TclGetString(objPtr) + numBytes; } @@ -5976,7 +5976,6 @@ ReadChars( * a consistent set of results. This takes the shape of a loop. */ - dstLimit = dstNeeded; while (1) { int dstDecoded, dstRead, dstWrote, srcRead, numChars; -- cgit v0.12 From 20c444ceb73da2f79bedcc1056b43ebc29097ac0 Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Sat, 15 Nov 2014 21:46:19 +0000 Subject: Add to Tcl_ExternalToUtf() a capability to impose a limit on the number of chars produce in the encoding result. When the flag TCL_ENCODING_CHAR_LIMIT is set and dstCharsPtr is not NULL, then the initial value of *dstCharsPtr is taken as the max number of chars to produce. The limit is imposed in a way that does not require the assistance of the encoding's driver procs, but the flag is passed on to them in case they can do better when they know they should. No callers updated yet. No drivers updated yet. One difficulty is that this necessarily imposes a pre-translation limit, and the I/O system has a history of wanting to impose only a post-translation limit. --- generic/tclEncoding.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 0446816..f33e0e6 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -1203,8 +1203,10 @@ Tcl_ExternalToUtf( * output buffer. */ { const Encoding *encodingPtr; - int result, srcRead, dstWrote, dstChars; + int result, srcRead, dstWrote, dstChars = 0; int noTerminate = flags & TCL_ENCODING_NO_TERMINATE; + int charLimited = (flags & TCL_ENCODING_CHAR_LIMIT) && dstCharsPtr; + int maxChars = INT_MAX; Tcl_EncodingState state; if (encoding == NULL) { @@ -1229,6 +1231,9 @@ Tcl_ExternalToUtf( } if (dstCharsPtr == NULL) { dstCharsPtr = &dstChars; + flags &= ~TCL_ENCODING_CHAR_LIMIT; + } else if (charLimited) { + maxChars = *dstCharsPtr; } if (!noTerminate) { @@ -1241,9 +1246,20 @@ Tcl_ExternalToUtf( dstLen--; } - result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, - flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, - dstCharsPtr); + do { + int savedFlags = flags; + Tcl_EncodingState savedState = *statePtr; + + result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, + flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, + dstCharsPtr); + if (*dstCharsPtr <= maxChars) { + break; + } + dstLen = Tcl_UtfAtIndex(dst, maxChars) - 1 - dst + TCL_UTF_MAX; + flags = savedFlags; + *statePtr = savedState; + } while (1); if (!noTerminate) { /* ...and then append it */ -- cgit v0.12 From 25c010db735cbf6e820482b7b62bc85833f85a41 Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Tue, 23 Dec 2014 16:57:02 +0000 Subject: Use the new TCL_ENCODING_CHAR_LIMIT flag to have the encoding system manage the max chars to read constraint. --- generic/tclIO.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/generic/tclIO.c b/generic/tclIO.c index 79aa667..9bbf2a6 100644 --- a/generic/tclIO.c +++ b/generic/tclIO.c @@ -5977,16 +5977,21 @@ ReadChars( */ while (1) { - int dstDecoded, dstRead, dstWrote, srcRead, numChars; + int dstDecoded, dstRead, dstWrote, srcRead, numChars, code; + int flags = statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE; + + if (charsToRead > 0) { + flags |= TCL_ENCODING_CHAR_LIMIT; + numChars = charsToRead; + } /* * Perform the encoding transformation. Read no more than * srcLen bytes, write no more than dstLimit bytes. */ - int code = Tcl_ExternalToUtf(NULL, encoding, src, srcLen, - (statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE) - & (bufPtr->nextPtr ? ~0 : ~TCL_ENCODING_END), + code = Tcl_ExternalToUtf(NULL, encoding, src, srcLen, + flags & (bufPtr->nextPtr ? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState, dst, dstLimit, &srcRead, &dstDecoded, &numChars); @@ -6161,6 +6166,8 @@ ReadChars( if (charsToRead > 0 && numChars > charsToRead) { /* + * TODO: This cannot happen anymore. + * * We read more chars than allowed. Reset limits to * prevent that and try again. Don't forget the extra * padding of TCL_UTF_MAX bytes demanded by the -- cgit v0.12 From ba3df90803cf86284bd11f8dc99881e0d29ee5be Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Tue, 23 Dec 2014 17:13:13 +0000 Subject: Support TCL_ENCODING_CHAR_LIMIT in the BinaryProc driver. --- generic/tclEncoding.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 2a766d1..5c5254b 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2131,6 +2131,9 @@ BinaryProc( if (dstLen < 0) { dstLen = 0; } + if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) { + srcLen = *dstCharsPtr; + } if (srcLen > dstLen) { srcLen = dstLen; result = TCL_CONVERT_NOSPACE; -- cgit v0.12 From 0c8d6f5fda8fb777ac700ee3eb67d9982bb0d94c Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Tue, 23 Dec 2014 17:26:07 +0000 Subject: Support TCL_ENCODING_CHAR_LIMIT in the UtfToUtfProc driver. --- generic/tclEncoding.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 5c5254b..f92cabb 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2294,7 +2294,7 @@ UtfToUtfProc( { const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; - int result, numChars; + int result, numChars, charLimit = INT_MAX; Tcl_UniChar ch; result = TCL_OK; @@ -2305,11 +2305,14 @@ UtfToUtfProc( if ((flags & TCL_ENCODING_END) == 0) { srcClose -= TCL_UTF_MAX; } + if (flags & TCL_ENCODING_CHAR_LIMIT) { + charLimit = *dstCharsPtr; + } dstStart = dst; dstEnd = dst + dstLen - TCL_UTF_MAX; - for (numChars = 0; src < srcEnd; numChars++) { + for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the -- cgit v0.12 From 2201e6a76496ac7ba253481e7d26618d85cde429 Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Tue, 23 Dec 2014 17:46:05 +0000 Subject: Support TCL_ENCODING_CHAR_LIMIT in the UnicodeToUtfProc driver. --- generic/tclEncoding.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index f92cabb..0fe224e 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2408,9 +2408,12 @@ UnicodeToUtfProc( { const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; - int result, numChars; + int result, numChars, charLimit = INT_MAX; Tcl_UniChar ch; + if (flags & TCL_ENCODING_CHAR_LIMIT) { + charLimit = *dstCharsPtr; + } result = TCL_OK; if ((srcLen % sizeof(Tcl_UniChar)) != 0) { result = TCL_CONVERT_MULTIBYTE; @@ -2424,7 +2427,7 @@ UnicodeToUtfProc( dstStart = dst; dstEnd = dst + dstLen - TCL_UTF_MAX; - for (numChars = 0; src < srcEnd; numChars++) { + for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; -- cgit v0.12 From ebb68b687dd512a7cbd3ea439be0e064a810dbf6 Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Tue, 23 Dec 2014 17:48:36 +0000 Subject: Support TCL_ENCODING_CHAR_LIMIT in the Iso88591ToUtfProc driver. --- generic/tclEncoding.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 0fe224e..3df720d 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2826,8 +2826,11 @@ Iso88591ToUtfProc( { const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; - int result, numChars; + int result, numChars, charLimit = INT_MAX; + if (flags & TCL_ENCODING_CHAR_LIMIT) { + charLimit = *dstCharsPtr; + } srcStart = src; srcEnd = src + srcLen; @@ -2835,7 +2838,7 @@ Iso88591ToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; result = TCL_OK; - for (numChars = 0; src < srcEnd; numChars++) { + for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { Tcl_UniChar ch; if (dst > dstEnd) { -- cgit v0.12 From af48d8bf01566f64a168a6c4ee5bdb5ebfd359fd Mon Sep 17 00:00:00 2001 From: "dgp@users.sourceforge.net" Date: Tue, 23 Dec 2014 17:53:21 +0000 Subject: Support TCL_ENCODING_CHAR_LIMIT in TableToUtfProc and EscapeToUtfProc drivers. --- generic/tclEncoding.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 3df720d..179ca17 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2595,12 +2595,15 @@ TableToUtfProc( { const char *srcStart, *srcEnd; const char *dstEnd, *dstStart, *prefixBytes; - int result, byte, numChars; + int result, byte, numChars, charLimit = INT_MAX; Tcl_UniChar ch; const unsigned short *const *toUnicode; const unsigned short *pageZero; TableEncodingData *dataPtr = clientData; + if (flags & TCL_ENCODING_CHAR_LIMIT) { + charLimit = *dstCharsPtr; + } srcStart = src; srcEnd = src + srcLen; @@ -2612,7 +2615,7 @@ TableToUtfProc( pageZero = toUnicode[0]; result = TCL_OK; - for (numChars = 0; src < srcEnd; numChars++) { + for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; @@ -3054,9 +3057,12 @@ EscapeToUtfProc( const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd; const unsigned short *const *tableToUnicode; const Encoding *encodingPtr; - int state, result, numChars; + int state, result, numChars, charLimit = INT_MAX; const char *dstStart, *dstEnd; + if (flags & TCL_ENCODING_CHAR_LIMIT) { + charLimit = *dstCharsPtr; + } result = TCL_OK; tablePrefixBytes = NULL; /* lint. */ tableToUnicode = NULL; /* lint. */ @@ -3074,7 +3080,7 @@ EscapeToUtfProc( state = 0; } - for (numChars = 0; src < srcEnd; ) { + for (numChars = 0; src < srcEnd && numChars <= charLimit; ) { int byte, hi, lo, ch; if (dst > dstEnd) { -- cgit v0.12