Add two new (undocumented) flags to the Tcl_ExternalToUtf() interface.

TCL_ENCODING_NO_TERMINATE suppresses the default behavior of appending a terminating NUL byte to the produced Utf output. This permits use of all of the dstLen bytes provided, and simplifies the buffer size calculations demanded from callers. Perhaps some callers need or appreciate this default behavior, but for Tcl's own main use of encodings - conversions within I/O - this just gets in the way. TCL_ENCODING_CHAR_LIMIT lets the caller set a limit on the number of chars to be output, to be enforced by the encoding routines themselves. Without this, callers have to check after the fact for going beyond limits and make multiple encoding calls in a trial and error approach. Full compatibility is supported. No defaults are changed, and the flags have their effect even if an encoding driver has not been written to support these flags (but greater efficiency is enjoyed if it has!). All of Tcl's own encoding drivers are updated to support this. Other encoding drivers may exist somewhere, but I cannot point to any. A TIP to document this and make it officially supported may come in time.
author: dgp <dgp@users.sourceforge.net> 2014-12-23 18:39:35 (GMT)
committer: dgp <dgp@users.sourceforge.net> 2014-12-23 18:39:35 (GMT)
commit: c826f6d268355204eeef4506cec6a8948cc290a3 (patch)
tree: 94a25cd0a1f31aa5532c59bd74202b7469ab8311
parent: dd4496550090acf79f5e144602c745533549eea5 (diff)
parent: efce992c9a86e2e12f0142274afa9d58012b5870 (diff)
download: tcl-c826f6d268355204eeef4506cec6a8948cc290a3.zip
tcl-c826f6d268355204eeef4506cec6a8948cc290a3.tar.gz
tcl-c826f6d268355204eeef4506cec6a8948cc290a3.tar.bz2
3 files changed, 115 insertions, 48 deletions
diff --git a/generic/tcl.h b/generic/tcl.h
index fc477f2..95f2b3f 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -2144,11 +2144,28 @@ typedef struct Tcl_EncodingType {
  *				substituting one or more "close" characters in
  *				the destination buffer and then continue to
  *				convert the source.
+ * TCL_ENCODING_NO_TERMINATE - 	If set, Tcl_ExternalToUtf will not append a
+ *				terminating NUL byte.  Knowing that it will
+ *				not need space to do so, it will fill all
+ *				dstLen bytes with encoded UTF-8 content, as
+ *				other circumstances permit.  If clear, the
+ *				default behavior is to reserve a byte in
+ *				the dst space for NUL termination, and to
+ *				append the NUL byte.
+ * TCL_ENCODING_CHAR_LIMIT -	If set and dstCharsPtr is not NULL, then
+ *				Tcl_ExternalToUtf takes the initial value
+ *				of *dstCharsPtr as a limit on the maximum
+ *				number of chars to produce in the encoded
+ *				UTF-8 content.  Otherwise, the number of
+ *				chars produced is controlled only by other
+ *				limiting factors.
  */
 
 #define TCL_ENCODING_START		0x01
 #define TCL_ENCODING_END		0x02
 #define TCL_ENCODING_STOPONERROR	0x04
+#define TCL_ENCODING_NO_TERMINATE	0x08
+#define TCL_ENCODING_CHAR_LIMIT		0x10
 
 /*
  * The following definitions are the error codes returned by the conversion
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 95c59c0..179ca17 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -1206,7 +1206,10 @@ Tcl_ExternalToUtf(
 				 * output buffer. */
 {
     const Encoding *encodingPtr;
-    int result, srcRead, dstWrote, dstChars;
+    int result, srcRead, dstWrote, dstChars = 0;
+    int noTerminate = flags & TCL_ENCODING_NO_TERMINATE;
+    int charLimited = (flags & TCL_ENCODING_CHAR_LIMIT) && dstCharsPtr;
+    int maxChars = INT_MAX;
     Tcl_EncodingState state;
 
     if (encoding == NULL) {
@@ -1231,19 +1234,40 @@ Tcl_ExternalToUtf(
     }
     if (dstCharsPtr == NULL) {
 	dstCharsPtr = &dstChars;
+	flags &= ~TCL_ENCODING_CHAR_LIMIT;
+    } else if (charLimited) {
+	maxChars = *dstCharsPtr;
     }
 
-    /*
-     * If there are any null characters in the middle of the buffer, they will
-     * converted to the UTF-8 null character (\xC080). To get the actual \0 at
-     * the end of the destination buffer, we need to append it manually.
-     */
+    if (!noTerminate) {
+	/*
+	 * If there are any null characters in the middle of the buffer,
+	 * they will be converted to the UTF-8 null character (\xC080). To get
+	 * the actual \0 at the end of the destination buffer, we need to
+	 * append it manually.  First make room for it...
+	 */
 
-    dstLen--;
-    result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
-	    flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
-	    dstCharsPtr);
-    dst[*dstWrotePtr] = '\0';
+	dstLen--;
+    }
+    do {
+	int savedFlags = flags;
+	Tcl_EncodingState savedState = *statePtr;
+
+	result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
+		flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
+		dstCharsPtr);
+	if (*dstCharsPtr <= maxChars) {
+	    break;
+	}
+	dstLen = Tcl_UtfAtIndex(dst, maxChars) - 1 - dst + TCL_UTF_MAX;
+	flags = savedFlags;
+	*statePtr = savedState;
+    } while (1);
+    if (!noTerminate) {
+	/* ...and then append it */
+
+	dst[*dstWrotePtr] = '\0';
+    }
 
     return result;
 }
@@ -2107,6 +2131,9 @@ BinaryProc(
     if (dstLen < 0) {
 	dstLen = 0;
     }
+    if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) {
+	srcLen = *dstCharsPtr;
+    }
     if (srcLen > dstLen) {
 	srcLen = dstLen;
 	result = TCL_CONVERT_NOSPACE;
@@ -2267,7 +2294,7 @@ UtfToUtfProc(
 {
     const char *srcStart, *srcEnd, *srcClose;
     const char *dstStart, *dstEnd;
-    int result, numChars;
+    int result, numChars, charLimit = INT_MAX;
     Tcl_UniChar ch;
 
     result = TCL_OK;
@@ -2278,11 +2305,14 @@ UtfToUtfProc(
     if ((flags & TCL_ENCODING_END) == 0) {
 	srcClose -= TCL_UTF_MAX;
     }
+    if (flags & TCL_ENCODING_CHAR_LIMIT) {
+	charLimit = *dstCharsPtr;
+    }
 
     dstStart = dst;
     dstEnd = dst + dstLen - TCL_UTF_MAX;
 
-    for (numChars = 0; src < srcEnd; numChars++) {
+    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
 	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
 	    /*
 	     * If there is more string to follow, this will ensure that the
@@ -2378,9 +2408,12 @@ UnicodeToUtfProc(
 {
     const char *srcStart, *srcEnd;
     const char *dstEnd, *dstStart;
-    int result, numChars;
+    int result, numChars, charLimit = INT_MAX;
     Tcl_UniChar ch;
 
+    if (flags & TCL_ENCODING_CHAR_LIMIT) {
+	charLimit = *dstCharsPtr;
+    }
     result = TCL_OK;
     if ((srcLen % sizeof(Tcl_UniChar)) != 0) {
 	result = TCL_CONVERT_MULTIBYTE;
@@ -2394,7 +2427,7 @@ UnicodeToUtfProc(
     dstStart = dst;
     dstEnd = dst + dstLen - TCL_UTF_MAX;
 
-    for (numChars = 0; src < srcEnd; numChars++) {
+    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
 	if (dst > dstEnd) {
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
@@ -2562,12 +2595,15 @@ TableToUtfProc(
 {
     const char *srcStart, *srcEnd;
     const char *dstEnd, *dstStart, *prefixBytes;
-    int result, byte, numChars;
+    int result, byte, numChars, charLimit = INT_MAX;
     Tcl_UniChar ch;
     const unsigned short *const *toUnicode;
     const unsigned short *pageZero;
     TableEncodingData *dataPtr = clientData;
 
+    if (flags & TCL_ENCODING_CHAR_LIMIT) {
+	charLimit = *dstCharsPtr;
+    }
     srcStart = src;
     srcEnd = src + srcLen;
 
@@ -2579,7 +2615,7 @@ TableToUtfProc(
     pageZero = toUnicode[0];
 
     result = TCL_OK;
-    for (numChars = 0; src < srcEnd; numChars++) {
+    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
 	if (dst > dstEnd) {
 	    result = TCL_CONVERT_NOSPACE;
 	    break;
@@ -2793,8 +2829,11 @@ Iso88591ToUtfProc(
 {
     const char *srcStart, *srcEnd;
     const char *dstEnd, *dstStart;
-    int result, numChars;
+    int result, numChars, charLimit = INT_MAX;
 
+    if (flags & TCL_ENCODING_CHAR_LIMIT) {
+	charLimit = *dstCharsPtr;
+    }
     srcStart = src;
     srcEnd = src + srcLen;
 
@@ -2802,7 +2841,7 @@ Iso88591ToUtfProc(
     dstEnd = dst + dstLen - TCL_UTF_MAX;
 
     result = TCL_OK;
-    for (numChars = 0; src < srcEnd; numChars++) {
+    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
 	Tcl_UniChar ch;
 
 	if (dst > dstEnd) {
@@ -3018,9 +3057,12 @@ EscapeToUtfProc(
     const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd;
     const unsigned short *const *tableToUnicode;
     const Encoding *encodingPtr;
-    int state, result, numChars;
+    int state, result, numChars, charLimit = INT_MAX;
     const char *dstStart, *dstEnd;
 
+    if (flags & TCL_ENCODING_CHAR_LIMIT) {
+	charLimit = *dstCharsPtr;
+    }
     result = TCL_OK;
     tablePrefixBytes = NULL;	/* lint. */
     tableToUnicode = NULL;	/* lint. */
@@ -3038,7 +3080,7 @@ EscapeToUtfProc(
 	state = 0;
     }
 
-    for (numChars = 0; src < srcEnd; ) {
+    for (numChars = 0; src < srcEnd && numChars <= charLimit; ) {
 	int byte, hi, lo, ch;
 
 	if (dst > dstEnd) {
diff --git a/generic/tclIO.c b/generic/tclIO.c
index 8a35aee..9bbf2a6 100644
--- a/generic/tclIO.c
+++ b/generic/tclIO.c
@@ -4578,14 +4578,14 @@ Tcl_GetsObj(
 		     * Skip the raw bytes that make up the '\n'.
 		     */
 
-		    char tmp[1 + TCL_UTF_MAX];
+		    char tmp[TCL_UTF_MAX];
 		    int rawRead;
 
 		    bufPtr = gs.bufPtr;
 		    Tcl_ExternalToUtf(NULL, gs.encoding, RemovePoint(bufPtr),
-			    gs.rawRead, statePtr->inputEncodingFlags,
-			    &gs.state, tmp, 1 + TCL_UTF_MAX, &rawRead, NULL,
-			    NULL);
+			    gs.rawRead, statePtr->inputEncodingFlags
+				| TCL_ENCODING_NO_TERMINATE, &gs.state, tmp,
+			    TCL_UTF_MAX, &rawRead, NULL, NULL);
 		    bufPtr->nextRemoved += rawRead;
 		    gs.rawRead -= rawRead;
 		    gs.bytesWrote--;
@@ -4686,8 +4686,9 @@ Tcl_GetsObj(
     }
     statePtr->inputEncodingState = gs.state;
     Tcl_ExternalToUtf(NULL, gs.encoding, RemovePoint(bufPtr), gs.rawRead,
-	    statePtr->inputEncodingFlags, &statePtr->inputEncodingState, dst,
-	    eol - dst + skip + TCL_UTF_MAX, &gs.rawRead, NULL,
+	    statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE,
+	    &statePtr->inputEncodingState, dst,
+	    eol - dst + skip + TCL_UTF_MAX - 1, &gs.rawRead, NULL,
 	    &gs.charsWrote);
     bufPtr->nextRemoved += gs.rawRead;
 
@@ -5219,9 +5220,9 @@ FilterInputBytes(
     }
     gsPtr->state = statePtr->inputEncodingState;
     result = Tcl_ExternalToUtf(NULL, gsPtr->encoding, raw, rawLen,
-	    statePtr->inputEncodingFlags, &statePtr->inputEncodingState,
-	    dst, spaceLeft+1, &gsPtr->rawRead, &gsPtr->bytesWrote,
-	    &gsPtr->charsWrote);
+	    statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE,
+	    &statePtr->inputEncodingState, dst, spaceLeft, &gsPtr->rawRead,
+	    &gsPtr->bytesWrote, &gsPtr->charsWrote);
 
     /*
      * Make sure that if we go through 'gets', that we reset the
@@ -5928,7 +5929,7 @@ ReadChars(
     int savedIEFlags = statePtr->inputEncodingFlags;
     int savedFlags = statePtr->flags;
     char *dst, *src = RemovePoint(bufPtr);
-    int dstLimit, numBytes, srcLen = BytesLeft(bufPtr);
+    int numBytes, srcLen = BytesLeft(bufPtr);
 
     /*
      * One src byte can yield at most one character.  So when the
@@ -5947,14 +5948,14 @@ ReadChars(
      */
     
     int factor = *factorPtr;
-    int dstNeeded = TCL_UTF_MAX - 1 + toRead * factor / UTF_EXPANSION_FACTOR;
+    int dstLimit = TCL_UTF_MAX - 1 + toRead * factor / UTF_EXPANSION_FACTOR;
 
     (void) TclGetStringFromObj(objPtr, &numBytes);
-    Tcl_AppendToObj(objPtr, NULL, dstNeeded);
+    Tcl_AppendToObj(objPtr, NULL, dstLimit);
     if (toRead == srcLen) {
 	unsigned int size;
 	dst = TclGetStringStorage(objPtr, &size) + numBytes;
-	dstNeeded = size - numBytes;
+	dstLimit = size - numBytes;
     } else {
 	dst = TclGetString(objPtr) + numBytes;
     }
@@ -5975,19 +5976,24 @@ ReadChars(
      * a consistent set of results.  This takes the shape of a loop.
      */
 
-    dstLimit = dstNeeded + 1;
     while (1) {
-	int dstDecoded, dstRead, dstWrote, srcRead, numChars;
+	int dstDecoded, dstRead, dstWrote, srcRead, numChars, code;
+	int flags = statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE;
+	
+	if (charsToRead > 0) {
+	    flags |= TCL_ENCODING_CHAR_LIMIT;
+	    numChars = charsToRead;
+	}
 
 	/*
 	 * Perform the encoding transformation.  Read no more than
 	 * srcLen bytes, write no more than dstLimit bytes.
 	 */
 
-	int code = Tcl_ExternalToUtf(NULL, encoding, src, srcLen,
-		statePtr->inputEncodingFlags & (bufPtr->nextPtr
-		? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState,
-		dst, dstLimit, &srcRead, &dstDecoded, &numChars);
+	code = Tcl_ExternalToUtf(NULL, encoding, src, srcLen,
+		flags & (bufPtr->nextPtr ? ~0 : ~TCL_ENCODING_END),
+		&statePtr->inputEncodingState, dst, dstLimit, &srcRead,
+		&dstDecoded, &numChars);
 
 	/*
 	 * Perform the translation transformation in place.  Read no more
@@ -6050,7 +6056,7 @@ ReadChars(
 		     * time.
 		     */
 
-		    dstLimit = dstRead + TCL_UTF_MAX;
+		    dstLimit = dstRead - 1 + TCL_UTF_MAX;
 		    statePtr->flags = savedFlags;
 		    statePtr->inputEncodingFlags = savedIEFlags;
 		    statePtr->inputEncodingState = savedState;
@@ -6076,7 +6082,7 @@ ReadChars(
 		 * up back here in this call.
 		 */
 
-		dstLimit = dstRead + TCL_UTF_MAX;
+		dstLimit = dstRead - 1 + TCL_UTF_MAX;
 		statePtr->flags = savedFlags;
 		statePtr->inputEncodingFlags = savedIEFlags;
 		statePtr->inputEncodingState = savedState;
@@ -6093,7 +6099,7 @@ ReadChars(
 	     */
 
 	    if (code != TCL_OK) {
-		char buffer[TCL_UTF_MAX + 2];
+		char buffer[TCL_UTF_MAX + 1];
 		int read, decoded, count;
 
 		/* 
@@ -6105,9 +6111,10 @@ ReadChars(
 		statePtr->inputEncodingState = savedState;
 
 		Tcl_ExternalToUtf(NULL, encoding, src, srcLen,
-		statePtr->inputEncodingFlags & (bufPtr->nextPtr
-		? ~0 : ~TCL_ENCODING_END), &statePtr->inputEncodingState,
-		buffer, TCL_UTF_MAX + 2, &read, &decoded, &count);
+		(statePtr->inputEncodingFlags | TCL_ENCODING_NO_TERMINATE)
+		& (bufPtr->nextPtr ? ~0 : ~TCL_ENCODING_END),
+		&statePtr->inputEncodingState, buffer, TCL_UTF_MAX + 1,
+		&read, &decoded, &count);
 
 		if (count == 2) {
 		    if (buffer[1] == '\n') {
@@ -6119,7 +6126,6 @@ ReadChars(
 			bufPtr->nextRemoved += srcRead;
 		    }
 
-		    dst[1] = '\0';
 		    statePtr->inputEncodingFlags &= ~TCL_ENCODING_START;
 
 		    Tcl_SetObjLength(objPtr, numBytes + 1);
@@ -6160,13 +6166,15 @@ ReadChars(
 	if (charsToRead > 0 && numChars > charsToRead) {
 
 	    /* 
+	     * TODO: This cannot happen anymore.
+	     *
 	     * We read more chars than allowed.  Reset limits to
 	     * prevent that and try again.  Don't forget the extra
 	     * padding of TCL_UTF_MAX bytes demanded by the
 	     * Tcl_ExternalToUtf() call!
 	     */
 
-	    dstLimit = Tcl_UtfAtIndex(dst, charsToRead) + TCL_UTF_MAX - dst;
+	    dstLimit = Tcl_UtfAtIndex(dst, charsToRead) - 1 + TCL_UTF_MAX - dst;
 	    statePtr->flags = savedFlags;
 	    statePtr->inputEncodingFlags = savedIEFlags;
 	    statePtr->inputEncodingState = savedState;
author	dgp <dgp@users.sourceforge.net>	2014-12-23 18:39:35 (GMT)
committer	dgp <dgp@users.sourceforge.net>	2014-12-23 18:39:35 (GMT)
commit	c826f6d268355204eeef4506cec6a8948cc290a3 (patch)
tree	94a25cd0a1f31aa5532c59bd74202b7469ab8311
parent	dd4496550090acf79f5e144602c745533549eea5 (diff)
parent	efce992c9a86e2e12f0142274afa9d58012b5870 (diff)
download	tcl-c826f6d268355204eeef4506cec6a8948cc290a3.zip tcl-c826f6d268355204eeef4506cec6a8948cc290a3.tar.gz tcl-c826f6d268355204eeef4506cec6a8948cc290a3.tar.bz2