[Bug 3285375]: Rewrite Tcl_Concat*() and [string trim*].

author: dgp <dgp@users.sourceforge.net> 2011-04-13 20:27:29 (GMT)
committer: dgp <dgp@users.sourceforge.net> 2011-04-13 20:27:29 (GMT)
commit: ef9aca4d1c097c0bb5c50d64ed819c38baef6387 (patch)
tree: ed1d89e9c50ba82b9724bfc18033907549896ec3
parent: bccb266ff70f020e3d62ef686b1bd7e79b2245ae (diff)
download: tcl-ef9aca4d1c097c0bb5c50d64ed819c38baef6387.zip
tcl-ef9aca4d1c097c0bb5c50d64ed819c38baef6387.tar.gz
tcl-ef9aca4d1c097c0bb5c50d64ed819c38baef6387.tar.bz2
4 files changed, 196 insertions, 185 deletions
diff --git a/ChangeLog b/ChangeLog
index e90ee9d..508a72d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2011-04-13  Don Porter  <dgp@users.sourceforge.net>
+
+	* generic/tclUtil.c:	Rewrite of Tcl_Concat*() routines to 
+	prevent segfaults on buffer overflow.  Build them out of existing
+	primitives already coded to handle overflow properly.  Uses the
+	new TclTrim*() routines. [Bug 3285375]
+
+	* generic/tclCmdMZ.c:	New internal utility routines TclTrimLeft()
+	* generic/tclInt.h:	and TclTrimRight().  Refactor the
+	* generic/tclUtil.c:	[string trim*] implementations to use them.
+
 2011-04-13  Miguel Sofer  <msofer@users.sf.net>
 
 	* generic/tclVar.c: fix for [Bug 2662380], crash caused by
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index cf74db5..7c3855c 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -3108,10 +3108,8 @@ StringTrimCmd(
     int objc,			/* Number of arguments. */
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
-    Tcl_UniChar ch, trim;
-    register const char *p, *end;
-    const char *check, *checkEnd, *string1, *string2;
-    int offset, length1, length2;
+    const char *string1, *string2;
+    int triml, trimr, length1, length2;
 
     if (objc == 3) {
 	string2 = TclGetStringFromObj(objv[2], &length2);
@@ -3123,58 +3121,12 @@ StringTrimCmd(
 	return TCL_ERROR;
     }
     string1 = TclGetStringFromObj(objv[1], &length1);
-    checkEnd = string2 + length2;
 
-    /*
-     * The outer loop iterates over the string. The inner loop iterates over
-     * the trim characters. The loops terminate as soon as a non-trim
-     * character is discovered and string1 is left pointing at the first
-     * non-trim character.
-     */
-
-    end = string1 + length1;
-    for (p = string1; p < end; p += offset) {
-	offset = TclUtfToUniChar(p, &ch);
-
-	for (check = string2; ; ) {
-	    if (check >= checkEnd) {
-		p = end;
-		break;
-	    }
-	    check += TclUtfToUniChar(check, &trim);
-	    if (ch == trim) {
-		length1 -= offset;
-		string1 += offset;
-		break;
-	    }
-	}
-    }
-
-    /*
-     * The outer loop iterates over the string. The inner loop iterates over
-     * the trim characters. The loops terminate as soon as a non-trim
-     * character is discovered and length1 marks the last non-trim character.
-     */
-
-    end = string1;
-    for (p = string1 + length1; p > end; ) {
-	p = Tcl_UtfPrev(p, string1);
-	offset = TclUtfToUniChar(p, &ch);
-	check = string2;
-	while (1) {
-	    if (check >= checkEnd) {
-		p = end;
-		break;
-	    }
-	    check += TclUtfToUniChar(check, &trim);
-	    if (ch == trim) {
-		length1 -= offset;
-		break;
-	    }
-	}
-    }
+    triml = TclTrimLeft(string1, length1, string2, length2);
+    trimr = TclTrimRight(string1 + triml, length1 - triml, string2, length2);
 
-    Tcl_SetObjResult(interp, Tcl_NewStringObj(string1, length1));
+    Tcl_SetObjResult(interp,
+	    Tcl_NewStringObj(string1 + triml, length1 - triml - trimr));
     return TCL_OK;
 }
 
@@ -3204,10 +3156,8 @@ StringTrimLCmd(
     int objc,			/* Number of arguments. */
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
-    Tcl_UniChar ch, trim;
-    register const char *p, *end;
-    const char *check, *checkEnd, *string1, *string2;
-    int offset, length1, length2;
+    const char *string1, *string2;
+    int trim, length1, length2;
 
     if (objc == 3) {
 	string2 = TclGetStringFromObj(objv[2], &length2);
@@ -3219,34 +3169,10 @@ StringTrimLCmd(
 	return TCL_ERROR;
     }
     string1 = TclGetStringFromObj(objv[1], &length1);
-    checkEnd = string2 + length2;
-
-    /*
-     * The outer loop iterates over the string. The inner loop iterates over
-     * the trim characters. The loops terminate as soon as a non-trim
-     * character is discovered and string1 is left pointing at the first
-     * non-trim character.
-     */
-
-    end = string1 + length1;
-    for (p = string1; p < end; p += offset) {
-	offset = TclUtfToUniChar(p, &ch);
 
-	for (check = string2; ; ) {
-	    if (check >= checkEnd) {
-		p = end;
-		break;
-	    }
-	    check += TclUtfToUniChar(check, &trim);
-	    if (ch == trim) {
-		length1 -= offset;
-		string1 += offset;
-		break;
-	    }
-	}
-    }
+    trim = TclTrimLeft(string1, length1, string2, length2);
 
-    Tcl_SetObjResult(interp, Tcl_NewStringObj(string1, length1));
+    Tcl_SetObjResult(interp, Tcl_NewStringObj(string1+trim, length1-trim));
     return TCL_OK;
 }
 
@@ -3276,10 +3202,8 @@ StringTrimRCmd(
     int objc,			/* Number of arguments. */
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
-    Tcl_UniChar ch, trim;
-    register const char *p, *end;
-    const char *check, *checkEnd, *string1, *string2;
-    int offset, length1, length2;
+    const char *string1, *string2;
+    int trim, length1, length2;
 
     if (objc == 3) {
 	string2 = TclGetStringFromObj(objv[2], &length2);
@@ -3291,33 +3215,10 @@ StringTrimRCmd(
 	return TCL_ERROR;
     }
     string1 = TclGetStringFromObj(objv[1], &length1);
-    checkEnd = string2 + length2;
 
-    /*
-     * The outer loop iterates over the string. The inner loop iterates over
-     * the trim characters. The loops terminate as soon as a non-trim
-     * character is discovered and length1 marks the last non-trim character.
-     */
-
-    end = string1;
-    for (p = string1 + length1; p > end; ) {
-	p = Tcl_UtfPrev(p, string1);
-	offset = TclUtfToUniChar(p, &ch);
-	check = string2;
-	while (1) {
-	    if (check >= checkEnd) {
-		p = end;
-		break;
-	    }
-	    check += TclUtfToUniChar(check, &trim);
-	    if (ch == trim) {
-		length1 -= offset;
-		break;
-	    }
-	}
-    }
+    trim = TclTrimRight(string1, length1, string2, length2);
 
-    Tcl_SetObjResult(interp, Tcl_NewStringObj(string1, length1));
+    Tcl_SetObjResult(interp, Tcl_NewStringObj(string1, length1-trim));
     return TCL_OK;
 }
 
diff --git a/generic/tclInt.h b/generic/tclInt.h
index c966610..b8a4dfa 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -2784,6 +2784,10 @@ MODULE_SCOPE int	TclSubstTokens(Tcl_Interp *interp, Tcl_Token *tokenPtr,
 			    int *clNextOuter, CONST char *outerScript);
 MODULE_SCOPE void	TclTransferResult(Tcl_Interp *sourceInterp, int result,
 			    Tcl_Interp *targetInterp);
+MODULE_SCOPE int	TclTrimLeft(const char *bytes, int numBytes,
+			    const char *trim, int numTrim);
+MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
+			    const char *trim, int numTrim);
 MODULE_SCOPE Tcl_Obj *	TclpNativeToNormalized(ClientData clientData);
 MODULE_SCOPE Tcl_Obj *	TclpFilesystemPathType(Tcl_Obj *pathPtr);
 MODULE_SCOPE Tcl_PackageInitProc *TclpFindSymbol(Tcl_Interp *interp,
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index 3565165..2f2deec 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -937,6 +937,142 @@ Tcl_Backslash(
 /*
  *----------------------------------------------------------------------
  *
+ * TclTrimRight --
+ *	Takes two counted strings in the Tcl encoding which must both be
+ *	null terminated.  Conceptually trims from the right side of the
+ *	first string all characters found in the second string.
+ *
+ * Results:
+ *	The number of bytes to be removed from the end of the string.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclTrimRight(
+    const char *bytes,	/* String to be trimmed... */
+    int numBytes,	/* ...and its length in bytes */
+    const char *trim,	/* String of trim characters... */
+    int numTrim)	/* ...and its length in bytes */
+{
+    const char *p = bytes + numBytes;
+    int pInc;
+
+    if ((bytes[numBytes] != '\0') || (trim[numTrim] != '\0')) {
+	Tcl_Panic("TclTrimRight works only on null-terminated strings");
+    }
+
+    /* Empty strings -> nothing to do */
+    if ((numBytes == 0) || (numTrim == 0)) {
+	return 0;
+    }
+
+    /* Outer loop: iterate over string to be trimmed */
+    do {
+	Tcl_UniChar ch1;
+	const char *q = trim;
+	int bytesLeft = numTrim;
+
+	p = Tcl_UtfPrev(p, bytes);
+ 	pInc = TclUtfToUniChar(p, &ch1);
+
+	/* Inner loop: scan trim string for match to current character */
+	do {
+	    Tcl_UniChar ch2;
+	    int qInc = TclUtfToUniChar(q, &ch2);
+
+	    if (ch1 == ch2) {
+		break;
+	    }
+
+	    q += qInc;
+	    bytesLeft -= qInc;
+	} while (bytesLeft);
+
+	if (bytesLeft == 0) {
+	    /* No match; trim task done; *p is last non-trimmed char */
+	    break;
+	}
+	pInc = 0;
+    } while (p > bytes);
+
+    return numBytes - (p - bytes) - pInc;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * TclTrimLeft --
+ *	Takes two counted strings in the Tcl encoding which must both be
+ *	null terminated.  Conceptually trims from the left side of the
+ *	first string all characters found in the second string.
+ *
+ * Results:
+ *	An integer index into the first string, pointing to the first
+ *	character not to be trimmed.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclTrimLeft(
+    const char *bytes,	/* String to be trimmed... */
+    int numBytes,	/* ...and its length in bytes */
+    const char *trim,	/* String of trim characters... */
+    int numTrim)	/* ...and its length in bytes */
+{
+    const char *p = bytes;
+
+    if ((bytes[numBytes] != '\0') || (trim[numTrim] != '\0')) {
+	Tcl_Panic("TclTrimLeft works only on null-terminated strings");
+    }
+
+    /* Empty strings -> nothing to do */
+    if ((numBytes == 0) || (numTrim == 0)) {
+	return 0;
+    }
+
+    /* Outer loop: iterate over string to be trimmed */
+    do {
+	Tcl_UniChar ch1;
+	int pInc = TclUtfToUniChar(p, &ch1);
+	const char *q = trim;
+	int bytesLeft = numTrim;
+
+	/* Inner loop: scan trim string for match to current character */
+	do {
+	    Tcl_UniChar ch2;
+	    int qInc = TclUtfToUniChar(q, &ch2);
+
+	    if (ch1 == ch2) {
+		break;
+	    }
+
+	    q += qInc;
+	    bytesLeft -= qInc;
+	} while (bytesLeft);
+
+	if (bytesLeft == 0) {
+	    /* No match; trim task done; *p is first non-trimmed char */
+	    break;
+	}
+
+	p += pInc;
+	numBytes -= pInc;
+    } while (numBytes);
+
+    return p - bytes;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
  * Tcl_Concat --
  *
  *	Concatenate a set of strings into a single large string.
@@ -964,6 +1100,9 @@ Tcl_Concat(
 
     for (totalSize = 1, i = 0; i < argc; i++) {
 	totalSize += strlen(argv[i]) + 1;
+	if (totalSize <= 0) {
+	    Tcl_Panic("Tcl_Concat: max size of Tcl value exceeded");
+	}
     }
     result = (char *) ckalloc((unsigned) totalSize);
     if (argc == 0) {
@@ -1029,19 +1168,13 @@ Tcl_ConcatObj(
     int objc,			/* Number of objects to concatenate. */
     Tcl_Obj *CONST objv[])	/* Array of objects to concatenate. */
 {
-    int allocSize, finalSize, length, elemLength, i;
-    char *p;
-    char *element;
-    char *concatStr;
+    int i, needSpace = 0;
     Tcl_Obj *objPtr, *resPtr;
 
     /*
      * Check first to see if all the items are of list type or empty. If so,
      * we will concat them together as lists, and return a list object. This
-     * is only valid when the lists have no current string representation,
-     * since we don't know what the original type was. An original string rep
-     * may have lost some whitespace info when converted which could be
-     * important.
+     * is only valid when the lists are in canonical form.
      */
 
     for (i = 0;  i < objc;  i++) {
@@ -1100,79 +1233,41 @@ Tcl_ConcatObj(
      * the slow way, using the string representations.
      */
 
-    allocSize = 0;
+    TclNewObj(resPtr);
     for (i = 0;  i < objc;  i++) {
+	int trim, elemLength;
+	const char *element;
+	
 	objPtr = objv[i];
-	element = TclGetStringFromObj(objPtr, &length);
-	if ((element != NULL) && (length > 0)) {
-	    allocSize += (length + 1);
-	}
-    }
-    if (allocSize == 0) {
-	allocSize = 1;		/* enough for the NULL byte at end */
-    }
-
-    /*
-     * Allocate storage for the concatenated result. Note that allocSize is
-     * one more than the total number of characters, and so includes room for
-     * the terminating NULL byte.
-     */
-
-    concatStr = ckalloc((unsigned) allocSize);
+	element = TclGetStringFromObj(objPtr, &elemLength);
 
-    /*
-     * Now concatenate the elements. Clip white space off the front and back
-     * to generate a neater result, and ignore any empty elements. Also put a
-     * null byte at the end.
-     */
+	/* Trim away the leading whitespace */
+	trim = TclTrimLeft(element, elemLength, " \f\v\r\t\n", 6);
+	element += trim;
+	elemLength -= trim;
 
-    finalSize = 0;
-    if (objc == 0) {
-	*concatStr = '\0';
-    } else {
-	p = concatStr;
-	for (i = 0;  i < objc;  i++) {
-	    objPtr = objv[i];
-	    element = TclGetStringFromObj(objPtr, &elemLength);
-	    while ((elemLength > 0) && (UCHAR(*element) < 127)
-		    && isspace(UCHAR(*element))) { /* INTL: ISO C space. */
-		element++;
-		elemLength--;
-	    }
+	/*
+	 * Trim away the trailing whitespace.  Do not permit trimming
+	 * to expose a final backslash character.
+	 */
 
-	    /*
-	     * Trim trailing white space. But, be careful not to trim a space
-	     * character if it is preceded by a backslash: in this case it
-	     * could be significant.
-	     */
+	trim = TclTrimRight(element, elemLength, " \f\v\r\t\n", 6);
+	trim -= trim && (element[elemLength - trim - 1] == '\\');
+	elemLength -= trim;
 
-	    while ((elemLength > 0) && (UCHAR(element[elemLength-1]) < 127)
-		    && isspace(UCHAR(element[elemLength-1]))
-						/* INTL: ISO C space. */
-		    && ((elemLength < 2) || (element[elemLength-2] != '\\'))) {
-		elemLength--;
-	    }
-	    if (elemLength == 0) {
-		continue;	/* nothing left of this element */
-	    }
-	    memcpy(p, element, (size_t) elemLength);
-	    p += elemLength;
-	    *p = ' ';
-	    p++;
-	    finalSize += (elemLength + 1);
+	/* If we're left with empty element after trimming, do nothing */
+	if (elemLength == 0) {
+	    continue;
 	}
-	if (p != concatStr) {
-	    p[-1] = 0;
-	    finalSize -= 1;	/* we overwrote the final ' ' */
-	} else {
-	    *p = 0;
+
+	/* Append to the result with space if needed */
+	if (needSpace) {
+	    Tcl_AppendToObj(resPtr, " ", 1);
 	}
+	Tcl_AppendToObj(resPtr, element, elemLength);
+	needSpace = 1;
     }
-
-    TclNewObj(objPtr);
-    objPtr->bytes = concatStr;
-    objPtr->length = finalSize;
-    return objPtr;
+    return resPtr;
 }
 
 /*
author	dgp <dgp@users.sourceforge.net>	2011-04-13 20:27:29 (GMT)
committer	dgp <dgp@users.sourceforge.net>	2011-04-13 20:27:29 (GMT)
commit	ef9aca4d1c097c0bb5c50d64ed819c38baef6387 (patch)
tree	ed1d89e9c50ba82b9724bfc18033907549896ec3
parent	bccb266ff70f020e3d62ef686b1bd7e79b2245ae (diff)
download	tcl-ef9aca4d1c097c0bb5c50d64ed819c38baef6387.zip tcl-ef9aca4d1c097c0bb5c50d64ed819c38baef6387.tar.gz tcl-ef9aca4d1c097c0bb5c50d64ed819c38baef6387.tar.bz2