From 300db4575ea04674f1f3ec8a0659075db7d18ac3 Mon Sep 17 00:00:00 2001
From: dgp <dgp@users.sourceforge.net>
Date: Sun, 22 Feb 2009 04:38:58 +0000
Subject:         * generic/tclStringObj.c:       Several revisions to the
 shimmering         patterns between Unicode and UTF string reps.  Most
 notably the         call: objPtr = Tcl_NewUnicodeObj(...,0); followed by a
 loop of calls:         Tcl_AppendUnicodeToObj(objPtr, u, n); will now grow
 and append to         the Unicode representation.  Before this commit, the
 sequence would         convert each append to UTF and perform the append to
 the UTF rep.         This is puzzling and likely a bug.  The performance of
 [string map]         is significantly improved by this change (according to
 the MAP         collection of benchmarks in tclbench).  Just in case there
 was some         wisdom in the old ways that I missed, I left in the ability
 to restore         the old patterns with a #define COMPAT 1 at the top of the
 file.

---
 ChangeLog              | 14 +++++++++
 generic/tclStringObj.c | 82 +++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index d456acb..44c049d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2009-02-21  Don Porter  <dgp@users.sourceforge.net>
+
+	* generic/tclStringObj.c:	Several revisions to the shimmering
+	patterns between Unicode and UTF string reps.  Most notably the 
+	call: objPtr = Tcl_NewUnicodeObj(...,0); followed by a loop of calls:
+	Tcl_AppendUnicodeToObj(objPtr, u, n); will now grow and append to
+	the Unicode representation.  Before this commit, the sequence would
+	convert each append to UTF and perform the append to the UTF rep.
+	This is puzzling and likely a bug.  The performance of [string map]
+	is significantly improved by this change (according to the MAP
+	collection of benchmarks in tclbench).  Just in case there was some
+	wisdom in the old ways that I missed, I left in the ability to restore
+	the old patterns with a #define COMPAT 1 at the top of the file.
+
 2009-02-20  Don Porter  <dgp@users.sourceforge.net>
 
 	* generic/tclPathObj.c:	Fixed mistaken logic in TclFSGetPathType()
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 23f98e0..716c272 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -33,12 +33,20 @@
  * See the file "license.terms" for information on usage and redistribution of
  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclStringObj.c,v 1.120 2009/02/19 14:55:55 dgp Exp $ */
+ * RCS: @(#) $Id: tclStringObj.c,v 1.121 2009/02/22 04:38:58 dgp Exp $ */
 
 #include "tclInt.h"
 #include "tommath.h"
 
 /*
+ * Set COMPAT to 1 to restore the shimmering patterns to those of Tcl 8.5.
+ * This is an escape hatch in case the changes have some unexpected unwelcome
+ * impact on performance.  If things go well, this mechanism can go away when
+ * post-8.6 development begins.
+ */
+#define COMPAT 0
+
+/*
  * Prototypes for functions defined later in this file:
  */
 
@@ -484,18 +492,12 @@ Tcl_GetCharLength(
 	TclNumUtfChars(numChars, objPtr->bytes, objPtr->length);
 	stringPtr->numChars = numChars;
 
-	/*
-	 * Disabled the auto-fill of the unicode rep when multi-byte
-	 * characters have been detected, on the YAGNI principle.
-	 */
-#if 1
+#if COMPAT
 	if (numChars < objPtr->length) {
 	    /*
 	     * Since we've just computed the number of chars, and not all
 	     * UTF chars are 1-byte long, go ahead and populate the unicode
 	     * string.
-	     *
-	     * TODO: Examine does this really help?  How?
 	     */
 
 	    FillUnicodeRep(objPtr);
@@ -1215,8 +1217,11 @@ Tcl_AppendUnicodeToObj(
      * objPtr's string rep.
      */
 
-    /* TODO: shift appends to empty to work on Unicode? */
-    if (stringPtr->hasUnicode && stringPtr->numChars > 0) {
+    if (stringPtr->hasUnicode
+#if COMPAT
+		&& stringPtr->numChars > 0
+#endif
+	    ) {
 	AppendUnicodeToUnicodeRep(objPtr, unicode, length);
     } else {
 	AppendUnicodeToUtfRep(objPtr, unicode, length);
@@ -1291,8 +1296,11 @@ Tcl_AppendObjToObj(
      * appendObjPtr and append it.
      */
 
-    /* TODO: optimize unicode appends */
-    if (stringPtr->hasUnicode && stringPtr->numChars > 0) {
+    if (stringPtr->hasUnicode 
+#if COMPAT
+		&& stringPtr->numChars > 0
+#endif
+	    ) {
 	/*
 	 * If appendObjPtr is not of the "String" type, don't convert it.
 	 */
@@ -1325,7 +1333,11 @@ Tcl_AppendObjToObj(
 
     AppendUtfToUtfRep(objPtr, bytes, length);
 
-    if (numChars >= 0 && appendNumChars >= 0) {
+    if (numChars >= 0 && appendNumChars >= 0
+#if COMPAT
+		&& appendNumChars == length
+#endif
+	    ) {
 	stringPtr->numChars = numChars + appendNumChars;
     }
 }
@@ -1443,9 +1455,10 @@ AppendUnicodeToUtfRep(
 	stringPtr->numChars += numChars;
     }
 
-    /* TODO: Condition on (numChars > 0) ?  or change caller & eliminate ? */
+#if COMPAT
     /* Invalidate the unicode rep */
     stringPtr->hasUnicode = 0;
+#endif
 }
 
 /*
@@ -2674,6 +2687,43 @@ DupStringInternalRep(
     String *srcStringPtr = GET_STRING(srcPtr);
     String *copyStringPtr = NULL;
 
+#if COMPAT==0
+    if (srcStringPtr->numChars == -1) {
+	/*
+	 * The String struct in the source value holds zero useful data.
+	 * Don't bother copying it.  Don't even bother allocating space in
+	 * which to copy it.  Just let the copy be untyped.
+	 */
+	return;
+    }
+
+    if (srcStringPtr->hasUnicode) {
+	int copyMaxChars;
+	if (srcStringPtr->maxChars / 2 >= srcStringPtr->numChars) {
+	    copyMaxChars = 2 * srcStringPtr->numChars;
+	} else {
+	    copyMaxChars = srcStringPtr->maxChars;
+	}
+	copyStringPtr = stringAlloc(copyMaxChars);
+	copyStringPtr->maxChars = copyMaxChars;
+	memcpy(copyStringPtr->unicode, srcStringPtr->unicode,
+		srcStringPtr->numChars * sizeof(Tcl_UniChar));
+	copyStringPtr->unicode[srcStringPtr->numChars] = 0;
+    } else {
+	copyStringPtr = stringAlloc(0);
+	copyStringPtr->maxChars = 0;
+	copyStringPtr->unicode[0] = 0;
+    }
+    copyStringPtr->hasUnicode = srcStringPtr->hasUnicode;
+    copyStringPtr->numChars = srcStringPtr->numChars;
+
+    /*
+     * Tricky point: the string value was copied by generic object
+     * management code, so it doesn't contain any extra bytes that
+     * might exist in the source object.
+     */
+    copyStringPtr->allocated = copyPtr->bytes ? copyPtr->length : 0;
+#else
     /*
      * If the src obj is a string of 1-byte Utf chars, then copy the string
      * rep of the source object and create an "empty" Unicode internal rep for
@@ -2683,8 +2733,6 @@ DupStringInternalRep(
 
     if (srcStringPtr->hasUnicode && srcStringPtr->numChars > 0) {
 	/* Copy the full allocation for the Unicode buffer. */
-	/* TODO: consider a more limited copy to the min of
-	 * the current maxChars value and twice the current numChars */
 	copyStringPtr = stringAlloc(srcStringPtr->maxChars);
 	copyStringPtr->maxChars = srcStringPtr->maxChars;
 	memcpy(copyStringPtr->unicode, srcStringPtr->unicode,
@@ -2692,7 +2740,6 @@ DupStringInternalRep(
 	copyStringPtr->unicode[srcStringPtr->numChars] = 0;
 	copyStringPtr->allocated = 0;
     } else {
-	/* TODO: consider not bothering to make a String intrep. */
 	copyStringPtr = stringAlloc(0);
 	copyStringPtr->unicode[0] = 0;
 	copyStringPtr->maxChars = 0;
@@ -2705,6 +2752,7 @@ DupStringInternalRep(
     }
     copyStringPtr->numChars = srcStringPtr->numChars;
     copyStringPtr->hasUnicode = srcStringPtr->hasUnicode;
+#endif
 
     SET_STRING(copyPtr, copyStringPtr);
     copyPtr->typePtr = &tclStringType;
-- 
cgit v0.12