diff options
author | dgp <dgp@users.sourceforge.net> | 2009-02-22 04:38:58 (GMT) |
---|---|---|
committer | dgp <dgp@users.sourceforge.net> | 2009-02-22 04:38:58 (GMT) |
commit | 300db4575ea04674f1f3ec8a0659075db7d18ac3 (patch) | |
tree | 17fda4b54b3e6a5ef902d948f42f9b8868170807 | |
parent | cabffe92aa625b6d83881d3f28857b4c50775aa5 (diff) | |
download | tcl-300db4575ea04674f1f3ec8a0659075db7d18ac3.zip tcl-300db4575ea04674f1f3ec8a0659075db7d18ac3.tar.gz tcl-300db4575ea04674f1f3ec8a0659075db7d18ac3.tar.bz2 |
* generic/tclStringObj.c: Several revisions to the shimmering
patterns between Unicode and UTF string reps. Most notably the
call: objPtr = Tcl_NewUnicodeObj(...,0); followed by a loop of calls:
Tcl_AppendUnicodeToObj(objPtr, u, n); will now grow and append to
the Unicode representation. Before this commit, the sequence would
convert each append to UTF and perform the append to the UTF rep.
That old behavior is puzzling and was likely a bug. The performance of [string map]
is significantly improved by this change (according to the MAP
collection of benchmarks in tclbench). Just in case there was some
wisdom in the old ways that I missed, I left in the ability to restore
the old patterns with a #define COMPAT 1 at the top of the file.
-rw-r--r-- | ChangeLog | 14 | ||||
-rw-r--r-- | generic/tclStringObj.c | 82 |
2 files changed, 79 insertions, 17 deletions
@@ -1,3 +1,17 @@ +2009-02-21 Don Porter <dgp@users.sourceforge.net> + + * generic/tclStringObj.c: Several revisions to the shimmering + patterns between Unicode and UTF string reps. Most notably the + call: objPtr = Tcl_NewUnicodeObj(...,0); followed by a loop of calls: + Tcl_AppendUnicodeToObj(objPtr, u, n); will now grow and append to + the Unicode representation. Before this commit, the sequence would + convert each append to UTF and perform the append to the UTF rep. + This is puzzling and likely a bug. The performance of [string map] + is significantly improved by this change (according to the MAP + collection of benchmarks in tclbench). Just in case there was some + wisdom in the old ways that I missed, I left in the ability to restore + the old patterns with a #define COMPAT 1 at the top of the file. + 2009-02-20 Don Porter <dgp@users.sourceforge.net> * generic/tclPathObj.c: Fixed mistaken logic in TclFSGetPathType() diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 23f98e0..716c272 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -33,12 +33,20 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclStringObj.c,v 1.120 2009/02/19 14:55:55 dgp Exp $ */ + * RCS: @(#) $Id: tclStringObj.c,v 1.121 2009/02/22 04:38:58 dgp Exp $ */ #include "tclInt.h" #include "tommath.h" /* + * Set COMPAT to 1 to restore the shimmering patterns to those of Tcl 8.5. + * This is an escape hatch in case the changes have some unexpected unwelcome + * impact on performance. If things go well, this mechanism can go away when + * post-8.6 development begins. 
+ */ +#define COMPAT 0 + +/* * Prototypes for functions defined later in this file: */ @@ -484,18 +492,12 @@ Tcl_GetCharLength( TclNumUtfChars(numChars, objPtr->bytes, objPtr->length); stringPtr->numChars = numChars; - /* - * Disabled the auto-fill of the unicode rep when multi-byte - * characters have been detected, on the YAGNI principle. - */ -#if 1 +#if COMPAT if (numChars < objPtr->length) { /* * Since we've just computed the number of chars, and not all * UTF chars are 1-byte long, go ahead and populate the unicode * string. - * - * TODO: Examine does this really help? How? */ FillUnicodeRep(objPtr); @@ -1215,8 +1217,11 @@ Tcl_AppendUnicodeToObj( * objPtr's string rep. */ - /* TODO: shift appends to empty to work on Unicode? */ - if (stringPtr->hasUnicode && stringPtr->numChars > 0) { + if (stringPtr->hasUnicode +#if COMPAT + && stringPtr->numChars > 0 +#endif + ) { AppendUnicodeToUnicodeRep(objPtr, unicode, length); } else { AppendUnicodeToUtfRep(objPtr, unicode, length); @@ -1291,8 +1296,11 @@ Tcl_AppendObjToObj( * appendObjPtr and append it. */ - /* TODO: optimize unicode appends */ - if (stringPtr->hasUnicode && stringPtr->numChars > 0) { + if (stringPtr->hasUnicode +#if COMPAT + && stringPtr->numChars > 0 +#endif + ) { /* * If appendObjPtr is not of the "String" type, don't convert it. */ @@ -1325,7 +1333,11 @@ Tcl_AppendObjToObj( AppendUtfToUtfRep(objPtr, bytes, length); - if (numChars >= 0 && appendNumChars >= 0) { + if (numChars >= 0 && appendNumChars >= 0 +#if COMPAT + && appendNumChars == length +#endif + ) { stringPtr->numChars = numChars + appendNumChars; } } @@ -1443,9 +1455,10 @@ AppendUnicodeToUtfRep( stringPtr->numChars += numChars; } - /* TODO: Condition on (numChars > 0) ? or change caller & eliminate ? 
*/ +#if COMPAT /* Invalidate the unicode rep */ stringPtr->hasUnicode = 0; +#endif } /* @@ -2674,6 +2687,43 @@ DupStringInternalRep( String *srcStringPtr = GET_STRING(srcPtr); String *copyStringPtr = NULL; +#if COMPAT==0 + if (srcStringPtr->numChars == -1) { + /* + * The String struct in the source value holds zero useful data. + * Don't bother copying it. Don't even bother allocating space in + * which to copy it. Just let the copy be untyped. + */ + return; + } + + if (srcStringPtr->hasUnicode) { + int copyMaxChars; + if (srcStringPtr->maxChars / 2 >= srcStringPtr->numChars) { + copyMaxChars = 2 * srcStringPtr->numChars; + } else { + copyMaxChars = srcStringPtr->maxChars; + } + copyStringPtr = stringAlloc(copyMaxChars); + copyStringPtr->maxChars = copyMaxChars; + memcpy(copyStringPtr->unicode, srcStringPtr->unicode, + srcStringPtr->numChars * sizeof(Tcl_UniChar)); + copyStringPtr->unicode[srcStringPtr->numChars] = 0; + } else { + copyStringPtr = stringAlloc(0); + copyStringPtr->maxChars = 0; + copyStringPtr->unicode[0] = 0; + } + copyStringPtr->hasUnicode = srcStringPtr->hasUnicode; + copyStringPtr->numChars = srcStringPtr->numChars; + + /* + * Tricky point: the string value was copied by generic object + * management code, so it doesn't contain any extra bytes that + * might exist in the source object. + */ + copyStringPtr->allocated = copyPtr->bytes ? copyPtr->length : 0; +#else /* * If the src obj is a string of 1-byte Utf chars, then copy the string * rep of the source object and create an "empty" Unicode internal rep for @@ -2683,8 +2733,6 @@ DupStringInternalRep( if (srcStringPtr->hasUnicode && srcStringPtr->numChars > 0) { /* Copy the full allocation for the Unicode buffer. 
*/ - /* TODO: consider a more limited copy to the min of - * the current maxChars value and twice the current numChars */ copyStringPtr = stringAlloc(srcStringPtr->maxChars); copyStringPtr->maxChars = srcStringPtr->maxChars; memcpy(copyStringPtr->unicode, srcStringPtr->unicode, @@ -2692,7 +2740,6 @@ DupStringInternalRep( copyStringPtr->unicode[srcStringPtr->numChars] = 0; copyStringPtr->allocated = 0; } else { - /* TODO: consider not bothering to make a String intrep. */ copyStringPtr = stringAlloc(0); copyStringPtr->unicode[0] = 0; copyStringPtr->maxChars = 0; @@ -2705,6 +2752,7 @@ DupStringInternalRep( } copyStringPtr->numChars = srcStringPtr->numChars; copyStringPtr->hasUnicode = srcStringPtr->hasUnicode; +#endif SET_STRING(copyPtr, copyStringPtr); copyPtr->typePtr = &tclStringType; |