* generic/tclStringObj.c: Revisions so that we avoid creating

the strange representation of an empty string with objPtr->bytes == NULL and stringPtr->hasUnicode == 0. Instead, in the situations where that was being created, create a traditional two-legged stork representation (objPtr->bytes = tclEmptyStringRep and stringPtr->hasUnicode = 1). In the situations where the strange rep was treated differently, continue to do so by testing stringPtr->numChars == 0 to detect it. These changes make the code more conventional and therefore easier for new maintainers to pick up. They also set up further simplifications.
author: dgp <dgp@users.sourceforge.net> 2009-02-14 23:07:17 (GMT)
committer: dgp <dgp@users.sourceforge.net> 2009-02-14 23:07:17 (GMT)
commit: 9325ac89905cf6cadf12491e007de22301362651 (patch)
tree: 0444ca159ecb1aa9c156bec35a4b82953ce34b43 /generic/tclStringObj.c
parent: 34bab026d12e407d9bf7e5d9eca2012f2e9a1f97 (diff)
download: tcl-9325ac89905cf6cadf12491e007de22301362651.zip
tcl-9325ac89905cf6cadf12491e007de22301362651.tar.gz
tcl-9325ac89905cf6cadf12491e007de22301362651.tar.bz2
1 files changed, 73 insertions, 51 deletions
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 8d24001..9112572 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -33,7 +33,7 @@
  * See the file "license.terms" for information on usage and redistribution of
  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclStringObj.c,v 1.104 2009/02/13 14:45:54 dgp Exp $ */
+ * RCS: @(#) $Id: tclStringObj.c,v 1.105 2009/02/14 23:07:17 dgp Exp $ */
 
 #include "tclInt.h"
 #include "tommath.h"
@@ -64,6 +64,7 @@ static void		FreeStringInternalRep(Tcl_Obj *objPtr);
 static int		SetStringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);
 static void		SetUnicodeObj(Tcl_Obj *objPtr,
 			    const Tcl_UniChar *unicode, int numChars);
+static int		UnicodeLength(const Tcl_UniChar *unicode);
 static void		UpdateStringOfString(Tcl_Obj *objPtr);
 
 /*
@@ -406,7 +407,7 @@ Tcl_GetCharLength(
 	 * Disabled the auto-fill of the unicode rep when multi-byte
 	 * characters have been detected, on the YAGNI principle.
 	 */
-#if 0
+#if 1
 	if (numChars < objPtr->length) {
 	    /*
 	     * Since we've just computed the number of chars, and not all
@@ -723,7 +724,6 @@ Tcl_SetObjLength(
     }
 
     SetStringFromAny(NULL, objPtr);
-
     stringPtr = GET_STRING(objPtr);
 
     /*
@@ -761,6 +761,8 @@ Tcl_SetObjLength(
 	    objPtr->bytes[length] = 0;
 	}
 
+	/* Note: here we can get an empty string != tclEmptyStringRep */
+
 	/*
 	 * Invalidate the unicode data.
 	 */
@@ -779,16 +781,21 @@ Tcl_SetObjLength(
 	    SET_STRING(objPtr, stringPtr);
 	    stringPtr->uallocated = uallocated;
 	}
+
+	/* Mark the new end of the unicode string */
 	stringPtr->numChars = length;
-	stringPtr->hasUnicode = (length > 0);
+	stringPtr->unicode[length] = 0;
+	stringPtr->hasUnicode = 1;
 
 	/*
-	 * Ensure the string is NUL-terminated.
+	 * Can only get here when objPtr->bytes == NULL.
+	 * No need to invalidate the string rep.
 	 */
 
-	stringPtr->unicode[length] = 0;
-	stringPtr->allocated = 0;
-	objPtr->length = 0;
+	if (length == 0) {
+	    /* For the empty string case, set the string rep. */
+	    TclInitStringRep(objPtr, tclEmptyStringRep, 0);
+	}
     }
 }
 
@@ -840,7 +847,6 @@ Tcl_AttemptSetObjLength(
     }
 
     SetStringFromAny(NULL, objPtr);
-
     stringPtr = GET_STRING(objPtr);
 
     /*
@@ -905,16 +911,21 @@ Tcl_AttemptSetObjLength(
 	    SET_STRING(objPtr, stringPtr);
 	    stringPtr->uallocated = uallocated;
 	}
+
+	/* Mark the new end of the unicode string */
+	stringPtr->unicode[length] = 0;
 	stringPtr->numChars = length;
-	stringPtr->hasUnicode = (length > 0);
+	stringPtr->hasUnicode = 1;
 
 	/*
-	 * Ensure the string is NUL-terminated.
+	 * Can only get here when objPtr->bytes == NULL.
+	 * No need to invalidate the string rep.
 	 */
 
-	stringPtr->unicode[length] = 0;
-	stringPtr->allocated = 0;
-	objPtr->length = 0;
+	if (length == 0) {
+	    /* For the empty string case, set the string rep. */
+	    TclInitStringRep(objPtr, tclEmptyStringRep, 0);
+	}
     }
     return 1;
 }
@@ -988,17 +999,22 @@ SetUnicodeObj(
 
     uallocated = STRING_UALLOC(numChars);
     stringPtr = stringAlloc(uallocated);
+    SET_STRING(objPtr, stringPtr);
+    objPtr->typePtr = &tclStringType;
 
-    stringPtr->numChars = numChars;
     stringPtr->uallocated = uallocated;
-    stringPtr->hasUnicode = (numChars > 0);
-    stringPtr->allocated = 0;
     memcpy(stringPtr->unicode, unicode, uallocated);
     stringPtr->unicode[numChars] = 0;
+    stringPtr->numChars = numChars;
+    stringPtr->hasUnicode = 1;
 
     TclInvalidateStringRep(objPtr);
-    objPtr->typePtr = &tclStringType;
-    SET_STRING(objPtr, stringPtr);
+    stringPtr->allocated = 0;
+
+    if (numChars == 0) {
+	/* For the empty string case, set the string rep. */
+	TclInitStringRep(objPtr, tclEmptyStringRep, 0);
+    }
 }
 
 /*
@@ -1040,8 +1056,6 @@ Tcl_AppendLimitedToObj(
 	Tcl_Panic("%s called with shared object", "Tcl_AppendLimitedToObj");
     }
 
-    SetStringFromAny(NULL, objPtr);
-
     if (length < 0) {
 	length = (bytes ? strlen(bytes) : 0);
     }
@@ -1064,8 +1078,10 @@ Tcl_AppendLimitedToObj(
      * objPtr's string rep.
      */
 
+    SetStringFromAny(NULL, objPtr);
     stringPtr = GET_STRING(objPtr);
-    if (stringPtr->hasUnicode != 0) {
+
+    if (stringPtr->hasUnicode && stringPtr->numChars > 0) {
 	AppendUtfToUnicodeRep(objPtr, bytes, toCopy);
     } else {
 	AppendUtfToUtfRep(objPtr, bytes, toCopy);
@@ -1076,7 +1092,7 @@ Tcl_AppendLimitedToObj(
     }
 
     stringPtr = GET_STRING(objPtr);
-    if (stringPtr->hasUnicode != 0) {
+    if (stringPtr->hasUnicode && stringPtr->numChars > 0) {
 	AppendUtfToUnicodeRep(objPtr, ellipsis, strlen(ellipsis));
     } else {
 	AppendUtfToUtfRep(objPtr, ellipsis, strlen(ellipsis));
@@ -1155,7 +1171,8 @@ Tcl_AppendUnicodeToObj(
      * objPtr's string rep.
      */
 
-    if (stringPtr->hasUnicode != 0) {
+    /* TODO: shift appends to empty to work on Unicode? */
+    if (stringPtr->hasUnicode && stringPtr->numChars > 0) {
 	AppendUnicodeToUnicodeRep(objPtr, unicode, length);
     } else {
 	AppendUnicodeToUtfRep(objPtr, unicode, length);
@@ -1223,21 +1240,23 @@ Tcl_AppendObjToObj(
      */
 
     SetStringFromAny(NULL, objPtr);
+    stringPtr = GET_STRING(objPtr);
 
     /*
      * If objPtr has a valid Unicode rep, then get a Unicode string from
      * appendObjPtr and append it.
      */
 
-    stringPtr = GET_STRING(objPtr);
-    if (stringPtr->hasUnicode != 0) {
+    /* TODO: Check that append to self works */
+
+    if (stringPtr->hasUnicode && stringPtr->numChars > 0) {
 	/*
 	 * If appendObjPtr is not of the "String" type, don't convert it.
 	 */
 
 	if (appendObjPtr->typePtr == &tclStringType) {
 	    stringPtr = GET_STRING(appendObjPtr);
-	    if ((stringPtr->numChars == -1) || (stringPtr->hasUnicode == 0)) {
+	    if (stringPtr->hasUnicode == 0) {
 		/*
 		 * If appendObjPtr is a string obj with no valid Unicode rep,
 		 * then fill its unicode rep.
@@ -1267,6 +1286,7 @@ Tcl_AppendObjToObj(
     numChars = stringPtr->numChars;
     if ((numChars >= 0) && (appendObjPtr->typePtr == &tclStringType)) {
 	stringPtr = GET_STRING(appendObjPtr);
+	/* TODO why is the == length test needed here? */
 	if ((stringPtr->numChars >= 0) && (stringPtr->numChars == length)) {
 	    numChars += stringPtr->numChars;
 	    allOneByteChars = 1;
@@ -1386,6 +1406,7 @@ AppendUnicodeToUtfRep(
 	stringPtr->numChars += numChars;
     }
 
+    /* TODO: Condition on (numChars > 0) ?  or change caller & eliminate ? */
     /* Invalidate the unicode rep */
     stringPtr->hasUnicode = 0;
 }
@@ -1580,6 +1601,7 @@ Tcl_AppendStringsToObjVA(
     }
 
     stringPtr = GET_STRING(objPtr);
+    /* TODO: pure unicode will crash! */
     if (oldLength + newLength > stringPtr->allocated) {
 	/*
 	 * There isn't currently enough space in the string representation, so
@@ -2708,12 +2730,16 @@ ExtendUnicodeRepWithString(
 	SET_STRING(objPtr, stringPtr);
     }
 
-    stringPtr->hasUnicode = (needed > 0);
+    stringPtr->hasUnicode = 1;
     stringPtr->numChars = needed;
     for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) {
 	bytes += TclUtfToUniChar(bytes, dst);
     }
     *dst = 0;
+    if (needed == 0) {
+	/* For the empty string case, set the string rep. */
+	TclInitStringRep(objPtr, tclEmptyStringRep, 0);
+    }
 }
 
 /*
@@ -2744,9 +2770,6 @@ DupStringInternalRep(
     String *srcStringPtr = GET_STRING(srcPtr);
     String *copyStringPtr = NULL;
 
-    /* TODO: Consider not copying String intrep when just a utf string. */
-    /* TODO: Consider not copying extra space. */
-
     /*
      * If the src obj is a string of 1-byte Utf chars, then copy the string
      * rep of the source object and create an "empty" Unicode internal rep for
@@ -2754,28 +2777,31 @@ DupStringInternalRep(
      * the string rep of the new object.
      */
 
-    if (srcStringPtr->hasUnicode == 0) {
-	copyStringPtr = (String *) ckalloc((unsigned) sizeof(String));
-	copyStringPtr->uallocated = 0;
-    } else {
+    if (srcStringPtr->hasUnicode && srcStringPtr->numChars > 0) {
+	/* Copy the full allocation for the Unicode buffer. */
+	/* TODO: consider a more limited copy to the min of
+	 * the current uallocated value and twice the current numChars */
 	copyStringPtr = stringAlloc(srcStringPtr->uallocated);
 	copyStringPtr->uallocated = srcStringPtr->uallocated;
-
 	memcpy(copyStringPtr->unicode, srcStringPtr->unicode,
 		(size_t) srcStringPtr->numChars * sizeof(Tcl_UniChar));
 	copyStringPtr->unicode[srcStringPtr->numChars] = 0;
+	copyStringPtr->allocated = 0;
+    } else {
+	/* TODO: consider not bothering to make a String intrep. */
+	copyStringPtr = (String *) ckalloc((unsigned) sizeof(String));
+	copyStringPtr->unicode[0] = 0;
+	copyStringPtr->uallocated = 0;
+	/*
+	 * Tricky point: the string value was copied by generic object
+	 * management code, so it doesn't contain any extra bytes that
+	 * might exist in the source object.
+	 */
+	copyStringPtr->allocated = copyPtr->length;
     }
     copyStringPtr->numChars = srcStringPtr->numChars;
     copyStringPtr->hasUnicode = srcStringPtr->hasUnicode;
 
-    /*
-     * Tricky point: the string value was copied by generic object management
-     * code, so it doesn't contain any extra bytes that might exist in the
-     * source object.
-     */
-
-    copyStringPtr->allocated = copyPtr->length;
-
     SET_STRING(copyPtr, copyStringPtr);
     copyPtr->typePtr = &tclStringType;
 }
@@ -2871,17 +2897,12 @@ ExtendStringRepWithUnicode(
     }
 
     if (numChars == 0) {
-	if (objPtr->bytes == NULL) {
-	    TclInitStringRep(objPtr, buf, 0);
-	}
 	return 0;
     }
 
     if (objPtr->bytes == tclEmptyStringRep) {
-	TclInvalidateStringRep(objPtr);
-	/*stringPtr->allocated = 0;*/
-    }
-    if (objPtr->bytes) {
+	objPtr->bytes = NULL;
+    } else if (objPtr->bytes) {
 	size = objPtr->length;
     } else {
 	objPtr->length = 0;
@@ -2902,6 +2923,7 @@ ExtendStringRepWithUnicode(
 
     /* Grow space if needed */
     if (size > stringPtr->allocated) {
+	/* TODO: Growth algorithm for appends ? */
 	objPtr->bytes = ckrealloc(objPtr->bytes, (unsigned) size+1);
 	stringPtr->allocated = size;
     }
author	dgp <dgp@users.sourceforge.net>	2009-02-14 23:07:17 (GMT)
committer	dgp <dgp@users.sourceforge.net>	2009-02-14 23:07:17 (GMT)
commit	9325ac89905cf6cadf12491e007de22301362651 (patch)
tree	0444ca159ecb1aa9c156bec35a4b82953ce34b43 /generic/tclStringObj.c
parent	34bab026d12e407d9bf7e5d9eca2012f2e9a1f97 (diff)
download	tcl-9325ac89905cf6cadf12491e007de22301362651.zip tcl-9325ac89905cf6cadf12491e007de22301362651.tar.gz tcl-9325ac89905cf6cadf12491e007de22301362651.tar.bz2