diff options
Diffstat (limited to 'generic/tclStringObj.c')
-rw-r--r-- | generic/tclStringObj.c | 140 |
1 files changed, 73 insertions, 67 deletions
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 09cfc4c..e00b02a 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1,29 +1,31 @@ /* * tclStringObj.c -- * - * This file contains procedures that implement string operations - * on Tcl objects. To do this efficiently (i.e. to allow many - * appends to be done to an object without constantly reallocating - * the space for the string representation) we overallocate the - * space for the string and use the internal representation to keep - * track of the extra space. Objects with this internal - * representation are called "expandable string objects". - * - * Since some string operations work with UTF strings and others require Unicode - format, the string obeject type stores one or both formats. If the object is - created with a Unicode string, then UTF form is not stored until it is - required by a string operation. The string object always stores the number of - characters, so if the object is created with a UTF string, we automatically - convert it to unicode (as this costs little more than - -A Unicode string - * is an internationalized string. Conceptually, a Unicode string is an - * array of 16-bit quantities organized as a sequence of properly formed - * UTF-8 characters. There is a one-to-one map between Unicode and UTF - * characters. The Unicode ojbect is opitmized for the case where each UTF - * char in a string is only one byte. In this case, we store the value of - * numChars, but we don't copy the bytes to the unicodeObj->chars. Before - * accessing obj->chars, check if unicodeObj->numChars == obj->length. + * This file contains procedures that implement string operations on Tcl + * objects. Some string operations work with UTF strings and others + * require Unicode format. Functions that require knowledge of the width + * of each character, such as indexing, operate on Unicode data. + * + * A Unicode string is an internationalized string. Conceptually, a + * Unicode string is an array of 16-bit quantities organized as a sequence + * of properly formed UTF-8 characters. There is a one-to-one map between + * Unicode and UTF characters. Because Unicode characters have a fixed + * width, operations such as indexing operate on Unicode data. The String + * ojbect is opitmized for the case where each UTF char in a string is + * only one byte. In this case, we store the value of numChars, but we + * don't store the Unicode data (unless Tcl_GetUnicode is explicitly + * called). + * + * The String object type stores one or both formats. The default + * behavior is to store UTF. Once Unicode is calculated by a function, it + * is stored in the internal rep for future access (without an additional + * O(n) cost). + * + * To allow many appends to be done to an object without constantly + * reallocating the space for the string or Unicode representation, we + * allocate double the space for the string or Unicode and use the + * internal representation to keep track of how much space is used + * vs. allocated. * * Copyright (c) 1995-1997 Sun Microsystems, Inc. * Copyright (c) 1999 by Scriptics Corporation. @@ -31,8 +33,7 @@ A Unicode string * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclStringObj.c,v 1.9 1999/06/15 03:14:44 hershey Exp $ - */ + * RCS: @(#) $Id: tclStringObj.c,v 1.10 1999/06/15 22:06:17 hershey Exp $ */ #include "tclInt.h" @@ -80,7 +81,7 @@ Tcl_ObjType tclStringType = { * shrinking of the UTF and Unicode reps of the String object with fewer * mallocs. To optimize string length and indexing operations, this * structure also stores the number of characters (same of UTF and Unicode!) - * once that value has been computede. + * once that value has been computed. */ typedef struct String { @@ -321,12 +322,6 @@ Tcl_GetCharLength(objPtr) SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); -/* if (objPtr->bytes == NULL) { */ -/* printf("called Tcl_GetCharLength with unicode str.\n"); */ -/* } else { */ -/* printf("called Tcl_GetCharLength with str = %s\n", objPtr->bytes); */ -/* } */ - /* * If numChars is unknown, then calculate the number of characaters * while populating the Unicode string. @@ -395,12 +390,6 @@ Tcl_GetUniChar(objPtr, index) SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); -/* if (objPtr->bytes == NULL) { */ -/* printf("called Tcl_GetUniChar with unicode str.\n"); */ -/* } else { */ -/* printf("called Tcl_GetUniChar with str = %s\n", objPtr->bytes); */ -/* } */ - if (stringPtr->numChars == -1) { /* @@ -419,7 +408,6 @@ Tcl_GetUniChar(objPtr, index) stringPtr = GET_STRING(objPtr); } if (stringPtr->uallocated == 0) { - char *bytes; /* * All of the characters in the Utf string are 1 byte chars, @@ -427,8 +415,7 @@ Tcl_GetUniChar(objPtr, index) * and convert the index'th byte to a Unicode character. */ - bytes = Tcl_GetString(objPtr); - Tcl_UtfToUniChar(&bytes[index], &unichar); + Tcl_UtfToUniChar(&objPtr->bytes[index], &unichar); } else { unichar = stringPtr->unicode[index]; } @@ -463,12 +450,6 @@ Tcl_GetUnicode(objPtr) SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); -/* if (objPtr->bytes == NULL) { */ -/* printf("called Tcl_GetUnicode with unicode str.\n"); */ -/* } else { */ -/* printf("called Tcl_GetUnicode with str = %s\n", objPtr->bytes); */ -/* } */ - if ((stringPtr->numChars == -1) || (stringPtr->uallocated == 0)) { /* @@ -557,8 +538,9 @@ Tcl_GetRange(objPtr, first, last) * can set it's numChars field. */ -/* stringPtr = GET_STRING(newObjPtr); */ -/* stringPtr->numChars = last-first+1; */ + SetStringFromAny(NULL, newObjPtr); + stringPtr = GET_STRING(newObjPtr); + stringPtr->numChars = last-first+1; } else { newObjPtr = Tcl_NewUnicodeObj(stringPtr->unicode + first, last-first+1); @@ -622,7 +604,6 @@ Tcl_SetStringObj(objPtr, bytes, length) length = (bytes? strlen(bytes) : 0); } TclInitStringRep(objPtr, bytes, length); -/* printf("called Tcl_SetStringObj with str = %s\n", objPtr->bytes); */ } /* @@ -684,7 +665,6 @@ Tcl_SetObjLength(objPtr, length) if (objPtr->bytes != NULL) { memcpy((VOID *) new, (VOID *) objPtr->bytes, (size_t) objPtr->length); -/* new[objPtr->length] = 0; */ Tcl_InvalidateStringRep(objPtr); } objPtr->bytes = new; @@ -891,7 +871,7 @@ Tcl_AppendObjToObj(objPtr, appendObjPtr) Tcl_Obj *appendObjPtr; /* Object to append. */ { String *stringPtr; - int length; + int length, numChars, allOneByteChars; char *bytes; SetStringFromAny(NULL, objPtr); @@ -931,11 +911,28 @@ Tcl_AppendObjToObj(objPtr, appendObjPtr) } /* - * Append to objPtr's UTF string rep. + * Append to objPtr's UTF string rep. If we know the number of + * characters in both objects before appending, then set the combined + * number of characters in the final (appended-to) object. */ + allOneByteChars = 0; + numChars = stringPtr->numChars; + if ((numChars >= 0) && (appendObjPtr->typePtr == &tclStringType)) { + stringPtr = GET_STRING(appendObjPtr); + if (stringPtr->numChars >= 0) { + numChars += stringPtr->numChars; + allOneByteChars = 1; + } + } + bytes = Tcl_GetStringFromObj(appendObjPtr, &length); AppendUtfToUtfRep(objPtr, bytes, length); + + if (allOneByteChars) { + stringPtr = GET_STRING(objPtr); + stringPtr->numChars = numChars; + } } /* @@ -973,7 +970,10 @@ AppendUnicodeToUnicodeRep(objPtr, unicode, appendNumChars) stringPtr = GET_STRING(objPtr); /* - * Make the buffer big enough for the result. + * If not enough space has been allocated for the unicode rep, + * reallocate the internal rep object with double the amount of + * space needed, so the unicode string can grow without being + * reallocated. */ numChars = stringPtr->numChars + appendNumChars; @@ -1124,14 +1124,12 @@ AppendUtfToUtfRep(objPtr, bytes, numBytes) /* * There isn't currently enough space in the string - * representation so allocate additional space. If the current - * string representation isn't empty (i.e. it looks like we're - * doing a series of appends) then overallocate the space so - * that we won't have to do as much reallocation in the future. + * representation so allocate additional space. Overallocate the + * space by doubling it so that we won't have to do as much + * reallocation in the future. */ - Tcl_SetObjLength(objPtr, - (oldLength == 0) ? newLength : 2*newLength); + Tcl_SetObjLength(objPtr, 2*newLength); } else { /* @@ -1313,13 +1311,21 @@ FillUnicodeRep(objPtr) if (uallocated > stringPtr->uallocated) { /* - * If not enought space has been allocated for the unicode rep, - * reallocate the internal rep object with double the amount of - * space needed, so the unicode string can grow without being - * reallocated. + * If not enough space has been allocated for the unicode rep, + * reallocate the internal rep object. + */ + + /* + * There isn't currently enough space in the Unicode + * representation so allocate additional space. If the current + * Unicode representation isn't empty (i.e. it looks like we've + * done some appends) then overallocate the space so + * that we won't have to do as much reallocation in the future. */ - uallocated *= 2; + if (stringPtr->uallocated > 0) { + uallocated *= 2; + } stringPtr = (String *) ckrealloc((char*) stringPtr, STRING_SIZE(uallocated)); stringPtr->uallocated = uallocated; @@ -1359,8 +1365,7 @@ FillUnicodeRep(objPtr) static void DupStringInternalRep(srcPtr, copyPtr) register Tcl_Obj *srcPtr; /* Object with internal rep to copy. Must - * have an internal representation of type - * "expandable string". */ + * have an internal rep of type "String". */ register Tcl_Obj *copyPtr; /* Object with internal rep to set. Must * not currently have an internal rep.*/ { @@ -1388,6 +1393,7 @@ DupStringInternalRep(srcPtr, copyPtr) copyStringPtr->unicode[srcStringPtr->numChars] = 0; } copyStringPtr->numChars = srcStringPtr->numChars; + copyStringPtr->allocated = srcStringPtr->allocated; /* * Tricky point: the string value was copied by generic object |