diff options
author | hershey <hershey> | 1999-06-15 01:16:21 (GMT) |
---|---|---|
committer | hershey <hershey> | 1999-06-15 01:16:21 (GMT) |
commit | 73d440a8ed4e3ef4fd1c30ce5708061a261396dc (patch) | |
tree | dc505f5e222817db5fe70f5b9eb5d01607656d12 /generic/tclStringObj.c | |
parent | 9710a67498395708dbbe9b7e401cec3c6227b394 (diff) | |
download | tcl-73d440a8ed4e3ef4fd1c30ce5708061a261396dc.zip tcl-73d440a8ed4e3ef4fd1c30ce5708061a261396dc.tar.gz tcl-73d440a8ed4e3ef4fd1c30ce5708061a261396dc.tar.bz2 |
Merged String and Unicode object types. Added new functions to
the public API: Tcl_NewUnicodeObj, Tcl_SetUnicodeObj,
Tcl_GetUnicode, Tcl_GetUniChar, Tcl_GetCharLength, Tcl_GetRange,
Tcl_AppendUnicodeToObj.
Note: some stringObj tests are still failing--the teststringobj command
still needs to be updated.
Diffstat (limited to 'generic/tclStringObj.c')
-rw-r--r-- | generic/tclStringObj.c | 1077 |
1 files changed, 1002 insertions, 75 deletions
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index c70bcb9..8dc6e90 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -9,13 +9,29 @@ * track of the extra space. Objects with this internal * representation are called "expandable string objects". * + * Since some string operations work with UTF strings and others require Unicode + format, the string obeject type stores one or both formats. If the object is + created with a Unicode string, then UTF form is not stored until it is + required by a string operation. The string object always stores the number of + characters, so if the object is created with a UTF string, we automatically + convert it to unicode (as this costs little more than + +A Unicode string + * is an internationalized string. Conceptually, a Unicode string is an + * array of 16-bit quantities organized as a sequence of properly formed + * UTF-8 characters. There is a one-to-one map between Unicode and UTF + * characters. The Unicode ojbect is opitmized for the case where each UTF + * char in a string is only one byte. In this case, we store the value of + * numChars, but we don't copy the bytes to the unicodeObj->chars. Before + * accessing obj->chars, check if unicodeObj->numChars == obj->length. + * * Copyright (c) 1995-1997 Sun Microsystems, Inc. * Copyright (c) 1999 by Scriptics Corporation. * * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. 
* - * RCS: @(#) $Id: tclStringObj.c,v 1.7 1999/06/08 02:59:25 hershey Exp $ + * RCS: @(#) $Id: tclStringObj.c,v 1.8 1999/06/15 01:16:25 hershey Exp $ */ #include "tclInt.h" @@ -24,7 +40,20 @@ * Prototypes for procedures defined later in this file: */ -static void ConvertToStringType _ANSI_ARGS_((Tcl_Obj *objPtr)); +static void AppendUnicodeToUnicodeRep _ANSI_ARGS_(( + Tcl_Obj *objPtr, Tcl_UniChar *unicode, + int appendNumChars)); +static void AppendUnicodeToUtfRep _ANSI_ARGS_(( + Tcl_Obj *objPtr, Tcl_UniChar *unicode, + int numChars)); +static void AppendUtfToUnicodeRep _ANSI_ARGS_((Tcl_Obj *objPtr, + char *bytes, int numBytes)); +static void AppendUtfToUtfRep _ANSI_ARGS_((Tcl_Obj *objPtr, + char *bytes, int numBytes)); + +static void FillUnicodeRep _ANSI_ARGS_((Tcl_Obj *objPtr)); + +static void FreeStringInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr)); static void DupStringInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr, Tcl_Obj *copyPtr)); static int SetStringFromAny _ANSI_ARGS_((Tcl_Interp *interp, @@ -38,11 +67,46 @@ static void UpdateStringOfString _ANSI_ARGS_((Tcl_Obj *objPtr)); Tcl_ObjType tclStringType = { "string", /* name */ - (Tcl_FreeInternalRepProc *) NULL, /* freeIntRepProc */ + FreeStringInternalRep, /* freeIntRepPro */ DupStringInternalRep, /* dupIntRepProc */ UpdateStringOfString, /* updateStringProc */ SetStringFromAny /* setFromAnyProc */ }; + +/* + * The following structure is the internal rep for a String object. + * It keeps track of how much memory has been used and how much has been + * allocated for the Unicode and UTF string to enable growing and + * shrinking of the UTF and Unicode reps of the String object with fewer + * mallocs. To optimize string length and indexing operations, this + * structure also stores the number of characters (same of UTF and Unicode!) + * once that value has been computede. + */ + +typedef struct String { + int numChars; /* The number of chars in the string. + * -1 means this value has not been + * calculated. 
>= 0 means that there is a + * valid Unicode rep, or that the number + * of UTF bytes == the number of chars. */ + size_t allocated; /* The amount of space actually allocated + * for the UTF string (minus 1 byte for + * the termination char). */ + size_t uallocated; /* The amount of space actually allocated + * for the Unicode string. 0 means the + * Unicode string rep is invalid. */ + Tcl_UniChar unicode[2]; /* The array of Unicode chars. The actual + * size of this field depends on the + * 'uallocated' field above. */ +} String; + +#define STRING_SIZE(len) \ + ((unsigned) (sizeof(String) + ((len-1) * sizeof(Tcl_UniChar)))) +#define GET_STRING(objPtr) \ + ((String *) (objPtr)->internalRep.otherValuePtr) +#define SET_STRING(objPtr, stringPtr) \ + (objPtr)->internalRep.otherValuePtr = (VOID *) (stringPtr) + /* *---------------------------------------------------------------------- @@ -182,6 +246,327 @@ Tcl_DbNewStringObj(bytes, length, file, line) #endif /* TCL_MEM_DEBUG */ /* + *--------------------------------------------------------------------------- + * + * TclNewUnicodeObj -- + * + * This procedure is creates a new String object and initializes + * it from the given Utf String. If the Utf String is the same size + * as the Unicode string, don't duplicate the data. + * + * Results: + * The newly created object is returned. This object will have no + * initial string representation. The returned object has a ref count + * of 0. + * + * Side effects: + * Memory allocated for new object and copy of Unicode argument. + * + *--------------------------------------------------------------------------- + */ + +Tcl_Obj * +Tcl_NewUnicodeObj(unicode, numChars) + Tcl_UniChar *unicode; /* The unicode string used to initialize + * the new object. */ + int numChars; /* Number of characters in the unicode + * string. */ +{ + Tcl_Obj *objPtr; + String *stringPtr; + int uallocated = (numChars + 1) * sizeof(Tcl_UniChar); + + /* + * Create a new obj with an invalid string rep. 
+ */ + + TclNewObj(objPtr); + Tcl_InvalidateStringRep(objPtr); + objPtr->typePtr = &tclStringType; + + stringPtr = (String *) ckalloc(STRING_SIZE(uallocated)); + stringPtr->numChars = numChars; + stringPtr->uallocated = uallocated; + stringPtr->allocated = 0; + memcpy((VOID *) stringPtr->unicode, (VOID *) unicode, + (size_t) (numChars * sizeof(Tcl_UniChar))); + stringPtr->unicode[numChars] = 0; + SET_STRING(objPtr, stringPtr); + return objPtr; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_GetCharLength -- + * + * Get the length of the Unicode string from the Tcl object. + * + * Results: + * Pointer to unicode string representing the unicode object. + * + * Side effects: + * Frees old internal rep. Allocates memory for new "String" + * internal rep. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_GetCharLength(objPtr) + Tcl_Obj *objPtr; /* The String object to get the num chars of. */ +{ + String *stringPtr; + + SetStringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + +/* if (objPtr->bytes == NULL) { */ +/* printf("called Tcl_GetCharLength with unicode str.\n"); */ +/* } else { */ +/* printf("called Tcl_GetCharLength with str = %s\n", objPtr->bytes); */ +/* } */ + + /* + * If numChars is unknown, then calculate the number of characaters + * while populating the Unicode string. + */ + + if (stringPtr->numChars == -1) { + + stringPtr->numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); + + if (stringPtr->numChars == objPtr->length) { + + /* + * Since we've just calucalated the number of chars, and all + * UTF chars are 1-byte long, we don't need to store the + * unicode string. + */ + + stringPtr->uallocated = 0; + + } else { + + /* + * Since we've just calucalated the number of chars, and not + * all UTF chars are 1-byte long, go ahead and populate the + * unicode string. 
+ */ + + FillUnicodeRep(objPtr); + + /* + * We need to fetch the pointer again because we have just + * reallocated the structure to make room for the Unicode data. + */ + + stringPtr = GET_STRING(objPtr); + } + } + return stringPtr->numChars; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_GetUniChar -- + * + * Get the index'th Unicode character from the String object. The + * index is assumed to be in the appropriate range. + * + * Results: + * Returns the index'th Unicode character in the Object. + * + * Side effects: + * Fills unichar with the index'th Unicode character. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar +Tcl_GetUniChar(objPtr, index) + Tcl_Obj *objPtr; /* The object to get the Unicode charater from. */ + int index; /* Get the index'th Unicode character. */ +{ + Tcl_UniChar unichar; + String *stringPtr; + + SetStringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + +/* if (objPtr->bytes == NULL) { */ +/* printf("called Tcl_GetUniChar with unicode str.\n"); */ +/* } else { */ +/* printf("called Tcl_GetUniChar with str = %s\n", objPtr->bytes); */ +/* } */ + + if (stringPtr->numChars == -1) { + + /* + * We haven't yet calculated the length, so we don't have the + * Unicode str. We need to know the number of chars before we + * can do indexing. + */ + + Tcl_GetCharLength(objPtr); + + /* + * We need to fetch the pointer again because we may have just + * reallocated the structure. + */ + + stringPtr = GET_STRING(objPtr); + } + if (stringPtr->uallocated == 0) { + char *bytes; + + /* + * All of the characters in the Utf string are 1 byte chars, + * so we don't store the unicode char. We get the Utf string + * and convert the index'th byte to a Unicode character. 
+ */ + + bytes = Tcl_GetString(objPtr); + Tcl_UtfToUniChar(&bytes[index], &unichar); + } else { + unichar = stringPtr->unicode[index]; + } + return unichar; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_GetUnicode -- + * + * Get the index'th Unicode character from the String object. If + * the object is not already a String object, it will be converted + * to one. If the String object does not have a Unicode rep, then + * one is create from the UTF string format. + * + * Results: + * Returns a pointer to the object's internal Unicode string. + * + * Side effects: + * Converts the object to have the String internal rep. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar * +Tcl_GetUnicode(objPtr) + Tcl_Obj *objPtr; /* The object to find the unicode string for. */ +{ + String *stringPtr; + + SetStringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + +/* if (objPtr->bytes == NULL) { */ +/* printf("called Tcl_GetUnicode with unicode str.\n"); */ +/* } else { */ +/* printf("called Tcl_GetUnicode with str = %s\n", objPtr->bytes); */ +/* } */ + + if ((stringPtr->numChars == -1) || (stringPtr->uallocated == 0)) { + + /* + * We haven't yet calculated the length, or all of the characters + * in the Utf string are 1 byte chars (so we didn't store the + * unicode str). Since this function must return a unicode string, + * and one has not yet been stored, force the Unicode to be + * calculated and stored now. + */ + + FillUnicodeRep(objPtr); + + /* + * We need to fetch the pointer again because we have just + * reallocated the structure to make room for the Unicode data. + */ + + stringPtr = GET_STRING(objPtr); + } + return stringPtr->unicode; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_GetRange -- + * + * Create a Tcl Object that contains the chars between first and last + * of the object indicated by "objPtr". 
If the object is not already + * a String object, convert it to one. The first and last indices + * are assumed to be in the appropriate range. + * + * Results: + * Returns a new Tcl Object of the String type. + * + * Side effects: + * Changes the internal rep of "objPtr" to the String type. + * + *---------------------------------------------------------------------- + */ + +Tcl_Obj* +Tcl_GetRange(objPtr, first, last) + + Tcl_Obj *objPtr; /* The Tcl object to find the range of. */ + int first; /* First index of the range. */ + int last; /* Last index of the range. */ +{ + Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */ + String *stringPtr; + + SetStringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + + if (stringPtr->numChars == -1) { + + /* + * We haven't yet calculated the length, so we don't have the + * Unicode str. We need to know the number of chars before we + * can do indexing. + */ + + Tcl_GetCharLength(objPtr); + + /* + * We need to fetch the pointer again because we may have just + * reallocated the structure. + */ + + stringPtr = GET_STRING(objPtr); + } + + if (stringPtr->numChars == objPtr->length) { + char *str = Tcl_GetString(objPtr); + + /* + * All of the characters in the Utf string are 1 byte chars, + * so we don't store the unicode char. Create a new string + * object containing the specified range of chars. + */ + + newObjPtr = Tcl_NewStringObj(&str[first], last-first+1); + + /* + * Since we know the new string only has 1-byte chars, we + * can set it's numChars field. + */ + +/* stringPtr = GET_STRING(newObjPtr); */ +/* stringPtr->numChars = last-first+1; */ + } else { + newObjPtr = Tcl_NewUnicodeObj(stringPtr->unicode + first, + last-first+1); + } + return newObjPtr; +} + +/* *---------------------------------------------------------------------- * * Tcl_SetStringObj -- @@ -237,6 +622,7 @@ Tcl_SetStringObj(objPtr, bytes, length) length = (bytes? 
strlen(bytes) : 0); } TclInitStringRep(objPtr, bytes, length); +/* printf("called Tcl_SetStringObj with str = %s\n", objPtr->bytes); */ } /* @@ -272,15 +658,23 @@ Tcl_SetObjLength(objPtr, length) * terminating null byte. */ { char *new; + String *stringPtr; if (Tcl_IsShared(objPtr)) { panic("Tcl_SetObjLength called with shared object"); } - if (objPtr->typePtr != &tclStringType) { - ConvertToStringType(objPtr); - } - - if ((long)length > objPtr->internalRep.longValue) { + SetStringFromAny(NULL, objPtr); + + /* + * Invalidate the unicode data. + */ + + stringPtr = GET_STRING(objPtr); + stringPtr->numChars = -1; + stringPtr->uallocated = 0; + + if (length > stringPtr->allocated) { + /* * Not enough space in current string. Reallocate the string * space and free the old string. @@ -290,11 +684,13 @@ Tcl_SetObjLength(objPtr, length) if (objPtr->bytes != NULL) { memcpy((VOID *) new, (VOID *) objPtr->bytes, (size_t) objPtr->length); +/* new[objPtr->length] = 0; */ Tcl_InvalidateStringRep(objPtr); } objPtr->bytes = new; - objPtr->internalRep.longValue = (long) length; + stringPtr->allocated = length; } + objPtr->length = length; if ((objPtr->bytes != NULL) && (objPtr->bytes != tclEmptyStringRep)) { objPtr->bytes[length] = 0; @@ -302,6 +698,60 @@ Tcl_SetObjLength(objPtr, length) } /* + *--------------------------------------------------------------------------- + * + * TclSetUnicodeObj -- + * + * Modify an object to hold the Unicode string indicated by "unicode". + * + * Results: + * None. + * + * Side effects: + * Memory allocated for new "String" internal rep. + * + *--------------------------------------------------------------------------- + */ + +void +Tcl_SetUnicodeObj(objPtr, unicode, numChars) + Tcl_Obj *objPtr; /* The object to set the string of. */ + Tcl_UniChar *unicode; /* The unicode string used to initialize + * the object. */ + int numChars; /* Number of characters in the unicode + * string. 
*/ +{ + Tcl_ObjType *typePtr; + String *stringPtr; + size_t uallocated = (numChars + 1) * sizeof(Tcl_UniChar); + + /* + * Free the internal rep if one exists, and invalidate the string rep. + */ + + typePtr = objPtr->typePtr; + if ((typePtr != NULL) && (typePtr->freeIntRepProc) != NULL) { + (*typePtr->freeIntRepProc)(objPtr); + } + objPtr->typePtr = &tclStringType; + + /* + * Allocate enough space for the String structure + Unicode string. + */ + + stringPtr = (String *) ckalloc(STRING_SIZE(uallocated)); + stringPtr->numChars = numChars; + stringPtr->uallocated = uallocated; + stringPtr->allocated = 0; + memcpy((VOID *) stringPtr->unicode, (VOID *) unicode, + (size_t) (numChars * sizeof(Tcl_UniChar))); + stringPtr->unicode[numChars] = 0; + SET_STRING(objPtr, stringPtr); + Tcl_InvalidateStringRep(objPtr); + return; +} + +/* *---------------------------------------------------------------------- * * Tcl_AppendToObj -- @@ -327,37 +777,106 @@ Tcl_AppendToObj(objPtr, bytes, length) * "bytes". If < 0, then append all bytes * up to NULL byte. */ { - int newLength, oldLength; + String *stringPtr; if (Tcl_IsShared(objPtr)) { panic("Tcl_AppendToObj called with shared object"); } - if (objPtr->typePtr != &tclStringType) { - ConvertToStringType(objPtr); - } + + SetStringFromAny(NULL, objPtr); + if (length < 0) { - length = (bytes? strlen(bytes) : 0); + length = (bytes ? strlen(bytes) : 0); } if (length == 0) { return; } - oldLength = objPtr->length; - newLength = length + oldLength; - if ((long)newLength > objPtr->internalRep.longValue) { - /* - * There isn't currently enough space in the string - * representation so allocate additional space. In fact, - * overallocate so that there is room for future growth without - * having to reallocate again. - */ - Tcl_SetObjLength(objPtr, 2*newLength); + /* + * TEMPORARY!!! This is terribly inefficient, but it works, and Don + * needs for me to check this stuff in ASAP. 
-Melissa + */ + +/* printf("called Tcl_AppendToObj with str = %s\n", bytes); */ + UpdateStringOfString(objPtr); + AppendUtfToUtfRep(objPtr, bytes, length); + return; + + /* + * If objPtr has a valid Unicode rep, then append the Unicode + * conversion of "bytes" to the objPtr's Unicode rep, otherwise + * append "bytes" to objPtr's string rep. + */ + + stringPtr = GET_STRING(objPtr); + if (stringPtr->allocated > 0) { + AppendUtfToUnicodeRep(objPtr, bytes, length); + + stringPtr = GET_STRING(objPtr); +/* printf(" ended Tcl_AppendToObj with %d unicode chars.\n", */ +/* stringPtr->numChars); */ + } else { + AppendUtfToUtfRep(objPtr, bytes, length); +/* printf(" ended Tcl_AppendToObj with str = %s\n", objPtr->bytes); */ } - if (length > 0) { - memcpy((VOID *) (objPtr->bytes + oldLength), (VOID *) bytes, - (size_t) length); - objPtr->length = newLength; - objPtr->bytes[objPtr->length] = 0; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_AppendUnicodeToObj -- + * + * This procedure appends a Unicode string to an object in the + * most efficient manner possible. Length must be >= 0. + * + * Results: + * None. + * + * Side effects: + * Invalidates the string rep and creates a new Unicode string. + * + *---------------------------------------------------------------------- + */ + +void +Tcl_AppendUnicodeToObj(objPtr, unicode, length) + register Tcl_Obj *objPtr; /* Points to the object to append to. */ + Tcl_UniChar *unicode; /* The unicode string to append to the + * object. */ + int length; /* Number of chars in "unicode". */ +{ + String *stringPtr; + + if (Tcl_IsShared(objPtr)) { + panic("Tcl_AppendUnicodeToObj called with shared object"); + } + + if (length == 0) { + return; + } + + SetStringFromAny(NULL, objPtr); + + /* + * TEMPORARY!!! This is terribly inefficient, but it works, and Don + * needs for me to check this stuff in ASAP. 
-Melissa + */ + + UpdateStringOfString(objPtr); + AppendUnicodeToUtfRep(objPtr, unicode, length); + return; + + /* + * If objPtr has a valid Unicode rep, then append the "unicode" + * to the objPtr's Unicode rep, otherwise the UTF conversion of + * "unicode" to objPtr's string rep. + */ + + stringPtr = GET_STRING(objPtr); + if (stringPtr->allocated > 0) { + AppendUnicodeToUnicodeRep(objPtr, unicode, length); + } else { + AppendUnicodeToUtfRep(objPtr, unicode, length); } } @@ -367,6 +886,7 @@ Tcl_AppendToObj(objPtr, bytes, length) * Tcl_AppendObjToObj -- * * This procedure appends the string rep of one object to another. + * "objPtr" cannot be a shared object. * * Results: * None. @@ -383,7 +903,273 @@ Tcl_AppendObjToObj(objPtr, appendObjPtr) Tcl_Obj *objPtr; /* Points to the object to append to. */ Tcl_Obj *appendObjPtr; /* Object to append. */ { - TclAppendObjToUnicodeObj(objPtr, appendObjPtr); + String *stringPtr; + int length; + char *bytes; + + SetStringFromAny(NULL, objPtr); + + /* + * TEMPORARY!!! This is terribly inefficient, but it works, and Don + * needs for me to check this stuff in ASAP. -Melissa + */ + + UpdateStringOfString(objPtr); + bytes = Tcl_GetStringFromObj(appendObjPtr, &length); + AppendUtfToUtfRep(objPtr, bytes, length); + return; + + /* + * If objPtr has a valid Unicode rep, then get a Unicode string + * from appendObjPtr and append it. + */ + + stringPtr = GET_STRING(objPtr); + if (stringPtr->allocated > 0) { + + /* + * If appendObjPtr is not of the "String" type, don't convert it. + */ + + if (appendObjPtr->typePtr == &tclStringType) { + stringPtr = GET_STRING(appendObjPtr); + if ((stringPtr->numChars == -1) + || (stringPtr->uallocated == 0)) { + + /* + * If appendObjPtr is a string obj with no valide Unicode + * rep, then fill its unicode rep. 
+ */ + + FillUnicodeRep(appendObjPtr); + stringPtr = GET_STRING(appendObjPtr); + } + AppendUnicodeToUnicodeRep(objPtr, stringPtr->unicode, + stringPtr->numChars); + } else { + bytes = Tcl_GetStringFromObj(appendObjPtr, &length); + AppendUtfToUnicodeRep(objPtr, bytes, length); + } + return; + } + + /* + * Append to objPtr's UTF string rep. + */ + + bytes = Tcl_GetStringFromObj(appendObjPtr, &length); + AppendUtfToUtfRep(objPtr, bytes, length); +} + +/* + *---------------------------------------------------------------------- + * + * AppendUnicodeToUnicodeRep -- + * + * This procedure appends the contents of "unicode" to the Unicode + * rep of "objPtr". objPtr must already have a valid Unicode rep. + * + * Results: + * None. + * + * Side effects: + * objPtr's internal rep is reallocated. + * + *---------------------------------------------------------------------- + */ + +static void +AppendUnicodeToUnicodeRep(objPtr, unicode, appendNumChars) + Tcl_Obj *objPtr; /* Points to the object to append to. */ + Tcl_UniChar *unicode; /* String to append. */ + int appendNumChars; /* Number of chars of "unicode" to append. */ +{ + String *stringPtr; + int numChars; + size_t newSize; + + if (appendNumChars == 0) { + return; + } + + SetStringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + + /* + * Make the buffer big enough for the result. + */ + + numChars = stringPtr->numChars + appendNumChars; + newSize = (numChars + 1) * sizeof(Tcl_UniChar); + + if (newSize > stringPtr->uallocated) { + stringPtr->uallocated = newSize * 2; + stringPtr = (String *) ckrealloc((char*)stringPtr, + STRING_SIZE(stringPtr->uallocated)); + SET_STRING(objPtr, stringPtr); + } + + /* + * Copy the new string onto the end of the old string, then add the + * trailing null. 
+ */ + + memcpy((VOID*) (stringPtr->unicode + stringPtr->numChars), unicode, + appendNumChars * sizeof(Tcl_UniChar)); + stringPtr->unicode[numChars] = 0; + stringPtr->numChars = numChars; + + SET_STRING(objPtr, stringPtr); + Tcl_InvalidateStringRep(objPtr); +} + +/* + *---------------------------------------------------------------------- + * + * AppendUnicodeToUtfRep -- + * + * This procedure converts the contents of "unicode" to UTF and + * appends the UTF to the string rep of "objPtr". + * + * Results: + * None. + * + * Side effects: + * objPtr's internal rep is reallocated. + * + *---------------------------------------------------------------------- + */ + +static void +AppendUnicodeToUtfRep(objPtr, unicode, numChars) + Tcl_Obj *objPtr; /* Points to the object to append to. */ + Tcl_UniChar *unicode; /* String to convert to UTF. */ + int numChars; /* Number of chars of "unicode" to convert. */ +{ + Tcl_DString dsPtr; + int length = numChars * sizeof(Tcl_UniChar); + char *bytes; + + if (numChars == 0) { + return; + } + + Tcl_DStringInit(&dsPtr); + bytes = (char *)Tcl_UniCharToUtfDString(unicode, numChars, &dsPtr); + AppendUtfToUtfRep(objPtr, bytes, Tcl_DStringLength(&dsPtr)); + Tcl_DStringFree(&dsPtr); +} + +/* + *---------------------------------------------------------------------- + * + * AppendUtfToUnicodeRep -- + * + * This procedure converts the contents of "bytes" to Unicode and + * appends the Unicode to the Unicode rep of "objPtr". objPtr must + * already have a valid Unicode rep. + * + * Results: + * None. + * + * Side effects: + * objPtr's internal rep is reallocated. + * + *---------------------------------------------------------------------- + */ + +static void +AppendUtfToUnicodeRep(objPtr, bytes, numBytes) + Tcl_Obj *objPtr; /* Points to the object to append to. */ + char *bytes; /* String to convert to Unicode. */ + int numBytes; /* Number of bytes of "bytes" to convert. 
*/ +{ + Tcl_DString dsPtr; + int numChars; + Tcl_UniChar *unicode; + + if (numBytes < 0) { + numBytes = (bytes ? strlen(bytes) : 0); + } + if (numBytes == 0) { + return; + } + + Tcl_DStringInit(&dsPtr); + numChars = Tcl_NumUtfChars(bytes, numBytes); + unicode = (Tcl_UniChar *)Tcl_UtfToUniCharDString(bytes, numBytes, &dsPtr); + AppendUnicodeToUnicodeRep(objPtr, unicode, numChars); + Tcl_DStringFree(&dsPtr); +} + +/* + *---------------------------------------------------------------------- + * + * AppendUtfToUtfRep -- + * + * This procedure appends "numBytes" bytes of "bytes" to the UTF string + * rep of "objPtr". objPtr must already have a valid String rep. + * + * Results: + * None. + * + * Side effects: + * objPtr's internal rep is reallocated. + * + *---------------------------------------------------------------------- + */ + +static void +AppendUtfToUtfRep(objPtr, bytes, numBytes) + Tcl_Obj *objPtr; /* Points to the object to append to. */ + char *bytes; /* String to append. */ + int numBytes; /* Number of bytes of "bytes" to append. */ +{ + String *stringPtr; + int newLength, oldLength; + + if (numBytes < 0) { + numBytes = (bytes ? strlen(bytes) : 0); + } + if (numBytes == 0) { + return; + } + + /* + * Copy the new string onto the end of the old string, then add the + * trailing null. + */ + + oldLength = objPtr->length; + newLength = numBytes + oldLength; + + stringPtr = GET_STRING(objPtr); + if (newLength > stringPtr->allocated) { + + /* + * There isn't currently enough space in the string + * representation so allocate additional space. If the current + * string representation isn't empty (i.e. it looks like we're + * doing a series of appends) then overallocate the space so + * that we won't have to do as much reallocation in the future. + */ + + Tcl_SetObjLength(objPtr, + (oldLength == 0) ? newLength : 2*newLength); + } else { + + /* + * Invalidate the unicode data. 
+ */ + + stringPtr->numChars = -1; + stringPtr->uallocated = 0; + } + + memcpy((VOID *) (objPtr->bytes + oldLength), (VOID *) bytes, + (size_t) numBytes); + objPtr->bytes[newLength] = 0; + objPtr->length = newLength; } /* @@ -409,6 +1195,7 @@ Tcl_AppendStringsToObjVA (objPtr, argList) Tcl_Obj *objPtr; /* Points to the object to append to. */ va_list argList; /* Variable argument list. */ { + String *stringPtr; va_list tmpArgList; int newLength, oldLength; register char *string, *dst; @@ -416,9 +1203,8 @@ Tcl_AppendStringsToObjVA (objPtr, argList) if (Tcl_IsShared(objPtr)) { panic("Tcl_AppendStringsToObj called with shared object"); } - if (objPtr->typePtr != &tclStringType) { - ConvertToStringType(objPtr); - } + + SetStringFromAny(NULL, objPtr); /* * Figure out how much space is needed for all the strings, and @@ -440,7 +1226,9 @@ Tcl_AppendStringsToObjVA (objPtr, argList) return; } - if ((long)newLength > objPtr->internalRep.longValue) { + stringPtr = GET_STRING(objPtr); + if (newLength > stringPtr->allocated) { + /* * There isn't currently enough space in the string * representation so allocate additional space. If the current @@ -514,45 +1302,64 @@ Tcl_AppendStringsToObj TCL_VARARGS_DEF(Tcl_Obj *,arg1) } /* - *---------------------------------------------------------------------- + *--------------------------------------------------------------------------- * - * ConvertToStringType -- + * FillUnicodeRep -- * - * This procedure converts the internal representation of an object - * to "expandable string" type. + * Populate the Unicode internal rep with the Unicode form of its string + * rep. The object must alread have a "String" internal rep. * * Results: * None. * * Side effects: - * Any old internal reputation for objPtr is freed and the - * internal representation is set to that for an expandable string - * (the field internalRep.longValue holds 1 less than the allocated - * length of objPtr's string representation). + * Reallocates the String internal rep. 
* - *---------------------------------------------------------------------- + *--------------------------------------------------------------------------- */ static void -ConvertToStringType(objPtr) - register Tcl_Obj *objPtr; /* Pointer to object. Must have a - * typePtr that isn't &tclStringType. */ +FillUnicodeRep(objPtr) + Tcl_Obj *objPtr; /* The object in which to fill the unicode rep. */ { - if (objPtr->typePtr != NULL) { - if (objPtr->bytes == NULL) { - objPtr->typePtr->updateStringProc(objPtr); - } - if (objPtr->typePtr->freeIntRepProc != NULL) { - objPtr->typePtr->freeIntRepProc(objPtr); - } + String *stringPtr; + size_t uallocated; + char *src, *srcEnd; + Tcl_UniChar *dst; + src = objPtr->bytes; + + stringPtr = GET_STRING(objPtr); + if (stringPtr->numChars == -1) { + stringPtr->numChars = Tcl_NumUtfChars(src, objPtr->length); } - objPtr->typePtr = &tclStringType; - if (objPtr->bytes != NULL) { - objPtr->internalRep.longValue = (long)objPtr->length; - } else { - objPtr->internalRep.longValue = 0; - objPtr->length = 0; + + uallocated = stringPtr->numChars * sizeof(Tcl_UniChar); + if (uallocated > stringPtr->uallocated) { + + /* + * If not enought space has been allocated for the unicode rep, + * reallocate the internal rep object with double the amount of + * space needed, so the unicode string can grow without being + * reallocated. + */ + + uallocated *= 2; + stringPtr = (String *) ckrealloc((char*) stringPtr, + STRING_SIZE(uallocated)); + stringPtr->uallocated = uallocated; + } + + /* + * Convert src to Unicode and store the coverted data in "unicode". + */ + + srcEnd = src + objPtr->length; + for (dst = stringPtr->unicode; src < srcEnd; dst++) { + src += Tcl_UtfToUniChar(src, dst); } + *dst = 0; + + SET_STRING(objPtr, stringPtr); } /* @@ -581,13 +1388,40 @@ DupStringInternalRep(srcPtr, copyPtr) register Tcl_Obj *copyPtr; /* Object with internal rep to set. 
Must * not currently have an internal rep.*/ { + String *srcStringPtr = GET_STRING(srcPtr); + String *copyStringPtr; + + /* + * If the src obj is a string of 1-byte Utf chars, then copy the + * string rep of the source object and create an "empty" Unicode + * internal rep for the new object. Otherwise, copy Unicode + * internal rep, and invalidate the string rep of the new object. + */ + + if (srcStringPtr->numChars == srcPtr->length) { + copyStringPtr = (String *) ckalloc(STRING_SIZE(0)); + copyStringPtr->uallocated = 0; + } else { + copyStringPtr = (String *) ckalloc( + STRING_SIZE(srcStringPtr->uallocated)); + copyStringPtr->uallocated = srcStringPtr->uallocated; + + memcpy((VOID *) copyStringPtr->unicode, + (VOID *) srcStringPtr->unicode, + (size_t) srcStringPtr->numChars * sizeof(Tcl_UniChar)); + copyStringPtr->unicode[srcStringPtr->numChars] = 0; + } + copyStringPtr->numChars = srcStringPtr->numChars; + /* * Tricky point: the string value was copied by generic object * management code, so it doesn't contain any extra bytes that * might exist in the source object. */ - copyPtr->internalRep.longValue = (long)copyPtr->length; + copyStringPtr->allocated = copyPtr->length; + + SET_STRING(copyPtr, copyStringPtr); copyPtr->typePtr = &tclStringType; } @@ -596,15 +1430,14 @@ DupStringInternalRep(srcPtr, copyPtr) * * SetStringFromAny -- * - * Create an internal representation of type "expandable string" - * for an object. + * Create an internal representation of type "String" for an object. * * Results: * This operation always succeeds and returns TCL_OK. * * Side effects: - * This procedure does nothing; there is no advantage in converting - * the internal representation now, so we just defer it. + * Any old internal reputation for objPtr is freed and the + * internal representation is set to "String". 
* *---------------------------------------------------------------------- */ @@ -614,6 +1447,42 @@ SetStringFromAny(interp, objPtr) Tcl_Interp *interp; /* Used for error reporting if not NULL. */ Tcl_Obj *objPtr; /* The object to convert. */ { + String *stringPtr; + + /* + * The Unicode object is optimized for the case where each UTF char + * in a string is only one byte. In this case, we store the value of + * numChars, but we don't copy the bytes to the unicodeObj->unicode. + */ + + if (objPtr->typePtr != &tclStringType) { + + if (objPtr->typePtr != NULL) { + if (objPtr->bytes == NULL) { + objPtr->typePtr->updateStringProc(objPtr); + } + if ((objPtr->typePtr->freeIntRepProc) != NULL) { + (*objPtr->typePtr->freeIntRepProc)(objPtr); + } + } + objPtr->typePtr = &tclStringType; + + /* + * Allocate enough space for the basic String structure. + */ + + stringPtr = (String *) ckalloc(STRING_SIZE(0)); + stringPtr->numChars = -1; + stringPtr->uallocated = 0; + + if (objPtr->bytes != NULL) { + stringPtr->allocated = objPtr->length; + objPtr->bytes[objPtr->length] = 0; + } else { + objPtr->length = 0; + } + SET_STRING(objPtr, stringPtr); + } return TCL_OK; } @@ -623,13 +1492,14 @@ SetStringFromAny(interp, objPtr) * UpdateStringOfString -- * * Update the string representation for an object whose internal - * representation is "expandable string". + * representation is "String". * * Results: * None. * * Side effects: - * None. + * The object's string may be set by converting its Unicode + * representation to UTF format. * *---------------------------------------------------------------------- */ @@ -638,16 +1508,73 @@ static void UpdateStringOfString(objPtr) Tcl_Obj *objPtr; /* Object with string rep to update. */ { - /* - * The string is almost always valid already, in which case there's - * nothing for us to do. The only case we have to worry about is if - * the object is totally null. In this case, set the string rep to - * an empty string. 
- */ + int i, length, size; + Tcl_UniChar *unicode; + char dummy[TCL_UTF_MAX]; + char *dst; + String *stringPtr; - if (objPtr->bytes == NULL) { - objPtr->bytes = tclEmptyStringRep; - objPtr->length = 0; + stringPtr = GET_STRING(objPtr); + if ((objPtr->bytes == NULL) || (stringPtr->allocated == 0)) { + + if (stringPtr->numChars <= 0) { + + /* + * If there is no Unicode rep, or the string has 0 chars, + * then set the string rep to an empty string. + */ + + objPtr->bytes = tclEmptyStringRep; + objPtr->length = 0; + return; + } + + unicode = stringPtr->unicode; + length = stringPtr->numChars * sizeof(Tcl_UniChar); + + /* + * Translate the Unicode string to UTF. "size" will hold the + * amount of space the UTF string needs. + */ + + size = 0; + for (i = 0; i < stringPtr->numChars; i++) { + size += Tcl_UniCharToUtf((int) unicode[i], dummy); + } + + dst = (char *) ckalloc((unsigned) (size + 1)); + objPtr->bytes = dst; + objPtr->length = size; + stringPtr->allocated = size; + + for (i = 0; i < stringPtr->numChars; i++) { + dst += Tcl_UniCharToUtf(unicode[i], dst); + } + *dst = '\0'; } return; } + +/* + *---------------------------------------------------------------------- + * + * FreeStringInternalRep -- + * + * Deallocate the storage associated with a String data object's + * internal representation. + * + * Results: + * None. + * + * Side effects: + * Frees memory. + * + *---------------------------------------------------------------------- + */ + +static void +FreeStringInternalRep(objPtr) + Tcl_Obj *objPtr; /* Object with internal rep to free. */ +{ + ckfree((char *) GET_STRING(objPtr)); +} |