diff options
author | hershey <hershey> | 1999-06-08 02:59:23 (GMT) |
---|---|---|
committer | hershey <hershey> | 1999-06-08 02:59:23 (GMT) |
commit | 0e53e351cd3c0bdf51b84e459262c47f913c9a97 (patch) | |
tree | 87cee8e23f1c9f621f583c5d97e3e62979935fa9 /generic | |
parent | b2759d9c544b22071eca46475d110812304e8faa (diff) | |
download | tcl-0e53e351cd3c0bdf51b84e459262c47f913c9a97.zip tcl-0e53e351cd3c0bdf51b84e459262c47f913c9a97.tar.gz tcl-0e53e351cd3c0bdf51b84e459262c47f913c9a97.tar.bz2 |
* tests/string.test:
* generic/tclVar.c (Tcl_SetVar2Ex):
* generic/tclStringObj.c (Tcl_AppendObjToObj):
* generic/tclCmdMZ.c (Tcl_StringObjCmd): optimized the string
index, string length, string range, and append command in cases
where the object's internal rep is a bytearray. Objects with
other internal reps are converted to have the new unicode internal
rep.
* unix/Makefile.in:
* win/Makefile.in:
* win/Makefile.vc:
* tests/unicode.test:
* generic/tclInt.h:
* generic/tclObj.c:
* generic/tclUnicodeObj.c: added a new object type to store the
unicode representation of a string.
* generic/tclTestObj.c: added the objtype option to the testobj
command. This option returns the name of the type of internal rep
an object has.
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tclCmdMZ.c | 139 | ||||
-rw-r--r-- | generic/tclInt.h | 12 | ||||
-rw-r--r-- | generic/tclObj.c | 4 | ||||
-rw-r--r-- | generic/tclStringObj.c | 9 | ||||
-rw-r--r-- | generic/tclTestObj.c | 25 | ||||
-rw-r--r-- | generic/tclUnicodeObj.c | 771 | ||||
-rw-r--r-- | generic/tclVar.c | 4 |
7 files changed, 907 insertions, 57 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 19b9ece..ebea22b 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -13,7 +13,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclCmdMZ.c,v 1.12 1999/06/03 18:43:30 stanton Exp $ + * RCS: @(#) $Id: tclCmdMZ.c,v 1.13 1999/06/08 02:59:23 hershey Exp $ */ #include "tclInt.h" @@ -1009,32 +1009,47 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) } case STR_INDEX: { int index; + char buf[TCL_UTF_MAX]; + Tcl_UniChar unichar; if (objc != 4) { Tcl_WrongNumArgs(interp, 2, objv, "string charIndex"); return TCL_ERROR; } - string1 = Tcl_GetStringFromObj(objv[2], &length1); - /* - * establish what 'end' really means - */ - length2 = Tcl_NumUtfChars(string1, length1); - if (TclGetIntForIndex(interp, objv[3], length2 - 1, - &index) != TCL_OK) { - return TCL_ERROR; - } + /* - * index must be between 0 and the UTF length to be valid + * If we have a ByteArray object, avoid indexing in the + * Utf string since the byte array contains one byte per + * character. Otherwise, use the Unicode string rep to + * get the index'th char. */ - if ((index >= 0) && (index < length2)) { - if (length1 == length2) { - /* no unicode chars */ - Tcl_SetStringObj(resultPtr, string1+index, 1); - } else { - char buf[TCL_UTF_MAX]; - length2 = Tcl_UniCharToUtf(Tcl_UniCharAtIndex(string1, - index), buf); + if (objv[2]->typePtr == &tclByteArrayType) { + + string1 = Tcl_GetByteArrayFromObj(objv[2], &length1); + + if (TclGetIntForIndex(interp, objv[3], length1 - 1, + &index) != TCL_OK) { + return TCL_ERROR; + } + Tcl_SetStringObj(resultPtr, &string1[index], 1); + } else { + string1 = Tcl_GetStringFromObj(objv[2], &length1); + + /* + * convert to Unicode internal rep to calulate what + * 'end' really means. + */ + + length2 = TclGetUnicodeLengthFromObj(objv[2]); + + if (TclGetIntForIndex(interp, objv[3], length2 - 1, + &index) != TCL_OK) { + return TCL_ERROR; + } + if ((index >= 0) && (index < length2)) { + unichar = TclGetUniCharFromObj(objv[2], index); + length2 = Tcl_UniCharToUtf((int)unichar, buf); Tcl_SetStringObj(resultPtr, buf, length2); } } @@ -1400,16 +1415,16 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) /* * If we have a ByteArray object, avoid recomputing the * string since the byte array contains one byte per - * character. + * character. Otherwise, use the Unicode string rep to + * calculate the length. */ if (objv[2]->typePtr == &tclByteArrayType) { (void) Tcl_GetByteArrayFromObj(objv[2], &length1); Tcl_SetIntObj(resultPtr, length1); } else { - string1 = Tcl_GetStringFromObj(objv[2], &length1); - Tcl_SetIntObj(resultPtr, Tcl_NumUtfChars(string1, - length1)); + Tcl_SetIntObj(resultPtr, + TclGetUnicodeLengthFromObj(objv[2])); } } break; @@ -1550,28 +1565,64 @@ Tcl_StringObjCmd(dummy, interp, objc, objv) return TCL_ERROR; } - string1 = Tcl_GetStringFromObj(objv[2], &length1); - length1 = Tcl_NumUtfChars(string1, length1) - 1; - if (TclGetIntForIndex(interp, objv[3], length1, - &first) != TCL_OK) { - return TCL_ERROR; - } - if (TclGetIntForIndex(interp, objv[4], length1, - &last) != TCL_OK) { - return TCL_ERROR; - } - if (first < 0) { - first = 0; - } - if (last >= length1) { - last = length1; - } - if (last >= first) { - char *start, *end; + /* + * If we have a ByteArray object, avoid indexing in the + * Utf string since the byte array contains one byte per + * character. Otherwise, use the Unicode string rep to + * get the range. + */ - start = Tcl_UtfAtIndex(string1, first); - end = Tcl_UtfAtIndex(start, last - first + 1); - Tcl_SetStringObj(resultPtr, start, end - start); + if (objv[2]->typePtr == &tclByteArrayType) { + + string1 = Tcl_GetByteArrayFromObj(objv[2], &length1); + + if (TclGetIntForIndex(interp, objv[3], length1 - 1, + &first) != TCL_OK) { + return TCL_ERROR; + } + if (TclGetIntForIndex(interp, objv[4], length1 - 1, + &last) != TCL_OK) { + return TCL_ERROR; + } + if (first < 0) { + first = 0; + } + if (last >= length1 - 1) { + last = length1 - 1; + } + if (last >= first) { + int numBytes = last - first + 1; + resultPtr = Tcl_NewByteArrayObj(&string1[first], numBytes); + Tcl_SetObjResult(interp, resultPtr); + } + } else { + string1 = Tcl_GetStringFromObj(objv[2], &length1); + + /* + * Convert to Unicode internal rep to calulate length and + * create a result object. + */ + + length2 = TclGetUnicodeLengthFromObj(objv[2]) - 1; + + if (TclGetIntForIndex(interp, objv[3], length2, + &first) != TCL_OK) { + return TCL_ERROR; + } + if (TclGetIntForIndex(interp, objv[4], length2, + &last) != TCL_OK) { + return TCL_ERROR; + } + if (first < 0) { + first = 0; + } + if (last >= length1 - 1) { + last = length1 - 1; + } + if (last >= first) { + resultPtr = TclGetRangeFromObj(objv[2], first, last); + Tcl_SetObjResult(interp, resultPtr); + } } break; } diff --git a/generic/tclInt.h b/generic/tclInt.h index 68614bc..ed9002d 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -11,7 +11,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclInt.h,v 1.29 1999/05/13 01:50:32 stanton Exp $ + * RCS: @(#) $Id: tclInt.h,v 1.30 1999/06/08 02:59:24 hershey Exp $ */ #ifndef _TCLINT @@ -1509,6 +1509,7 @@ extern Tcl_ObjType tclIntType; extern Tcl_ObjType tclListType; extern Tcl_ObjType tclProcBodyType; extern Tcl_ObjType tclStringType; +extern Tcl_ObjType tclUnicodeType; /* * The head of the list of free Tcl objects, and the total number of Tcl @@ -1542,6 +1543,9 @@ EXTERN int TclAccess _ANSI_ARGS_((CONST char *path, EXTERN int TclAccessDeleteProc _ANSI_ARGS_((TclAccessProc_ *proc)); EXTERN int TclAccessInsertProc _ANSI_ARGS_((TclAccessProc_ *proc)); EXTERN void TclAllocateFreeObjects _ANSI_ARGS_((void)); +EXTERN Tcl_Obj * TclAppendObjToUnicodeObj _ANSI_ARGS_(( + register Tcl_Obj *targetObjPtr, + register Tcl_Obj *srcObjPtr)); EXTERN int TclArraySet _ANSI_ARGS_((Tcl_Interp *interp, Tcl_Obj *arrayNameObj, Tcl_Obj *arrayElemObj)); EXTERN int TclCleanupChildren _ANSI_ARGS_((Tcl_Interp *interp, @@ -1634,6 +1638,12 @@ EXTERN int TclGetOpenMode _ANSI_ARGS_((Tcl_Interp *interp, char *string, int *seekFlagPtr)); EXTERN Tcl_Command TclGetOriginalCommand _ANSI_ARGS_(( Tcl_Command command)); +EXTERN Tcl_Obj* TclGetRangeFromObj _ANSI_ARGS_((Tcl_Obj *objPtr, + int first, int last)); +EXTERN Tcl_UniChar TclGetUniCharFromObj _ANSI_ARGS_((Tcl_Obj *objPtr, + int index)); +EXTERN int TclGetUnicodeLengthFromObj _ANSI_ARGS_(( + Tcl_Obj *objPtr)); EXTERN int TclGlob _ANSI_ARGS_((Tcl_Interp *interp, char *pattern, int noComplain)); EXTERN int TclGlobalInvoke _ANSI_ARGS_((Tcl_Interp *interp, diff --git a/generic/tclObj.c b/generic/tclObj.c index f1858f8..423df28 100644 --- a/generic/tclObj.c +++ b/generic/tclObj.c @@ -5,11 +5,12 @@ * many Tcl commands. * * Copyright (c) 1995-1997 Sun Microsystems, Inc. + * Copyright (c) 1999 by Scriptics Corporation. * * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclObj.c,v 1.7 1999/05/28 23:02:33 stanton Exp $ + * RCS: @(#) $Id: tclObj.c,v 1.8 1999/06/08 02:59:25 hershey Exp $ */ #include "tclInt.h" @@ -137,6 +138,7 @@ TclInitObjSubsystem() Tcl_RegisterObjType(&tclListType); Tcl_RegisterObjType(&tclByteCodeType); Tcl_RegisterObjType(&tclProcBodyType); + Tcl_RegisterObjType(&tclUnicodeType); #ifdef TCL_COMPILE_STATS Tcl_MutexLock(&tclObjMutex); diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index ea0cbd7..c70bcb9 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -10,11 +10,12 @@ * representation are called "expandable string objects". * * Copyright (c) 1995-1997 Sun Microsystems, Inc. + * Copyright (c) 1999 by Scriptics Corporation. * * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclStringObj.c,v 1.6 1999/05/07 20:07:35 stanton Exp $ + * RCS: @(#) $Id: tclStringObj.c,v 1.7 1999/06/08 02:59:25 hershey Exp $ */ #include "tclInt.h" @@ -382,11 +383,7 @@ Tcl_AppendObjToObj(objPtr, appendObjPtr) Tcl_Obj *objPtr; /* Points to the object to append to. */ Tcl_Obj *appendObjPtr; /* Object to append. */ { - int length; - char *stringRep; - - stringRep = Tcl_GetStringFromObj(appendObjPtr, &length); - Tcl_AppendToObj(objPtr, stringRep, length); + TclAppendObjToUnicodeObj(objPtr, appendObjPtr); } /* diff --git a/generic/tclTestObj.c b/generic/tclTestObj.c index d604c5b..533b967 100644 --- a/generic/tclTestObj.c +++ b/generic/tclTestObj.c @@ -7,11 +7,12 @@ * applications; they're only used for testing. * * Copyright (c) 1995-1998 Sun Microsystems, Inc. + * Copyright (c) 1999 by Scriptics Corporation. * * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclTestObj.c,v 1.3 1999/04/16 00:46:54 stanton Exp $ + * RCS: @(#) $Id: tclTestObj.c,v 1.4 1999/06/08 02:59:26 hershey Exp $ */ #include "tclInt.h" @@ -774,6 +775,23 @@ TestobjCmd(clientData, interp, objc, objv) } SetVarToObj(varIndex, Tcl_NewObj()); Tcl_SetObjResult(interp, varPtr[varIndex]); + } else if (strcmp(subCmd, "objtype") == 0) { + char *typeName; + + /* + * return an object containing the name of the argument's type + * of internal rep. If none exists, return "none". + */ + + if (objc != 3) { + goto wrongNumArgs; + } + if (objv[2]->typePtr == NULL) { + Tcl_SetObjResult(interp, Tcl_NewStringObj("none", -1)); + } else { + typeName = objv[2]->typePtr->name; + Tcl_SetObjResult(interp, Tcl_NewStringObj(typeName, -1)); + } } else if (strcmp(subCmd, "refcount") == 0) { char buf[TCL_INTEGER_SPACE]; @@ -810,7 +828,8 @@ TestobjCmd(clientData, interp, objc, objv) if (objc != 2) { goto wrongNumArgs; } - if (Tcl_AppendAllObjTypes(interp, Tcl_GetObjResult(interp)) != TCL_OK) { + if (Tcl_AppendAllObjTypes(interp, + Tcl_GetObjResult(interp)) != TCL_OK) { return TCL_ERROR; } } else { @@ -818,7 +837,7 @@ TestobjCmd(clientData, interp, objc, objv) "bad option \"", Tcl_GetString(objv[1]), "\": must be assign, convert, duplicate, freeallvars, ", - "newobj, objcount, refcount, type, or types", + "newobj, objcount, objtype, refcount, type, or types", (char *) NULL); return TCL_ERROR; } diff --git a/generic/tclUnicodeObj.c b/generic/tclUnicodeObj.c new file mode 100644 index 0000000..869b8c7 --- /dev/null +++ b/generic/tclUnicodeObj.c @@ -0,0 +1,771 @@ +/* + * tclUnicodeObj.c -- + * + * This file contains the implementation of the Unicode internal + * representation of Tcl objects. + * + * Copyright (c) 1999 by Scriptics Corporation. + * + * See the file "license.terms" for information on usage and redistribution + * of this file, and for a DISCLAIMER OF ALL WARRANTIES. + * + * RCS: @(#) $Id: tclUnicodeObj.c,v 1.2 1999/06/08 02:59:27 hershey Exp $ + */ + +#include <math.h> +#include "tclInt.h" +#include "tclPort.h" + +/* + * Prototypes for local procedures defined in this file: + */ + +static void DupUnicodeInternalRep _ANSI_ARGS_((Tcl_Obj *srcPtr, + Tcl_Obj *copyPtr)); +static void FreeUnicodeInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr)); +static void UpdateStringOfUnicode _ANSI_ARGS_((Tcl_Obj *objPtr)); +static int SetUnicodeFromAny _ANSI_ARGS_((Tcl_Interp *interp, + Tcl_Obj *objPtr)); + +static int AllSingleByteChars _ANSI_ARGS_((Tcl_Obj *objPtr)); +static void TclAppendUniCharStrToObj _ANSI_ARGS_(( + register Tcl_Obj *objPtr, Tcl_UniChar *unichars, + int numChars)); +static Tcl_Obj * TclNewUnicodeObj _ANSI_ARGS_((Tcl_UniChar *unichars, + int numChars)); +static void SetOptUnicodeFromAny _ANSI_ARGS_((Tcl_Obj *objPtr, + int numChars)); + +/* + * The following object type represents a Unicode string. A Unicode string + * is an internationalized string. Conceptually, a Unicode string is an + * array of 16-bit quantities organized as a sequence of properly formed + * UTF-8 characters. There is a one-to-one map between Unicode and UTF + * characters. The Unicode ojbect is opitmized for the case where each UTF + * char in a string is only one byte. In this case, we store the value of + * numChars, but we don't copy the bytes to the unicodeObj->chars. Before + * accessing obj->chars, check if unicodeObj->numChars == obj->length. + */ + +Tcl_ObjType tclUnicodeType = { + "unicode", + FreeUnicodeInternalRep, + DupUnicodeInternalRep, + UpdateStringOfUnicode, + SetUnicodeFromAny +}; + +/* + * The following structure is the internal rep for a Unicode object. + * Keeps track of how much memory has been used and how much has been + * allocated for the Unicode to enable growing and shrinking of the + * Unicode object with fewer mallocs. + */ + +typedef struct Unicode { + int numChars; /* The number of chars in the unicode + * string. */ + int used; /* The number of bytes used in the unicode + * string. */ + int allocated; /* The amount of space actually allocated + * minus 1 byte. */ + unsigned char chars[4]; /* The array of chars. The actual size of + * this field depends on the 'allocated' field + * above. */ +} Unicode; + +#define UNICODE_SIZE(len) \ + ((unsigned) (sizeof(Unicode) - 4 + (len))) +#define GET_UNICODE(objPtr) \ + ((Unicode *) (objPtr)->internalRep.otherValuePtr) +#define SET_UNICODE(objPtr, unicodePtr) \ + (objPtr)->internalRep.otherValuePtr = (VOID *) (unicodePtr) + + +/* + *---------------------------------------------------------------------- + * + * TclGetUnicodeLengthFromObj -- + * + * Get the length of the Unicode string from the Tcl object. If + * the object is not already a Unicode object, an attempt will be + * made to convert it to one. + * + * Results: + * Pointer to unicode string representing the unicode object. + * + * Side effects: + * Frees old internal rep. Allocates memory for new internal rep. + * + *---------------------------------------------------------------------- + */ + +int +TclGetUnicodeLengthFromObj(objPtr) + Tcl_Obj *objPtr; /* The Unicode object. */ +{ + int length; + Unicode *unicodePtr; + + SetUnicodeFromAny(NULL, objPtr); + unicodePtr = GET_UNICODE(objPtr); + + length = unicodePtr->numChars; + return length; +} + +/* + *---------------------------------------------------------------------- + * + * TclGetUniCharFromObj -- + * + * Get the index'th Unicode character from the Unicode object. If + * the object is not already a Unicode object, an attempt will be + * made to convert it to one. The index is assumed to be in the + * appropriate range. + * + * Results: + * Returns the index'th Unicode character in the Object. + * + * Side effects: + * Fills unichar with the index'th Unicode character. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar +TclGetUniCharFromObj(objPtr, index) + Tcl_Obj *objPtr; /* The Unicode object. */ + int index; /* Get the index'th character. */ +{ + Tcl_UniChar *unicharPtr, unichar; + Unicode *unicodePtr; + int length; + + SetUnicodeFromAny(NULL, objPtr); + unicodePtr = GET_UNICODE(objPtr); + length = objPtr->length; + + if (AllSingleByteChars(objPtr)) { + int length; + char *str; + + /* + * All of the characters in the Utf string are 1 byte chars, + * so we don't store the unicode char. We get the Utf string + * and convert the index'th byte to a Unicode character. + */ + + str = Tcl_GetStringFromObj(objPtr, &length); + Tcl_UtfToUniChar(&str[index], &unichar); + } else { + unicharPtr = (Tcl_UniChar *)unicodePtr->chars; + unichar = unicharPtr[index]; + } + return unichar; +} + +/* + *---------------------------------------------------------------------- + * + * TclGetRangeFromObj -- + * + * Create a Tcl Object that contains the chars between first and + * last of the object indicated by "objPtr". If the object is not + * already a Unicode object, an attempt will be made to convert it + * to one. The first and last indices are assumed to be in the + * appropriate range. + * + * Results: + * Returns a new Tcl Object of either "string" or "unicode" type, + * containing the range of chars. + * + * Side effects: + * Changes the internal rep of "objPtr" to unicode. + * + *---------------------------------------------------------------------- + */ + +Tcl_Obj* +TclGetRangeFromObj(objPtr, first, last) + + Tcl_Obj *objPtr; /* The Tcl object to find the range of. */ + int first; /* First index of the range. */ + int last; /* Last index of the range. */ +{ + Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */ + Tcl_UniChar *unicharPtr; + Unicode *unicodePtr; + int length; + + SetUnicodeFromAny(NULL, objPtr); + unicodePtr = GET_UNICODE(objPtr); + length = objPtr->length; + + if (unicodePtr->numChars != length) { + unicharPtr = (Tcl_UniChar *)unicodePtr->chars; + newObjPtr = TclNewUnicodeObj(&unicharPtr[first], last-first+1); + } else { + int length; + char *str; + + /* + * All of the characters in the Utf string are 1 byte chars, + * so we don't store the unicode char. Create a new string + * object containing the specified range of chars. + */ + + str = Tcl_GetStringFromObj(objPtr, &length); + newObjPtr = Tcl_NewStringObj(&str[first], last-first+1); + } + return newObjPtr; +} + +/* + *---------------------------------------------------------------------- + * + * TclAppendObjToUnicodeObj -- + * + * This procedure appends the contest of "srcObjPtr" to the Unicode + * object "destPtr". + * + * Results: + * None. + * + * Side effects: + * If srcObjPtr doesn't have an internal rep, then it is given a + * Unicode internal rep. + * + *---------------------------------------------------------------------- + */ + +Tcl_Obj * +TclAppendObjToUnicodeObj(targetObjPtr, srcObjPtr) + register Tcl_Obj *targetObjPtr; /* Points to the object to + * append to. */ + register Tcl_Obj *srcObjPtr; /* Points to the object to + * append from. */ +{ + int numBytes, numChars; + Tcl_Obj *resultObjPtr; + char *utfSrcStr; + Tcl_UniChar *unicharSrcStr; + Unicode *unicodePtr; + Tcl_DString dsPtr; + + /* + * Duplicate the target if it is shared. + * Change the result's internal rep to Unicode object. + */ + + if (Tcl_IsShared(targetObjPtr)) { + resultObjPtr = Tcl_DuplicateObj(targetObjPtr); + } else { + resultObjPtr = targetObjPtr; + } + SetUnicodeFromAny(NULL, resultObjPtr); + + /* + * Case where target chars are 1 byte long: + * If src obj is of "string" or null type, then convert it to "unicode" + * type. Src objs of other types (such as int) are left in tact to keep + * them from shimmering between types. If the src obj is a unichar obj, + * and all src chars are also 1 byte long, the src string is appended to + * the target "unicode" obj, and the target obj maintains its "optimized" + * status. + */ + + if (AllSingleByteChars(resultObjPtr)) { + + int length; + char *stringRep; + + if (srcObjPtr->typePtr == &tclStringType + || srcObjPtr->typePtr == NULL) { + SetUnicodeFromAny(NULL, srcObjPtr); + } + + stringRep = Tcl_GetStringFromObj(srcObjPtr, &length); + Tcl_AppendToObj(resultObjPtr, stringRep, length); + + if ((srcObjPtr->typePtr == &tclUnicodeType) + && (AllSingleByteChars(srcObjPtr))) { + SetOptUnicodeFromAny(resultObjPtr, resultObjPtr->length); + } + return resultObjPtr; + } + + /* + * Extract a unicode string from "unicode" or "string" type objects. + * Extract the utf string from non-unicode objects, and convert the + * utf string to unichar string locally. + * If the src obj is a "string" obj, convert it to "unicode" type. + * Src objs of other types (such as int) are left in tact to keep + * them from shimmering between types. + */ + + Tcl_DStringInit(&dsPtr); + if (srcObjPtr->typePtr == &tclStringType || srcObjPtr->typePtr == NULL) { + SetUnicodeFromAny(NULL, srcObjPtr); + } + if (srcObjPtr->typePtr == &tclUnicodeType) { + if (AllSingleByteChars(srcObjPtr)) { + + unicodePtr = GET_UNICODE(srcObjPtr); + numChars = unicodePtr->numChars; + + utfSrcStr = Tcl_GetStringFromObj(srcObjPtr, &numBytes); + unicharSrcStr = (Tcl_UniChar *)Tcl_UtfToUniCharDString(utfSrcStr, + numBytes, &dsPtr); + } else { + unicodePtr = GET_UNICODE(srcObjPtr); + numChars = unicodePtr->numChars; + unicharSrcStr = (Tcl_UniChar *)unicodePtr->chars; + } + } else { + utfSrcStr = Tcl_GetStringFromObj(srcObjPtr, &numBytes); + numChars = Tcl_NumUtfChars(utfSrcStr, numBytes); + unicharSrcStr = (Tcl_UniChar *)Tcl_UtfToUniCharDString(utfSrcStr, + numBytes, &dsPtr); + } + if (numChars == 0) { + return resultObjPtr; + } + + /* + * Append the unichar src string to the result object. + */ + + TclAppendUniCharStrToObj(resultObjPtr, unicharSrcStr, numChars); + Tcl_DStringFree(&dsPtr); + return resultObjPtr; +} + +/* + *---------------------------------------------------------------------- + * + * TclAppendUniCharStrToObj -- + * + * This procedure appends the contents of "srcObjPtr" to the + * Unicode object "objPtr". + * + * Results: + * None. + * + * Side effects: + * If srcObjPtr doesn't have an internal rep, then it is given a + * Unicode internal rep. + * + *---------------------------------------------------------------------- + */ + +void +TclAppendUniCharStrToObj(objPtr, unichars, numNewChars) + register Tcl_Obj *objPtr; /* Points to the object to append to. */ + Tcl_UniChar *unichars; /* The unicode string to append to the + * object. */ + int numNewChars; /* Number of chars in "unichars". */ +{ + Unicode *unicodePtr; + int usedBytes, numNewBytes, totalNumBytes, totalNumChars; + + /* + * Invalidate the StringRep. + */ + + Tcl_InvalidateStringRep(objPtr); + + unicodePtr = GET_UNICODE(objPtr); + + usedBytes = unicodePtr->used; + totalNumChars = numNewChars + unicodePtr->numChars; + totalNumBytes = totalNumChars * sizeof(Tcl_UniChar); + numNewBytes = numNewChars * sizeof(Tcl_UniChar); + + if (unicodePtr->allocated < totalNumBytes) { + int allocatedBytes = totalNumBytes * 2; + + /* + * There isn't currently enough space in the Unicode + * representation so allocate additional space. In fact, + * overallocate so that there is room for future growth without + * having to reallocate again. + */ + + unicodePtr = (Unicode *) ckrealloc(unicodePtr, + UNICODE_SIZE(allocatedBytes)); + memcpy((VOID *) (unicodePtr->chars + usedBytes), + (VOID *) unichars, (size_t) numNewBytes); + + unicodePtr->allocated = allocatedBytes; + unicodePtr = SET_UNICODE(objPtr, unicodePtr); + } + + memcpy((VOID *) (unicodePtr->chars + usedBytes), + (VOID *) unichars, (size_t) numNewBytes); + unicodePtr->used = totalNumBytes; + unicodePtr->numChars = totalNumChars; +} + +/* + *--------------------------------------------------------------------------- + * + * TclNewUnicodeObj -- + * + * This procedure is creates a new Unicode object and initializes + * it from the given Utf String. If the Utf String is the same size + * as the Unicode string, don't duplicate the data. + * + * Results: + * The newly created object is returned. This object will have no + * initial string representation. The returned object has a ref count + * of 0. + * + * Side effects: + * Memory allocated for new object and copy of Unicode argument. + * + *--------------------------------------------------------------------------- + */ + +Tcl_Obj * +TclNewUnicodeObj(unichars, numChars) + Tcl_UniChar *unichars; /* The unicode string used to initialize + * the new object. */ + int numChars; /* Number of characters in the unicode + * string. */ +{ + Tcl_Obj *objPtr; + Unicode *unicodePtr; + int numBytes; + + numBytes = numChars * sizeof(Tcl_UniChar); + + TclNewObj(objPtr); + objPtr->bytes = NULL; + objPtr->typePtr = &tclUnicodeType; + + unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(numBytes)); + unicodePtr->used = numBytes; + unicodePtr->numChars = numChars; + unicodePtr->allocated = numBytes; + memcpy((VOID *) unicodePtr->chars, (VOID *) unichars, (size_t) numBytes); + SET_UNICODE(objPtr, unicodePtr); + return objPtr; +} + +/* + *--------------------------------------------------------------------------- + * + * TclAllSingleByteChars -- + * + * Initialize the internal representation of a Unicode Tcl_Obj + * to a copy of the internal representation of an existing Unicode + * object. + * + * Results: + * None. + * + * Side effects: + * Allocates memory. + * + *--------------------------------------------------------------------------- + */ + +static int +AllSingleByteChars(objPtr) + Tcl_Obj *objPtr; /* Object whose char lengths to check. */ +{ + Unicode *unicodePtr; + int numBytes, numChars; + + unicodePtr = GET_UNICODE(objPtr); + numChars = unicodePtr->numChars; + numBytes = objPtr->length; + + if (numChars == numBytes) { + return 1; + } else { + return 0; + } +} + +/* + *--------------------------------------------------------------------------- + * + * DupUnicodeInternalRep -- + * + * Initialize the internal representation of a Unicode Tcl_Obj + * to a copy of the internal representation of an existing Unicode + * object. + * + * Results: + * None. + * + * Side effects: + * Allocates memory. + * + *--------------------------------------------------------------------------- + */ + +static void +DupUnicodeInternalRep(srcPtr, copyPtr) + Tcl_Obj *srcPtr; /* Object with internal rep to copy. */ + Tcl_Obj *copyPtr; /* Object with internal rep to set. */ +{ + Unicode *srcUnicodePtr = GET_UNICODE(srcPtr); + Unicode *copyUnicodePtr; /*GET_UNICODE(copyPtr);*/ + + /* + * If the src obj is a string of 1-byte Utf chars, then copy the + * string rep of the source object and create an "empty" Unicode + * internal rep for the new object. Otherwise, copy Unicode + * internal rep, and invalidate the string rep of the new object. + */ + + if (AllSingleByteChars(srcPtr)) { + copyUnicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(4)); + } else { + int used = srcUnicodePtr->used; + int allocated = srcUnicodePtr->allocated; + Tcl_UniChar *unichars; + + unichars = (Tcl_UniChar *)srcUnicodePtr->chars; + + copyUnicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(allocated)); + + copyUnicodePtr->used = used; + copyUnicodePtr->allocated = allocated; + memcpy((VOID *) copyUnicodePtr->chars, + (VOID *) srcUnicodePtr->chars, (size_t) used); + } + copyUnicodePtr->numChars = srcUnicodePtr->numChars; + SET_UNICODE(copyPtr, copyUnicodePtr); +} + +/* + *--------------------------------------------------------------------------- + * + * TclSetUnicodeObj -- + * + * Modify an object to be a Unicode object and to have the specified + * unicode string as its value. + * + * Results: + * None. + * + * Side effects: + * The object's old string rep and internal rep is freed. + * Memory allocated for copy of unicode argument. + * + *---------------------------------------------------------------------- + */ + +void +TclSetUnicodeObj(objPtr, chars, length) + Tcl_Obj *objPtr; /* Object to initialize as a Unicode obj. */ + unsigned char *chars; /* The unicode string to use as the new + * value. */ + int length; /* Length of the unicode string, which must + * be >= 0. */ +{ + Tcl_ObjType *typePtr; + Unicode *unicodePtr; + + if (Tcl_IsShared(objPtr)) { + panic("TclSetUnicodeObj called with shared object"); + } + typePtr = objPtr->typePtr; + if ((typePtr != NULL) && (typePtr->freeIntRepProc != NULL)) { + (*typePtr->freeIntRepProc)(objPtr); + } + Tcl_InvalidateStringRep(objPtr); + + unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(length)); + unicodePtr->used = length; + unicodePtr->allocated = length; + memcpy((VOID *) unicodePtr->chars, (VOID *) chars, (size_t) length); + + objPtr->typePtr = &tclUnicodeType; + SET_UNICODE(objPtr, unicodePtr); +} + +/* + *--------------------------------------------------------------------------- + * + * UpdateStringOfUnicode -- + * + * Update the string representation for a Unicode data object. + * Note: This procedure does not invalidate an existing old string rep + * so storage will be lost if this has not already been done. + * + * Results: + * None. + * + * Side effects: + * The object's string is set to a valid string that results from + * the Unicode-to-string conversion. + * + * The object becomes a string object -- the internal rep is + * discarded and the typePtr becomes NULL. + * + *--------------------------------------------------------------------------- + */ + +static void +UpdateStringOfUnicode(objPtr) + Tcl_Obj *objPtr; /* Unicode object whose string rep to + * update. */ +{ + int i, length, size; + Tcl_UniChar *src; + char dummy[TCL_UTF_MAX]; + char *dst; + Unicode *unicodePtr; + + unicodePtr = GET_UNICODE(objPtr); + src = (Tcl_UniChar *) unicodePtr->chars; + length = unicodePtr->used; + + /* + * How much space will string rep need? + */ + + size = 0; + for (i = 0; i < unicodePtr->numChars; i++) { + size += Tcl_UniCharToUtf((int) src[i], dummy); + } + + dst = (char *) ckalloc((unsigned) (size + 1)); + objPtr->bytes = dst; + objPtr->length = size; + + for (i = 0; i < unicodePtr->numChars; i++) { + dst += Tcl_UniCharToUtf(src[i], dst); + } + *dst = '\0'; +} + +/* + *--------------------------------------------------------------------------- + * + * SetOptUnicodeFromAny -- + * + * Generate the Unicode internal rep from the string rep. + * + * Results: + * The return value is always TCL_OK. + * + * Side effects: + * A Unicode object is stored as the internal rep of objPtr. The Unicode + * ojbect is opitmized for the case where each UTF char in a string is only + * one byte. In this case, we store the value of numChars, but we don't copy + * the bytes to the unicodeObj->chars. Before accessing obj->chars, check if + * all chars are 1 byte long. + * + *--------------------------------------------------------------------------- + */ + +static void +SetOptUnicodeFromAny(objPtr, numChars) + Tcl_Obj *objPtr; /* The object to convert to type Unicode. */ + int numChars; +{ + Tcl_ObjType *typePtr; + Unicode *unicodePtr; + + unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(4)); + unicodePtr->numChars = numChars; + + typePtr = objPtr->typePtr; + if ((typePtr != NULL) && (typePtr->freeIntRepProc) != NULL) { + (*typePtr->freeIntRepProc)(objPtr); + } + objPtr->typePtr = &tclUnicodeType; + SET_UNICODE(objPtr, unicodePtr); +} + +/* + *--------------------------------------------------------------------------- + * + * SetUnicodeFromAny -- + * + * Generate the Unicode internal rep from the string rep. + * + * Results: + * The return value is always TCL_OK. + * + * Side effects: + * A Unicode object is stored as the internal rep of objPtr. The Unicode + * ojbect is opitmized for the case where each UTF char in a string is only + * one byte. In this case, we store the value of numChars, but we don't copy + * the bytes to the unicodeObj->chars. Before accessing obj->chars, check if + * all chars are 1 byte long. + * + *--------------------------------------------------------------------------- + */ + +static int +SetUnicodeFromAny(interp, objPtr) + Tcl_Interp *interp; /* Not used. */ + Tcl_Obj *objPtr; /* The object to convert to type Unicode. */ +{ + Tcl_ObjType *typePtr; + int numBytes, numChars; + char *src, *srcEnd; + Unicode *unicodePtr; + unsigned char *dst; + + typePtr = objPtr->typePtr; + if (typePtr != &tclUnicodeType) { + src = Tcl_GetStringFromObj(objPtr, &numBytes); + + numChars = Tcl_NumUtfChars(src, numBytes); + if (numChars == numBytes) { + SetOptUnicodeFromAny(objPtr, numChars); + } else { + unicodePtr = (Unicode *) ckalloc(UNICODE_SIZE(numChars + * sizeof(Tcl_UniChar))); + srcEnd = src + numBytes; + + for (dst = unicodePtr->chars; src < srcEnd; + dst += sizeof(Tcl_UniChar)) { + src += Tcl_UtfToUniChar(src, (Tcl_UniChar *) dst); + } + + unicodePtr->used = numChars * sizeof(Tcl_UniChar); + unicodePtr->numChars = numChars; + unicodePtr->allocated = numChars * sizeof(Tcl_UniChar); + + if ((typePtr != NULL) && (typePtr->freeIntRepProc) != NULL) { + (*typePtr->freeIntRepProc)(objPtr); + } + objPtr->typePtr = &tclUnicodeType; + SET_UNICODE(objPtr, unicodePtr); + } + } + return TCL_OK; +} + +/* + *---------------------------------------------------------------------- + * + * FreeUnicodeInternalRep -- + * + * Deallocate the storage associated with a Unicode data object's + * internal representation. + * + * Results: + * None. + * + * Side effects: + * Frees memory. + * + *---------------------------------------------------------------------- + */ + +static void +FreeUnicodeInternalRep(objPtr) + Tcl_Obj *objPtr; /* Object with internal rep to free. */ +{ + ckfree((char *) GET_UNICODE(objPtr)); +} diff --git a/generic/tclVar.c b/generic/tclVar.c index 03b7757..f2df52e 100644 --- a/generic/tclVar.c +++ b/generic/tclVar.c @@ -14,7 +14,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclVar.c,v 1.8 1999/04/16 00:46:55 stanton Exp $ + * RCS: @(#) $Id: tclVar.c,v 1.9 1999/06/08 02:59:27 hershey Exp $ */ #include "tclInt.h" @@ -1291,7 +1291,7 @@ Tcl_SetVar2Ex(interp, part1, part2, newValuePtr, flags) oldValuePtr = varPtr->value.objPtr; Tcl_IncrRefCount(oldValuePtr); /* since var is ref */ } - Tcl_AppendToObj(oldValuePtr, bytes, length); + Tcl_AppendObjToObj(oldValuePtr, newValuePtr); } } } else { |