From 694c4505cd26d54934847d0e5442ae3d5dca36fd Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 2 May 2023 05:38:04 +0000 Subject: Change realloc strategy on fail, optimize for empty string --- generic/tclListObj.c | 88 +++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/generic/tclListObj.c b/generic/tclListObj.c index 87530be..2475c8e 100644 --- a/generic/tclListObj.c +++ b/generic/tclListObj.c @@ -40,7 +40,7 @@ #ifdef ENABLE_LIST_ASSERTS -#define LIST_ASSERT(cond_) assert(cond_) /* TODO - is there a Tcl-specific one? */ +#define LIST_ASSERT(cond_) assert(cond_) /* * LIST_INDEX_ASSERT is to catch errors with negative indices and counts * being passed AFTER validation. On Tcl9 length types are unsigned hence @@ -68,9 +68,7 @@ #endif /* Checks for when caller should have already converted to internal list type */ -#define LIST_ASSERT_TYPE(listObj_) \ - LIST_ASSERT((listObj_)->typePtr == &tclListType.objType); - +#define LIST_ASSERT_TYPE(listObj_) LIST_ASSERT(INTREP_IS_LISTREP(listObj_)) /* * If ENABLE_LIST_INVARIANTS is enabled (-DENABLE_LIST_INVARIANTS from the @@ -305,12 +303,12 @@ ListSpanMerited( Tcl_Size allocatedStorageLength) /* Length of the currently allocation */ { /* - TODO - - heuristics thresholds need to be determined - - currently, information about the sharing (ref count) of existing - storage is not passed. Perhaps it should be. For example if the - existing storage has a "large" ref count, then it might make sense - to do even a small span. + * Possible optimizations for future consideration + * - heuristic LIST_SPAN_THRESHOLD + * - currently, information about the sharing (ref count) of existing + * storage is not passed. Perhaps it should be. For example if the + * existing storage has a "large" ref count, then it might make sense + * to do even a small span. */ if (length < LIST_SPAN_THRESHOLD) { @@ -771,14 +769,16 @@ ListStoreNew( } if (flags & LISTREP_SPACE_FLAGS) { + /* Caller requests extra space front, back or both */ capacity = ListStoreUpSize(objc); } else { capacity = objc; } storePtr = (ListStore *)Tcl_AttemptAlloc(LIST_SIZE(capacity)); - if (storePtr == NULL && capacity != objc) { - capacity = objc; /* Try allocating exact size */ + while (storePtr == NULL && (capacity > (objc+1))) { + /* Because of loop condition capacity won't overflow */ + capacity = objc + ((capacity - objc) / 2); storePtr = (ListStore *)Tcl_AttemptAlloc(LIST_SIZE(capacity)); } if (storePtr == NULL) { @@ -827,7 +827,8 @@ ListStoreNew( * * ListStoreReallocate -- * - * Reallocates the memory for a ListStore. + * Reallocates the memory for a ListStore allocating extra for + * possible future growth. * * Results: * Pointer to the ListStore which may be the same as storePtr or pointer @@ -856,7 +857,7 @@ ListStoreReallocate (ListStore *storePtr, Tcl_Size numSlots) * by half every time. */ while (newStorePtr == NULL && (newCapacity > (numSlots+1))) { - /* Because of loop condition newCapacity can't overflow */ + /* Because of loop condition newCapacity won't overflow */ newCapacity = numSlots + ((newCapacity - numSlots) / 2); newStorePtr = (ListStore *)Tcl_AttemptRealloc(storePtr, LIST_SIZE(newCapacity)); @@ -1960,19 +1961,18 @@ int Tcl_ListObjIndex( Tcl_Interp *interp, /* Used to report errors if not NULL. */ Tcl_Obj *listObj, /* List object to index into. */ - Tcl_Size index, /* Index of element to return. */ + Tcl_Size index, /* Index of element to return. */ Tcl_Obj **objPtrPtr) /* The resulting Tcl_Obj* is stored here. */ { Tcl_Obj **elemObjs; Tcl_Size numElems; - /* - * TODO - * Unlike the original list code, this does not optimize for lindex'ing - * an empty string when the internal rep is not already a list. On the - * other hand, this code will be faster for the case where the object - * is currently a dict. Benchmark the two cases. - */ + /* Empty string => empty list. Avoid unnecessary shimmering */ + if (listObj->bytes == &tclEmptyString) { + *objPtrPtr = NULL; + return TCL_OK; + } + if (TclListObjGetElementsM(interp, listObj, &numElems, &elemObjs) != TCL_OK) { return TCL_ERROR; @@ -2017,19 +2017,19 @@ Tcl_ListObjLength( { ListRep listRep; + /* Empty string => empty list. Avoid unnecessary shimmering */ + if (listObj->bytes == &tclEmptyString) { + *lenPtr = 0; + return TCL_OK; + } + Tcl_Size (*lengthProc)(Tcl_Obj *obj) = ABSTRACTLIST_PROC(listObj, lengthProc); if (lengthProc) { *lenPtr = lengthProc(listObj); return TCL_OK; } - /* - * TODO - * Unlike the original list code, this does not optimize for lindex'ing - * an empty string when the internal rep is not already a list. On the - * other hand, this code will be faster for the case where the object - * is currently a dict. Benchmark the two cases. - */ + if (TclListObjGetRep(interp, listObj, &listRep) != TCL_OK) { return TCL_ERROR; } @@ -2094,12 +2094,12 @@ Tcl_ListObjReplace( { ListRep listRep; Tcl_Size origListLen; - ptrdiff_t lenChange; - ptrdiff_t leadSegmentLen; - ptrdiff_t tailSegmentLen; + Tcl_Size lenChange; + Tcl_Size leadSegmentLen; + Tcl_Size tailSegmentLen; Tcl_Size numFreeSlots; - ptrdiff_t leadShift; - ptrdiff_t tailShift; + Tcl_Size leadShift; + Tcl_Size tailShift; Tcl_Obj **listObjs; int favor; @@ -2110,8 +2110,6 @@ Tcl_ListObjReplace( if (TclListObjGetRep(interp, listObj, &listRep) != TCL_OK) return TCL_ERROR; /* Cannot be converted to a list */ - /* TODO - will need modification if Tcl9 sticks to unsigned indices */ - /* Make limits sane */ origListLen = ListRepLength(&listRep); if (first < 0) { @@ -2260,7 +2258,7 @@ Tcl_ListObjReplace( * be an explicit alloc and memmove which would let us redistribute * free space. */ - if ((ptrdiff_t)numFreeSlots < lenChange && !ListRepIsShared(&listRep)) { + if (numFreeSlots < lenChange && !ListRepIsShared(&listRep)) { /* T:listrep-1.{1,3,14,18,21},3.{3,10,11,14,27,32,41} */ ListStore *newStorePtr = ListStoreReallocate(listRep.storePtr, origListLen + lenChange); @@ -2287,7 +2285,7 @@ Tcl_ListObjReplace( * TODO - for unshared case ONLY, consider a "move" based implementation */ if (ListRepIsShared(&listRep) || /* 3a */ - (ptrdiff_t)numFreeSlots < lenChange || /* 3b */ + numFreeSlots < lenChange || /* 3b */ (origListLen + lenChange) < (listRep.storePtr->numAllocated / 4) /* 3c */ ) { ListRep newRep; @@ -2402,9 +2400,9 @@ Tcl_ListObjReplace( * or need to shift both. In the former case, favor shifting the * smaller segment. */ - ptrdiff_t leadSpace = ListRepNumFreeHead(&listRep); - ptrdiff_t tailSpace = ListRepNumFreeTail(&listRep); - ptrdiff_t finalFreeSpace = leadSpace + tailSpace - lenChange; + Tcl_Size leadSpace = ListRepNumFreeHead(&listRep); + Tcl_Size tailSpace = ListRepNumFreeTail(&listRep); + Tcl_Size finalFreeSpace = leadSpace + tailSpace - lenChange; LIST_ASSERT((leadSpace + tailSpace) >= lenChange); if (leadSpace >= lenChange @@ -2421,7 +2419,7 @@ Tcl_ListObjReplace( * insertions. */ if (finalFreeSpace > 1 && (tailSpace == 0 || tailSegmentLen == 0)) { - ptrdiff_t postShiftLeadSpace = leadSpace - lenChange; + Tcl_Size postShiftLeadSpace = leadSpace - lenChange; if (postShiftLeadSpace > (finalFreeSpace/2)) { Tcl_Size extraShift = postShiftLeadSpace - (finalFreeSpace / 2); leadShift -= extraShift; @@ -2438,7 +2436,7 @@ Tcl_ListObjReplace( * See comments above. This is analogous. */ if (finalFreeSpace > 1 && (leadSpace == 0 || leadSegmentLen == 0)) { - ptrdiff_t postShiftTailSpace = tailSpace - lenChange; + Tcl_Size postShiftTailSpace = tailSpace - lenChange; if (postShiftTailSpace > (finalFreeSpace/2)) { /* T:listrep-1.{1,3,14,18,21},3.{2,3,26,27} */ Tcl_Size extraShift = postShiftTailSpace - (finalFreeSpace / 2); @@ -2613,7 +2611,7 @@ TclLindexList( /* * The argument is neither an index nor a well-formed list. * Report the error via TclLindexFlat. - * TODO - This is as original. why not directly return an error? + * TODO - This is as original code. why not directly return an error? */ return TclLindexFlat(interp, listObj, 1, &argObj); } @@ -3557,7 +3555,7 @@ TclListTestObj(size_t length, size_t leadingSpace, size_t endSpace) return NULL; } - ListRepInit(capacity, NULL, 0, &listRep); + ListRepInit(capacity, NULL, LISTREP_PANIC_ON_FAIL, &listRep); ListStore *storePtr = listRep.storePtr; size_t i; -- cgit v0.12 From 1f5d09b497ec683bca02b1b78b8ada732b43dda9 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Tue, 2 May 2023 11:50:03 +0000 Subject: Fix LISTREP_ASSERT --- generic/tclListObj.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/generic/tclListObj.c b/generic/tclListObj.c index 2475c8e..726b8dd 100644 --- a/generic/tclListObj.c +++ b/generic/tclListObj.c @@ -68,7 +68,8 @@ #endif /* Checks for when caller should have already converted to internal list type */ -#define LIST_ASSERT_TYPE(listObj_) LIST_ASSERT(INTREP_IS_LISTREP(listObj_)) +#define LIST_ASSERT_TYPE(listObj_) \ + LIST_ASSERT(TclHasInternalRep((listObj_), &tclListType.objType)) /* * If ENABLE_LIST_INVARIANTS is enabled (-DENABLE_LIST_INVARIANTS from the -- cgit v0.12 From 88760a8a81789306d3f2b9986b8f9824a4e6d08d Mon Sep 17 00:00:00 2001 From: pooryorick Date: Wed, 3 May 2023 09:31:08 +0000 Subject: Use Tcl_Size instead of ptrdiff_t in Tcl_ObjCmdProc2(), Tcl_CmdObjTraceProc2, and Tcl_MethodCallProc2(). --- doc/CrtObjCmd.3 | 2 +- doc/CrtTrace.3 | 4 ++-- generic/tcl.h | 4 ++-- generic/tclOO.h | 2 +- unix/dltest/pkgt.c | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/CrtObjCmd.3 b/doc/CrtObjCmd.3 index 7ba71eb..4bdde44 100644 --- a/doc/CrtObjCmd.3 +++ b/doc/CrtObjCmd.3 @@ -187,7 +187,7 @@ except its \fIproc2\fR argument is of type \fBTcl_ObjCmdProc2\fR. typedef int \fBTcl_ObjCmdProc2\fR( void *\fIclientData\fR, Tcl_Interp *\fIinterp\fR, - ptrdiff_t \fIobjc\fR, + Tcl_Size \fIobjc\fR, Tcl_Obj *const \fIobjv\fR[]); .CE .PP diff --git a/doc/CrtTrace.3 b/doc/CrtTrace.3 index cfd3303..9f74cbf 100644 --- a/doc/CrtTrace.3 +++ b/doc/CrtTrace.3 @@ -88,10 +88,10 @@ typedef int \fBTcl_CmdObjTraceProc\fR( typedef int \fBTcl_CmdObjTraceProc2\fR( \fBvoid *\fR \fIclientData\fR, \fBTcl_Interp\fR* \fIinterp\fR, - ptrdiff_t \fIlevel\fR, + Tcl_Size \fIlevel\fR, const char *\fIcommand\fR, \fBTcl_Command\fR \fIcommandToken\fR, - ptrdiff_t \fIobjc\fR, + Tcl_Size \fIobjc\fR, \fBTcl_Obj\fR *const \fIobjv\fR[]); .CE .PP diff --git a/generic/tcl.h b/generic/tcl.h index 0e647db..d75da06 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -561,7 +561,7 @@ typedef int (Tcl_CmdObjTraceProc) (void *clientData, Tcl_Interp *interp, int level, const char *command, Tcl_Command commandInfo, int objc, struct Tcl_Obj *const *objv); typedef int (Tcl_CmdObjTraceProc2) (void *clientData, Tcl_Interp *interp, - ptrdiff_t level, const char *command, Tcl_Command commandInfo, ptrdiff_t objc, + Tcl_Size level, const char *command, Tcl_Command commandInfo, Tcl_Size objc, struct Tcl_Obj *const *objv); typedef void (Tcl_CmdObjTraceDeleteProc) (void *clientData); typedef void (Tcl_DupInternalRepProc) (struct Tcl_Obj *srcPtr, @@ -586,7 +586,7 @@ typedef void (Tcl_NamespaceDeleteProc) (void *clientData); typedef int (Tcl_ObjCmdProc) (void *clientData, Tcl_Interp *interp, int objc, struct Tcl_Obj *const *objv); typedef int (Tcl_ObjCmdProc2) (void *clientData, Tcl_Interp *interp, - ptrdiff_t objc, struct Tcl_Obj *const *objv); + Tcl_Size objc, struct Tcl_Obj *const *objv); typedef int (Tcl_LibraryInitProc) (Tcl_Interp *interp); typedef int (Tcl_LibraryUnloadProc) (Tcl_Interp *interp, int flags); typedef void (Tcl_PanicProc) (const char *format, ...); diff --git a/generic/tclOO.h b/generic/tclOO.h index 775bd32..524acb9 100644 --- a/generic/tclOO.h +++ b/generic/tclOO.h @@ -63,7 +63,7 @@ typedef struct Tcl_ObjectContext_ *Tcl_ObjectContext; typedef int (Tcl_MethodCallProc)(void *clientData, Tcl_Interp *interp, Tcl_ObjectContext objectContext, int objc, Tcl_Obj *const *objv); typedef int (Tcl_MethodCallProc2)(void *clientData, Tcl_Interp *interp, - Tcl_ObjectContext objectContext, ptrdiff_t objc, Tcl_Obj *const *objv); + Tcl_ObjectContext objectContext, Tcl_Size objc, Tcl_Obj *const *objv); typedef void (Tcl_MethodDeleteProc)(void *clientData); typedef int (Tcl_CloneProc)(Tcl_Interp *interp, void *oldClientData, void **newClientData); diff --git a/unix/dltest/pkgt.c b/unix/dltest/pkgt.c index 77e21ac..1f326f5 100644 --- a/unix/dltest/pkgt.c +++ b/unix/dltest/pkgt.c @@ -16,10 +16,10 @@ static int TraceProc2 ( void *clientData, Tcl_Interp *interp, - ptrdiff_t level, + Tcl_Size level, const char *command, Tcl_Command commandInfo, - ptrdiff_t objc, + Tcl_Size objc, struct Tcl_Obj *const *objv) { (void)clientData; @@ -55,12 +55,12 @@ static int Pkgt_EqObjCmd2( void *dummy, /* Not used. */ Tcl_Interp *interp, /* Current interpreter. */ - ptrdiff_t objc, /* Number of arguments. */ + Tcl_Size objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { Tcl_WideInt result; const char *str1, *str2; - ptrdiff_t len1, len2; + Tcl_Size len1, len2; (void)dummy; if (objc != 3) { -- cgit v0.12 From 06cc962945bd148c0fb2a63ab23e6e3577945929 Mon Sep 17 00:00:00 2001 From: apnadkarni Date: Wed, 3 May 2023 10:01:50 +0000 Subject: Comments only. Fix blatantly obsolete ones --- generic/tclStringObj.c | 36 +++++++++++++++++------------------- generic/tclStringRep.h | 48 +++++++++++++----------------------------------- 2 files changed, 30 insertions(+), 54 deletions(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 63e38bb..d2bc1b2 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1,29 +1,27 @@ /* * tclStringObj.c -- * - * This file contains functions that implement string operations on Tcl - * objects. Some string operations work with UTF strings and others - * require Unicode format. Functions that require knowledge of the width - * of each character, such as indexing, operate on Unicode data. - * - * A Unicode string is an internationalized string. Conceptually, a - * Unicode string is an array of 16-bit quantities organized as a - * sequence of properly formed UTF-8 characters. There is a one-to-one - * map between Unicode and UTF characters. Because Unicode characters - * have a fixed width, operations such as indexing operate on Unicode - * data. The String object is optimized for the case where each UTF char + * This file contains functions that implement string operations on Tcl + * objects. Some string operations work with UTF-8 encoding forms. + * Functions that require knowledge of the width of each character, + * such as indexing, operate on fixed width encoding forms such as UTF-32. + * + * Conceptually, a string is a sequence of Unicode code points. Internally + * it may be stored in an encoding form such as a modified version of + * UTF-8 or UTF-16 (when TCL_UTF_MAX=3) or UTF-32. + * + * The String object is optimized for the case where each UTF char * in a string is only one byte. In this case, we store the value of - * numChars, but we don't store the Unicode data (unless Tcl_GetUnicode - * is explicitly called). + * numChars, but we don't store the fixed form encoding (unless + * Tcl_GetUnicode is explicitly called). * - * The String object type stores one or both formats. The default - * behavior is to store UTF. Once Unicode is calculated by a function, it - * is stored in the internal rep for future access (without an additional - * O(n) cost). + * The String object type stores one or both formats. The default + * behavior is to store UTF-8. Once UTF-16/UTF32 is calculated, it is + * stored in the internal rep for future access (without an additional + * O(n) cost). * * To allow many appends to be done to an object without constantly - * reallocating the space for the string or Unicode representation, we - * allocate double the space for the string or Unicode and use the + * reallocating space, we allocate double the space and use the * internal representation to keep track of how much space is used vs. * allocated. * diff --git a/generic/tclStringRep.h b/generic/tclStringRep.h index d0c76cb..7f72b04 100644 --- a/generic/tclStringRep.h +++ b/generic/tclStringRep.h @@ -1,29 +1,12 @@ /* * tclStringRep.h -- * - * This file contains the definition of the Unicode string internal - * representation and macros to access it. + * This file contains the definition of internal representations of a string + * and macros to access it. * - * A Unicode string is an internationalized string. Conceptually, a - * Unicode string is an array of 16-bit quantities organized as a - * sequence of properly formed UTF-8 characters. There is a one-to-one - * map between Unicode and UTF characters. Because Unicode characters - * have a fixed width, operations such as indexing operate on Unicode - * data. The String object is optimized for the case where each UTF char - * in a string is only one byte. In this case, we store the value of - * numChars, but we don't store the Unicode data (unless Tcl_GetUnicode - * is explicitly called). - * - * The String object type stores one or both formats. The default - * behavior is to store UTF. Once Unicode is calculated by a function, it - * is stored in the internal rep for future access (without an additional - * O(n) cost). - * - * To allow many appends to be done to an object without constantly - * reallocating the space for the string or Unicode representation, we - * allocate double the space for the string or Unicode and use the - * internal representation to keep track of how much space is used vs. - * allocated. + * Conceptually, a string is a sequence of Unicode code points. Internally + * it may be stored in an encoding form such as a modified version of UTF-8 + * or UTF-16 (when TCL_UTF_MAX=3) or UTF-32. * * Copyright (c) 1995-1997 Sun Microsystems, Inc. * Copyright (c) 1999 by Scriptics Corporation. @@ -39,15 +22,10 @@ /* * The following structure is the internal rep for a String object. It keeps * track of how much memory has been used and how much has been allocated for - * the Unicode and UTF string to enable growing and shrinking of the UTF and - * Unicode reps of the String object with fewer mallocs. To optimize string + * the various representations to enable growing and shrinking of + * the String object with fewer mallocs. To optimize string * length and indexing operations, this structure also stores the number of - * characters (same of UTF and Unicode!) once that value has been computed. - * - * Under normal configurations, what Tcl calls "Unicode" is actually UTF-16 - * restricted to the Basic Multilingual Plane (i.e. U+00000 to U+0FFFF). This - * can be officially modified by altering the definition of Tcl_UniChar in - * tcl.h, but do not do that unless you are sure what you're doing! + * code points (independent of encoding form) once that value has been computed. */ typedef struct { @@ -57,15 +35,15 @@ typedef struct { * Unicode rep, or that the number of UTF bytes == * the number of chars. */ Tcl_Size allocated; /* The amount of space actually allocated for - * the UTF string (minus 1 byte for the + * the UTF-8 string (minus 1 byte for the * termination char). */ Tcl_Size maxChars; /* Max number of chars that can fit in the * space allocated for the Unicode array. */ int hasUnicode; /* Boolean determining whether the string has - * a Unicode representation. */ - Tcl_UniChar unicode[TCLFLEXARRAY]; /* The array of Unicode chars. The actual size - * of this field depends on the 'maxChars' - * field above. */ + * a Tcl_UniChar representation. */ + Tcl_UniChar unicode[TCLFLEXARRAY]; /* The array of Tcl_UniChar units. + * The actual size of this field depends on + * the maxChars field above. */ } String; /* Limit on string lengths. The -1 because limit does not include the nul */ -- cgit v0.12 From d40bb9d13a5a6e2607f313df4bd986886f1ef8d7 Mon Sep 17 00:00:00 2001 From: pointsman Date: Wed, 3 May 2023 15:18:17 +0000 Subject: Removed the bug specific test constraint from the tests related to [0306a5563c] because the bug is fixed. --- tests/bigdata.test | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/bigdata.test b/tests/bigdata.test index 08556dd..0f72be7 100644 --- a/tests/bigdata.test +++ b/tests/bigdata.test @@ -1114,7 +1114,6 @@ bigtestRO concat-bigdata-1 "concat" {4294967296 {0 1 2 3 4} {6 7 0 1 2} {3 4 5 6 test puts-bigdata-1 "puts" -setup { set fpath [makeFile {} bug-0306a5563.data] } -constraints { - bug0306a5563 bigdata } -body { set fd [open $fpath w] @@ -1130,7 +1129,6 @@ test puts-bigdata-1 "puts" -setup { test puts-bigdata-2 "puts" -setup { set fpath [tcltest::makeFile {} bug-0306a5563.data] } -constraints { - bug0306a5563 bigdata } -body { set fd [open $fpath w] -- cgit v0.12 From b583ea7360808cb502d1ea65954ab0387ebdd823 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 3 May 2023 15:51:10 +0000 Subject: Backport "Comments only. Fix blatantly obsolete ones". And a few more improvements from the same files. --- generic/tclStringObj.c | 95 +++++++++++++++++++++++++++++--------------------- generic/tclStringRep.h | 44 ++++++++--------------- 2 files changed, 70 insertions(+), 69 deletions(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index d3a17d1..7fbf77a 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1,29 +1,27 @@ /* * tclStringObj.c -- * - * This file contains functions that implement string operations on Tcl - * objects. Some string operations work with UTF strings and others - * require Unicode format. Functions that require knowledge of the width - * of each character, such as indexing, operate on Unicode data. - * - * A Unicode string is an internationalized string. Conceptually, a - * Unicode string is an array of 16-bit quantities organized as a - * sequence of properly formed UTF-8 characters. There is a one-to-one - * map between Unicode and UTF characters. Because Unicode characters - * have a fixed width, operations such as indexing operate on Unicode - * data. The String object is optimized for the case where each UTF char + * This file contains functions that implement string operations on Tcl + * objects. Some string operations work with UTF-8 encoding forms. + * Functions that require knowledge of the width of each character, + * such as indexing, operate on fixed width encoding forms such as UTF-32. + * + * Conceptually, a string is a sequence of Unicode code points. Internally + * it may be stored in an encoding form such as a modified version of + * UTF-8 or UTF-16 (when TCL_UTF_MAX=3) or UTF-32. + * + * The String object is optimized for the case where each UTF char * in a string is only one byte. In this case, we store the value of - * numChars, but we don't store the Unicode data (unless Tcl_GetUnicode - * is explicitly called). + * numChars, but we don't store the fixed form encoding (unless + * Tcl_GetUnicode is explicitly called). * - * The String object type stores one or both formats. The default - * behavior is to store UTF. Once Unicode is calculated by a function, it - * is stored in the internal rep for future access (without an additional - * O(n) cost). + * The String object type stores one or both formats. The default + * behavior is to store UTF-8. Once UTF-16/UTF32 is calculated, it is + * stored in the internal rep for future access (without an additional + * O(n) cost). * * To allow many appends to be done to an object without constantly - * reallocating the space for the string or Unicode representation, we - * allocate double the space for the string or Unicode and use the + * reallocating space, we allocate double the space and use the * internal representation to keep track of how much space is used vs. * allocated. * @@ -37,7 +35,6 @@ #include "tclInt.h" #include "tclTomMath.h" #include "tclStringRep.h" - #include "assert.h" /* * Prototypes for functions defined later in this file: @@ -631,10 +628,8 @@ TclGetCharLength( */ if (TclIsPureByteArray(objPtr)) { - int length; - - (void) Tcl_GetByteArrayFromObj(objPtr, &length); - return length; + (void) Tcl_GetByteArrayFromObj(objPtr, &numChars); + return numChars; } /* @@ -675,10 +670,10 @@ Tcl_GetCharLength( } /* - * Optimize BytArray case: No need to convert to a string to perform the - * get-length operation. + * Optimize the case where we're really dealing with a bytearray object; + * we don't need to convert to a string to perform the get-length operation. * - * Starting in Tcl 8.7, check for a "pure" bytearray, because the + * Starting in Tcl 8.7, we check for a "pure" bytearray, because the * machinery behind that test is using a proper bytearray ObjType. We * could also compute length of an improper bytearray without shimmering * but there's no value in that. We *want* to shimmer an improper bytearray @@ -686,16 +681,17 @@ Tcl_GetCharLength( */ if (TclIsPureByteArray(objPtr)) { - (void) Tcl_GetByteArrayFromObj(objPtr, &numChars); } else { Tcl_GetString(objPtr); numChars = Tcl_NumUtfChars(objPtr->bytes, objPtr->length); } + return numChars; } #endif + /* *---------------------------------------------------------------------- * @@ -722,6 +718,11 @@ TclCheckEmptyString( return TCL_EMPTYSTRING_YES; } + if (TclIsPureByteArray(objPtr) + && Tcl_GetCharLength(objPtr) == 0) { + return TCL_EMPTYSTRING_YES; + } + if (TclListObjIsCanonical(objPtr)) { TclListObjLengthM(NULL, objPtr, &length); return length == 0; @@ -2383,12 +2384,16 @@ Tcl_AppendFormatToObj( width = 0; if (isdigit(UCHAR(ch))) { - width = strtoul(format, &end, 10); - if (width < 0) { + /* Note ull will be >= 0 because of isdigit check above */ + unsigned long long ull; + ull = strtoull(format, &end, 10); + /* Comparison is >=, not >, to leave room for nul */ + if (ull >= WIDE_MAX) { msg = overflow; errCode = "OVERFLOW"; goto errorMsg; } + width = (Tcl_WideInt)ull; format = end; step = TclUtfToUniChar(format, &ch); } else if (ch == '*') { @@ -2425,7 +2430,16 @@ Tcl_AppendFormatToObj( step = TclUtfToUniChar(format, &ch); } if (isdigit(UCHAR(ch))) { - precision = strtoul(format, &end, 10); + /* Note ull will be >= 0 because of isdigit check above */ + unsigned long long ull; + ull = strtoull(format, &end, 10); + /* Comparison is >=, not >, to leave room for nul */ + if (ull >= WIDE_MAX) { + msg = overflow; + errCode = "OVERFLOW"; + goto errorMsg; + } + precision = (Tcl_WideInt)ull; format = end; step = TclUtfToUniChar(format, &ch); } else if (ch == '*') { @@ -2531,6 +2545,9 @@ Tcl_AppendFormatToObj( if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) { goto error; } + if ((unsigned)code > 0x10FFFF) { + code = 0xFFFD; + } length = Tcl_UniCharToUtf(code, buf); if ((code >= 0xD800) && (length < 3)) { /* Special case for handling high surrogates. */ @@ -3875,6 +3892,7 @@ TclStringCmp( if ((reqlength == 0) || (value1Ptr == value2Ptr)) { /* * Always match at 0 chars of if it is the same obj. + * Note: as documented reqlength negative means it is ignored */ match = 0; } else { @@ -4006,15 +4024,15 @@ TclStringCmp( * comparison function. */ length = (s1len < s2len) ? s1len : s2len; - if (reqlength > 0 && reqlength < length) { - length = reqlength; - } else if (reqlength < 0) { + if (reqlength < 0) { /* * The requested length is negative, so ignore it by setting it * to length + 1 to correct the match var. */ reqlength = length + 1; + } else if (reqlength > 0 && reqlength < length) { + length = reqlength; } if (checkEq && reqlength < 0 && (s1len != s2len)) { @@ -4452,18 +4470,17 @@ TclStringReplace( int inPlace = flags & TCL_STRING_IN_PLACE; Tcl_Obj *result; - /* Caller is expected to pass sensible arguments */ - assert ( count >= 0 ) ; - assert ( first >= 0 ) ; - /* Replace nothing with nothing */ - if ((insertPtr == NULL) && (count == 0)) { + if ((insertPtr == NULL) && (count <= 0)) { if (inPlace) { return objPtr; } else { return Tcl_DuplicateObj(objPtr); } } + if (first < 0) { + first = 0; + } /* * The caller very likely had to call Tcl_GetCharLength() or similar diff --git a/generic/tclStringRep.h b/generic/tclStringRep.h index 0219a00..ef64d6c 100644 --- a/generic/tclStringRep.h +++ b/generic/tclStringRep.h @@ -1,29 +1,12 @@ /* * tclStringRep.h -- * - * This file contains the definition of the Unicode string internal - * representation and macros to access it. + * This file contains the definition of internal representations of a string + * and macros to access it. * - * A Unicode string is an internationalized string. Conceptually, a - * Unicode string is an array of 16-bit quantities organized as a - * sequence of properly formed UTF-8 characters. There is a one-to-one - * map between Unicode and UTF characters. Because Unicode characters - * have a fixed width, operations such as indexing operate on Unicode - * data. The String object is optimized for the case where each UTF char - * in a string is only one byte. In this case, we store the value of - * numChars, but we don't store the Unicode data (unless Tcl_GetUnicode - * is explicitly called). - * - * The String object type stores one or both formats. The default - * behavior is to store UTF. Once Unicode is calculated by a function, it - * is stored in the internal rep for future access (without an additional - * O(n) cost). - * - * To allow many appends to be done to an object without constantly - * reallocating the space for the string or Unicode representation, we - * allocate double the space for the string or Unicode and use the - * internal representation to keep track of how much space is used vs. - * allocated. + * Conceptually, a string is a sequence of Unicode code points. Internally + * it may be stored in an encoding form such as a modified version of UTF-8 + * or UTF-16 (when TCL_UTF_MAX=3) or UTF-32. * * Copyright (c) 1995-1997 Sun Microsystems, Inc. * Copyright (c) 1999 by Scriptics Corporation. @@ -39,10 +22,10 @@ /* * The following structure is the internal rep for a String object. It keeps * track of how much memory has been used and how much has been allocated for - * the Unicode and UTF string to enable growing and shrinking of the UTF and - * Unicode reps of the String object with fewer mallocs. To optimize string + * the various representations to enable growing and shrinking of + * the String object with fewer mallocs. To optimize string * length and indexing operations, this structure also stores the number of - * characters (same of UTF and Unicode!) once that value has been computed. + * code points (independent of encoding form) once that value has been computed. */ typedef struct { @@ -52,17 +35,18 @@ typedef struct { * Unicode rep, or that the number of UTF bytes == * the number of chars. */ Tcl_Size allocated; /* The amount of space actually allocated for - * the UTF string (minus 1 byte for the + * the UTF-8 string (minus 1 byte for the * termination char). */ Tcl_Size maxChars; /* Max number of chars that can fit in the * space allocated for the Unicode array. */ int hasUnicode; /* Boolean determining whether the string has - * a Unicode representation. */ - unsigned short unicode[TCLFLEXARRAY]; /* The array of Unicode chars. The actual size - * of this field depends on the 'maxChars' - * field above. */ + * a Tcl_UniChar representation. */ + unsigned short unicode[TCLFLEXARRAY]; /* The array of Tcl_UniChar units. + * The actual size of this field depends on + * the maxChars field above. */ } String; +/* Limit on string lengths. The -1 because limit does not include the nul */ #define STRING_MAXCHARS \ (int)(((size_t)UINT_MAX - offsetof(String, unicode))/sizeof(unsigned short) - 1) #define STRING_SIZE(numChars) \ -- cgit v0.12 From e2f55284cf4bfdca0a8c1b971e9ca8895ce415e9 Mon Sep 17 00:00:00 2001 From: dkf Date: Thu, 4 May 2023 08:24:01 +0000 Subject: Do not build every branch. Too expensive. --- .github/workflows/linux-build.yml | 9 ++++++++- .github/workflows/mac-build.yml | 9 ++++++++- .github/workflows/onefiledist.yml | 9 ++++++++- .github/workflows/win-build.yml | 9 ++++++++- 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index 7ba9e89..f8c30da 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -1,5 +1,12 @@ name: Linux -on: [push] +on: + push: + branches: + - "main" + - "trunk" + - "core-8-branch" + tags: + - "core-**" permissions: contents: read jobs: diff --git a/.github/workflows/mac-build.yml b/.github/workflows/mac-build.yml index a9345a1..b5a586e 100644 --- a/.github/workflows/mac-build.yml +++ b/.github/workflows/mac-build.yml @@ -1,5 +1,12 @@ name: macOS -on: [push] +on: + push: + branches: + - "main" + - "trunk" + - "core-8-branch" + tags: + - "core-**" permissions: contents: read jobs: diff --git a/.github/workflows/onefiledist.yml b/.github/workflows/onefiledist.yml index 1f75762..c6277af 100644 --- a/.github/workflows/onefiledist.yml +++ b/.github/workflows/onefiledist.yml @@ -1,5 +1,12 @@ name: Build Binaries -on: [push] +on: + push: + branches: + - "main" + - "trunk" + - "core-8-branch" + tags: + - "core-**" permissions: contents: read jobs: diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml index ba4e5ba..9773c2a 100644 --- a/.github/workflows/win-build.yml +++ b/.github/workflows/win-build.yml @@ -1,5 +1,12 @@ name: Windows -on: [push] +on: + push: + branches: + - "main" + - "trunk" + - "core-8-branch" + tags: + - "core-**" permissions: contents: read env: -- cgit v0.12 From af320fb41e62980c0c988a377c138ac14757ad81 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 4 May 2023 12:53:45 +0000 Subject: Fix [9c397da4a9]: AppendPrintfToObjVA(): support 64-bit pointers --- generic/tcl.h | 2 +- generic/tclStringObj.c | 8 ++++++-- generic/tclStringRep.h | 2 +- tests/tailcall.test | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/generic/tcl.h b/generic/tcl.h index 318c7a1..2b6c947 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2176,7 +2176,7 @@ typedef struct Tcl_EncodingType { #define TCL_ENCODING_CHAR_LIMIT 0x10 /* Internal use bits, do not define bits in this space. See above comment */ #define TCL_ENCODING_INTERNAL_USE_MASK 0xFF00 -/* +/* * Reserve top byte for profile values (disjoint, not a mask). In case of * changes, ensure ENCODING_PROFILE_* macros in tclInt.h are modified if * necessary. diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 7fbf77a..0e47487 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -249,7 +249,7 @@ UpdateStringOfUTF16String( #endif #endif - + /* * TCL STRING GROWTH ALGORITHM * @@ -3130,12 +3130,16 @@ AppendPrintfToObjVA( break; } + case 'p': + if (sizeof(size_t) == sizeof(Tcl_WideInt)) { + size = 2; + } + /* FALLTHRU */ case 'c': case 'i': case 'u': case 'd': case 'o': - case 'p': case 'x': case 'X': seekingConversion = 0; diff --git a/generic/tclStringRep.h b/generic/tclStringRep.h index ef64d6c..d1863fb 100644 --- a/generic/tclStringRep.h +++ b/generic/tclStringRep.h @@ -22,7 +22,7 @@ /* * The following structure is the internal rep for a String object. It keeps * track of how much memory has been used and how much has been allocated for - * the various representations to enable growing and shrinking of + * the various representations to enable growing and shrinking of * the String object with fewer mallocs. To optimize string * length and indexing operations, this structure also stores the number of * code points (independent of encoding form) once that value has been computed. diff --git a/tests/tailcall.test b/tests/tailcall.test index c9ec674..0016845 100644 --- a/tests/tailcall.test +++ b/tests/tailcall.test @@ -709,7 +709,7 @@ test tailcall-14.1-bc {{in a deleted namespace} {byte compiled}} -body { } -returnCodes 1 -result {namespace "::ns" not found} test tailcall-bug-784befb0ba {tailcall crash with 254 args} -body { - proc tccrash args {llength $args} + proc tccrash args {llength $args} # Must be EXACTLY 254 for crash proc p {} [list tailcall tccrash {*}[lrepeat 254 x]] p -- cgit v0.12 From e51f7da2ee3eeb2de5ed29bffef7c4fd20bc2791 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 4 May 2023 12:58:41 +0000 Subject: Backport: Do not build every branch. Too expensive. --- .github/workflows/linux-build.yml | 9 ++++++++- .github/workflows/mac-build.yml | 9 ++++++++- .github/workflows/onefiledist.yml | 9 ++++++++- .github/workflows/win-build.yml | 9 ++++++++- 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index 7ba9e89..f8c30da 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -1,5 +1,12 @@ name: Linux -on: [push] +on: + push: + branches: + - "main" + - "trunk" + - "core-8-branch" + tags: + - "core-**" permissions: contents: read jobs: diff --git a/.github/workflows/mac-build.yml b/.github/workflows/mac-build.yml index a9345a1..b5a586e 100644 --- a/.github/workflows/mac-build.yml +++ b/.github/workflows/mac-build.yml @@ -1,5 +1,12 @@ name: macOS -on: [push] +on: + push: + branches: + - "main" + - "trunk" + - "core-8-branch" + tags: + - "core-**" permissions: contents: read jobs: diff --git a/.github/workflows/onefiledist.yml b/.github/workflows/onefiledist.yml index 1f75762..c6277af 100644 --- a/.github/workflows/onefiledist.yml +++ b/.github/workflows/onefiledist.yml @@ -1,5 +1,12 @@ name: Build Binaries -on: [push] +on: + push: + branches: + - "main" + - "trunk" + - "core-8-branch" + tags: + - "core-**" permissions: contents: read jobs: diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml index ba4e5ba..9773c2a 100644 --- a/.github/workflows/win-build.yml +++ b/.github/workflows/win-build.yml @@ -1,5 +1,12 @@ name: Windows -on: [push] +on: + push: + branches: + - "main" + - "trunk" + - "core-8-branch" + tags: + - "core-**" permissions: contents: read env: -- cgit v0.12