From 3a3fd2954f11d13424d941346c5ede39c3bf175f Mon Sep 17 00:00:00 2001 From: pooryorick Date: Sat, 5 May 2018 16:38:27 +0000 Subject: Avoid generating string representation when comparing the empty string. --- generic/tclExecute.c | 56 +++++++++++++++++++++++++++++++++++++++++++++----- generic/tclInt.h | 10 +++++++++ generic/tclStringObj.c | 45 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 5 deletions(-) diff --git a/generic/tclExecute.c b/generic/tclExecute.c index af44323..728a847 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5440,22 +5440,58 @@ TEBCresume( } } else { /* - * strcmp can't do a simple memcmp in order to handle the - * special Tcl \xC0\x80 null encoding for utf-8. + * In order to handle the special Tcl \xC0\x80 null encoding + * for utf-8, strcmp can't do a simple memcmp. */ - s1 = TclGetStringFromObj(valuePtr, &s1len); - s2 = TclGetStringFromObj(value2Ptr, &s2len); + { + int empty; + if ((empty = TclCheckEmptyString(valuePtr)) > 0) { + switch (TclCheckEmptyString(value2Ptr)) { + case -1: + s1 = ""; + s1len = 0; + s2 = TclGetStringFromObj(value2Ptr, &s2len); + break; + case 0: + match = -1; + goto matchdone; + case 1: + match = 0; + goto matchdone; + } + } else if (TclCheckEmptyString(value2Ptr) > 0) { + switch (empty) { + case -1: + s2 = ""; + s2len = 0; + s1 = TclGetStringFromObj(valuePtr, &s1len); + break; + case 0: + match = 1; + goto matchdone; + case 1: + match = 0; + goto matchdone; + } + } else { + s1 = TclGetStringFromObj(valuePtr, &s1len); + s2 = TclGetStringFromObj(value2Ptr, &s2len); + } + } + + if (checkEq) { memCmpFn = memcmp; } else { memCmpFn = (memCmpFn_t) TclpUtfNcmp2; } + } if (checkEq && (s1len != s2len)) { match = 1; - } else { + } else { /* * The comparison function should compare up to the minimum * byte length only. @@ -5468,6 +5504,8 @@ TEBCresume( } } + matchdone: + /* * Make sure only -1,0,1 is returned * TODO: consider peephole opt. @@ -6142,6 +6180,14 @@ TEBCresume( value2Ptr = OBJ_AT_TOS; valuePtr = OBJ_UNDER_TOS; + /* + Try to determine, without triggering generation of a string + representation, whether one value is not a number. + */ + if (TclCheckEmptyString(valuePtr) > 0 || TclCheckEmptyString(value2Ptr) > 0) { + goto stringCompare; + } + if (GetNumberFromObj(NULL, valuePtr, &ptr1, &type1) != TCL_OK || GetNumberFromObj(NULL, value2Ptr, &ptr2, &type2) != TCL_OK) { /* diff --git a/generic/tclInt.h b/generic/tclInt.h index 4db4576..4bdaf58 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -2735,6 +2735,10 @@ MODULE_SCOPE long tclObjsShared[TCL_MAX_SHARED_OBJ_STATS]; MODULE_SCOPE char * tclEmptyStringRep; MODULE_SCOPE char tclEmptyString; +enum CheckEmptyStringResult { + TCL_EMPTYSTRING_UNKNOWN = -1, TCL_EMPTYSTRING_NO, TCL_EMPTYSTRING_YES +}; + /* *---------------------------------------------------------------- * Procedures shared among Tcl modules but not used by the outside world, @@ -2875,6 +2879,7 @@ MODULE_SCOPE int TclCheckArrayTraces(Tcl_Interp *interp, Var *varPtr, Var *arrayPtr, Tcl_Obj *name, int index); MODULE_SCOPE int TclCheckBadOctal(Tcl_Interp *interp, const char *value); +MODULE_SCOPE int TclCheckEmptyString(Tcl_Obj *objPtr); MODULE_SCOPE int TclChanCaughtErrorBypass(Tcl_Interp *interp, Tcl_Channel chan); MODULE_SCOPE Tcl_ObjCmdProc TclChannelNamesCmd; @@ -4455,6 +4460,11 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file, #define TclIsPureByteArray(objPtr) \ (((objPtr)->typePtr==&tclByteArrayType) && ((objPtr)->bytes==NULL)) +#define TclIsPureDict(objPtr) \ + (((objPtr)->bytes==NULL) && ((objPtr)->typePtr==&tclDictType)) + +#define TclIsPureList(objPtr) \ + (((objPtr)->bytes==NULL) && ((objPtr)->typePtr==&tclListType)) /* *---------------------------------------------------------------- diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 1b35c56..a503392 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -434,6 +434,7 @@ Tcl_GetCharLength( return length; } + /* * OK, need to work with the object as a string. */ @@ -464,6 +465,50 @@ Tcl_GetCharLength( } return numChars; } + + + +/* + *---------------------------------------------------------------------- + * + * TclCheckEmptyString -- + * + * Determine whether the string value of an object is or would be the + * empty string, without generating a string representation. + * + * Results: + * Returns 1 if empty, 0 if not, and -1 if unknown. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ +int +TclCheckEmptyString ( + Tcl_Obj *objPtr +) { + int length = -1; + + if (objPtr->bytes == tclEmptyStringRep) { + return TCL_EMPTYSTRING_YES; + } + + if (TclIsPureList(objPtr)) { + Tcl_ListObjLength(NULL, objPtr, &length); + return length == 0; + } + + if (TclIsPureDict(objPtr)) { + Tcl_DictObjSize(NULL, objPtr, &length); + return length == 0; + } + + if (objPtr->bytes == NULL) { + return TCL_EMPTYSTRING_UNKNOWN; + } + return objPtr->length == 0; +} /* *---------------------------------------------------------------------- -- cgit v0.12 From 86c6a11cdb30d7f9e34ca6882a9882556d0aace2 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Sun, 6 May 2018 13:45:21 +0000 Subject: Preparation to deduplicate code between byte-compiled and legacy implementations of [string compare]. --- generic/tclCmdMZ.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++ generic/tclExecute.c | 120 +------------------------------------------------- generic/tclInt.h | 5 +++ 3 files changed, 129 insertions(+), 118 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index b686330..9520f86 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2865,6 +2865,128 @@ StringCmpCmd( return TCL_OK; } +int TclStringCmp ( + Tcl_Obj *value1Ptr, + Tcl_Obj *value2Ptr, + int checkEq +) { + char *s1, *s2; + int empty, match, s1len, s2len; + memCmpFn_t memCmpFn; + + /* + * When we have equal-length we can check only for (in)equality. + * We can use memcmp in all (n)eq cases because we + * don't need to worry about lexical LE/BE variance. + */ + + if (value1Ptr == value2Ptr) { + match = 0; + } else { + if (TclIsPureByteArray(value1Ptr) + && TclIsPureByteArray(value2Ptr)) { + s1 = (char *) Tcl_GetByteArrayFromObj(value1Ptr, &s1len); + s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len); + memCmpFn = memcmp; + } else if ((value1Ptr->typePtr == &tclStringType) + && (value2Ptr->typePtr == &tclStringType)) { + /* + * Do a unicode-specific comparison if both of the args are of + * String type. If the char length == byte length, we can do a + * memcmp. In benchmark testing this proved the most efficient + * check between the unicode and string comparison operations. + */ + + s1len = Tcl_GetCharLength(value1Ptr); + s2len = Tcl_GetCharLength(value2Ptr); + if ((s1len == value1Ptr->length) + && (value1Ptr->bytes != NULL) + && (s2len == value2Ptr->length) + && (value2Ptr->bytes != NULL)) { + s1 = value1Ptr->bytes; + s2 = value2Ptr->bytes; + memCmpFn = memcmp; + } else { + s1 = (char *) Tcl_GetUnicode(value1Ptr); + s2 = (char *) Tcl_GetUnicode(value2Ptr); + if ( +#ifdef WORDS_BIGENDIAN + 1 +#else + checkEq +#endif + ) { + memCmpFn = memcmp; + s1len *= sizeof(Tcl_UniChar); + s2len *= sizeof(Tcl_UniChar); + } else { + memCmpFn = (memCmpFn_t) Tcl_UniCharNcmp; + } + } + } else { + /* + * In order to handle the special Tcl \xC0\x80 null encoding + * for utf-8, strcmp can't do a simple memcmp. + */ + + if ((empty = TclCheckEmptyString(value1Ptr)) > 0) { + switch (TclCheckEmptyString(value2Ptr)) { + case -1: + s1 = ""; + s1len = 0; + s2 = TclGetStringFromObj(value2Ptr, &s2len); + break; + case 0: + match = -1; + goto matchdone; + case 1: + match = 0; + goto matchdone; + } + } else if (TclCheckEmptyString(value2Ptr) > 0) { + switch (empty) { + case -1: + s2 = ""; + s2len = 0; + s1 = TclGetStringFromObj(value1Ptr, &s1len); + break; + case 0: + match = 1; + goto matchdone; + case 1: + match = 0; + goto matchdone; + } + } else { + s1 = TclGetStringFromObj(value1Ptr, &s1len); + s2 = TclGetStringFromObj(value2Ptr, &s2len); + } + + if (checkEq) { + memCmpFn = memcmp; + } else { + memCmpFn = (memCmpFn_t) TclpUtfNcmp2; + } + } + + if (checkEq && (s1len != s2len)) { + match = 1; + } else { + /* + * The comparison function should compare up to the minimum + * byte length only. + */ + match = memCmpFn(s1, s2, + (size_t) ((s1len < s2len) ? s1len : s2len)); + if (match == 0) { + match = s1len - s2len; + } + } + } + matchdone: + return match; +} + /* *---------------------------------------------------------------------- * diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 728a847..a76e686 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5384,128 +5384,12 @@ TEBCresume( value2Ptr = OBJ_AT_TOS; valuePtr = OBJ_UNDER_TOS; - if (valuePtr == value2Ptr) { - match = 0; - } else { - /* - * We only need to check (in)equality when we have equal length - * strings. We can use memcmp in all (n)eq cases because we - * don't need to worry about lexical LE/BE variance. - */ - - typedef int (*memCmpFn_t)(const void*, const void*, size_t); - memCmpFn_t memCmpFn; + { int checkEq = ((*pc == INST_EQ) || (*pc == INST_NEQ) || (*pc == INST_STR_EQ) || (*pc == INST_STR_NEQ)); - - if (TclIsPureByteArray(valuePtr) - && TclIsPureByteArray(value2Ptr)) { - s1 = (char *) Tcl_GetByteArrayFromObj(valuePtr, &s1len); - s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len); - memCmpFn = memcmp; - } else if ((valuePtr->typePtr == &tclStringType) - && (value2Ptr->typePtr == &tclStringType)) { - /* - * Do a unicode-specific comparison if both of the args are of - * String type. If the char length == byte length, we can do a - * memcmp. In benchmark testing this proved the most efficient - * check between the unicode and string comparison operations. - */ - - s1len = Tcl_GetCharLength(valuePtr); - s2len = Tcl_GetCharLength(value2Ptr); - if ((s1len == valuePtr->length) - && (valuePtr->bytes != NULL) - && (s2len == value2Ptr->length) - && (value2Ptr->bytes != NULL)) { - s1 = valuePtr->bytes; - s2 = value2Ptr->bytes; - memCmpFn = memcmp; - } else { - s1 = (char *) Tcl_GetUnicode(valuePtr); - s2 = (char *) Tcl_GetUnicode(value2Ptr); - if ( -#ifdef WORDS_BIGENDIAN - 1 -#else - checkEq -#endif - ) { - memCmpFn = memcmp; - s1len *= sizeof(Tcl_UniChar); - s2len *= sizeof(Tcl_UniChar); - } else { - memCmpFn = (memCmpFn_t) Tcl_UniCharNcmp; - } - } - } else { - /* - * In order to handle the special Tcl \xC0\x80 null encoding - * for utf-8, strcmp can't do a simple memcmp. - */ - - { - int empty; - if ((empty = TclCheckEmptyString(valuePtr)) > 0) { - switch (TclCheckEmptyString(value2Ptr)) { - case -1: - s1 = ""; - s1len = 0; - s2 = TclGetStringFromObj(value2Ptr, &s2len); - break; - case 0: - match = -1; - goto matchdone; - case 1: - match = 0; - goto matchdone; - } - } else if (TclCheckEmptyString(value2Ptr) > 0) { - switch (empty) { - case -1: - s2 = ""; - s2len = 0; - s1 = TclGetStringFromObj(valuePtr, &s1len); - break; - case 0: - match = 1; - goto matchdone; - case 1: - match = 0; - goto matchdone; - } - } else { - s1 = TclGetStringFromObj(valuePtr, &s1len); - s2 = TclGetStringFromObj(value2Ptr, &s2len); - } - } - - - if (checkEq) { - memCmpFn = memcmp; - } else { - memCmpFn = (memCmpFn_t) TclpUtfNcmp2; - } - - } - - if (checkEq && (s1len != s2len)) { - match = 1; - } else { - /* - * The comparison function should compare up to the minimum - * byte length only. - */ - match = memCmpFn(s1, s2, - (size_t) ((s1len < s2len) ? s1len : s2len)); - if (match == 0) { - match = s1len - s2len; - } - } + match = TclStringCmp(valuePtr, value2Ptr, checkEq); } - matchdone: - /* * Make sure only -1,0,1 is returned * TODO: consider peephole opt. diff --git a/generic/tclInt.h b/generic/tclInt.h index 4bdaf58..1bc8696 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3155,6 +3155,11 @@ MODULE_SCOPE void TclSpellFix(Tcl_Interp *interp, Tcl_Obj *bad, Tcl_Obj *fix); MODULE_SCOPE void * TclStackRealloc(Tcl_Interp *interp, void *ptr, int numBytes); + +typedef int (*memCmpFn_t)(const void*, const void*, size_t); +MODULE_SCOPE int TclStringCmp (Tcl_Obj *value1Ptr, Tcl_Obj *value2Ptr, + int checkEq); + MODULE_SCOPE int TclStringMatch(const char *str, int strLen, const char *pattern, int ptnLen, int flags); MODULE_SCOPE int TclStringMatchObj(Tcl_Obj *stringObj, -- cgit v0.12 From db133f014426110646fe9631bab793e01cee6129 Mon Sep 17 00:00:00 2001 From: pooryorick Date: Sun, 6 May 2018 18:13:54 +0000 Subject: Factor options handling out of StringCmpCmd. --- generic/tclCmdMZ.c | 94 +++++++++++++++++++++++++++++++++--------------------- generic/tclInt.h | 3 +- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 9520f86..433b9e8 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2750,45 +2750,19 @@ StringCmpCmd( */ const char *string1, *string2; - int length1, length2, i, match, length, nocase = 0, reqlength = -1; + int length1, length2, match, length, nocase, reqlength, status; typedef int (*strCmpFn_t)(const char *, const char *, unsigned int); strCmpFn_t strCmpFn; - if (objc < 3 || objc > 6) { - str_cmp_args: - Tcl_WrongNumArgs(interp, 1, objv, - "?-nocase? ?-length int? string1 string2"); - return TCL_ERROR; - } - - for (i = 1; i < objc-2; i++) { - string2 = TclGetStringFromObj(objv[i], &length2); - if ((length2 > 1) && !strncmp(string2, "-nocase", (size_t)length2)) { - nocase = 1; - } else if ((length2 > 1) - && !strncmp(string2, "-length", (size_t)length2)) { - if (i+1 >= objc-2) { - goto str_cmp_args; - } - i++; - if (TclGetIntFromObj(interp, objv[i], &reqlength) != TCL_OK) { - return TCL_ERROR; - } - } else { - Tcl_SetObjResult(interp, Tcl_ObjPrintf( - "bad option \"%s\": must be -nocase or -length", - string2)); - Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option", - string2, NULL); - return TCL_ERROR; - } + if ((status = TclStringCmpOpts(interp, objc, objv, &reqlength, + (char **)&string2, &length2, &nocase)) != TCL_OK){ + return status; } /* * From now on, we only access the two objects at the end of the argument * array. */ - objv += objc-2; if ((reqlength == 0) || (objv[0] == objv[1])) { @@ -2802,12 +2776,6 @@ StringCmpCmd( if (!nocase && TclIsPureByteArray(objv[0]) && TclIsPureByteArray(objv[1])) { - /* - * Use binary versions of comparisons since that won't cause undue - * type conversions and it is much faster. Only do this if we're - * case-sensitive (which is all that really makes sense with byte - * arrays anyway, and we have no memcasecmp() for some reason... :^) - */ string1 = (char *) Tcl_GetByteArrayFromObj(objv[0], &length1); string2 = (char *) Tcl_GetByteArrayFromObj(objv[1], &length2); @@ -2885,6 +2853,12 @@ int TclStringCmp ( } else { if (TclIsPureByteArray(value1Ptr) && TclIsPureByteArray(value2Ptr)) { + /* + * Use binary versions of comparisons since that won't cause undue + * type conversions and it is much faster. Only do this if we're + * case-sensitive (which is all that really makes sense with byte + * arrays anyway, and we have no memcasecmp() for some reason... :^) + */ s1 = (char *) Tcl_GetByteArrayFromObj(value1Ptr, &s1len); s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len); memCmpFn = memcmp; @@ -2987,6 +2961,54 @@ int TclStringCmp ( return match; } +int TclStringCmpOpts ( + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[], /* Argument objects. */ + int *reqlength, + char **stringPtr, + int *length, + int *nocase + +) +{ + int i; + const char *string = *stringPtr; + + *reqlength = -1; + *nocase = 0; + if (objc < 3 || objc > 6) { + str_cmp_args: + Tcl_WrongNumArgs(interp, 1, objv, + "?-nocase? ?-length int? string1 string2"); + return TCL_ERROR; + } + + for (i = 1; i < objc-2; i++) { + string = TclGetStringFromObj(objv[i], length); + if ((*length > 1) && !strncmp(string, "-nocase", (size_t)*length)) { + *nocase = 1; + } else if ((*length > 1) + && !strncmp(string, "-length", (size_t)*length)) { + if (i+1 >= objc-2) { + goto str_cmp_args; + } + i++; + if (TclGetIntFromObj(interp, objv[i], reqlength) != TCL_OK) { + return TCL_ERROR; + } + } else { + Tcl_SetObjResult(interp, Tcl_ObjPrintf( + "bad option \"%s\": must be -nocase or -length", + string)); + Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option", + string, NULL); + return TCL_ERROR; + } + } + return TCL_OK; +} + /* *---------------------------------------------------------------------- * diff --git a/generic/tclInt.h b/generic/tclInt.h index 1bc8696..67f53fd 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3159,7 +3159,8 @@ MODULE_SCOPE void * TclStackRealloc(Tcl_Interp *interp, void *ptr, typedef int (*memCmpFn_t)(const void*, const void*, size_t); MODULE_SCOPE int TclStringCmp (Tcl_Obj *value1Ptr, Tcl_Obj *value2Ptr, int checkEq); - +MODULE_SCOPE int TclStringCmpOpts (Tcl_Interp *interp, int objc, Tcl_Obj *const objv[], + int *reqlength, char **stringPtr, int *length2, int *nocase); MODULE_SCOPE int TclStringMatch(const char *str, int strLen, const char *pattern, int ptnLen, int flags); MODULE_SCOPE int TclStringMatchObj(Tcl_Obj *stringObj, -- cgit v0.12 From 5f0cf291f513b75b00ea3d59842b83dc24047c1c Mon Sep 17 00:00:00 2001 From: pooryorick Date: Mon, 7 May 2018 07:43:00 +0000 Subject: Deduplicate code in INST_STR_CMP, StringCmpCmd, and StringEqualCmd. --- generic/tclCmdMZ.c | 369 ++++++++++++++++++--------------------------------- generic/tclExecute.c | 2 +- generic/tclInt.h | 4 +- 3 files changed, 130 insertions(+), 245 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 433b9e8..bc798b7 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2599,10 +2599,8 @@ StringEqualCmd( * the expr string comparison in INST_EQ/INST_NEQ/INST_LT/...). */ - const char *string1, *string2; - int length1, length2, i, match, length, nocase = 0, reqlength = -1; - typedef int (*strCmpFn_t)(const char *, const char *, unsigned int); - strCmpFn_t strCmpFn; + const char *string2; + int length2, i, match, nocase = 0, reqlength = -1; if (objc < 3 || objc > 6) { str_cmp_args: @@ -2641,78 +2639,7 @@ StringEqualCmd( objv += objc-2; - if ((reqlength == 0) || (objv[0] == objv[1])) { - /* - * Always match at 0 chars of if it is the same obj. - */ - - Tcl_SetObjResult(interp, Tcl_NewBooleanObj(1)); - return TCL_OK; - } - - if (!nocase && TclIsPureByteArray(objv[0]) && - TclIsPureByteArray(objv[1])) { - /* - * Use binary versions of comparisons since that won't cause undue - * type conversions and it is much faster. Only do this if we're - * case-sensitive (which is all that really makes sense with byte - * arrays anyway, and we have no memcasecmp() for some reason... :^) - */ - - string1 = (char *) Tcl_GetByteArrayFromObj(objv[0], &length1); - string2 = (char *) Tcl_GetByteArrayFromObj(objv[1], &length2); - strCmpFn = (strCmpFn_t) memcmp; - } else if ((objv[0]->typePtr == &tclStringType) - && (objv[1]->typePtr == &tclStringType)) { - /* - * Do a unicode-specific comparison if both of the args are of String - * type. In benchmark testing this proved the most efficient check - * between the unicode and string comparison operations. - */ - - string1 = (char *) Tcl_GetUnicodeFromObj(objv[0], &length1); - string2 = (char *) Tcl_GetUnicodeFromObj(objv[1], &length2); - strCmpFn = (strCmpFn_t) - (nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp); - } else { - /* - * As a catch-all we will work with UTF-8. We cannot use memcmp() as - * that is unsafe with any string containing NUL (\xC0\x80 in Tcl's - * utf rep). We can use the more efficient TclpUtfNcmp2 if we are - * case-sensitive and no specific length was requested. - */ - - string1 = (char *) TclGetStringFromObj(objv[0], &length1); - string2 = (char *) TclGetStringFromObj(objv[1], &length2); - if ((reqlength < 0) && !nocase) { - strCmpFn = (strCmpFn_t) TclpUtfNcmp2; - } else { - length1 = Tcl_NumUtfChars(string1, length1); - length2 = Tcl_NumUtfChars(string2, length2); - strCmpFn = (strCmpFn_t) (nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp); - } - } - - if ((reqlength < 0) && (length1 != length2)) { - match = 1; /* This will be reversed below. */ - } else { - length = (length1 < length2) ? length1 : length2; - if (reqlength > 0 && reqlength < length) { - length = reqlength; - } else if (reqlength < 0) { - /* - * The requested length is negative, so we ignore it by setting it - * to length + 1 so we correct the match var. - */ - - reqlength = length + 1; - } - - match = strCmpFn(string1, string2, (unsigned) length); - if ((match == 0) && (reqlength > length)) { - match = length1 - length2; - } - } + match = TclStringCmp (objv[0], objv[1], 0, nocase, reqlength); Tcl_SetObjResult(interp, Tcl_NewBooleanObj(match ? 0 : 1)); return TCL_OK; @@ -2749,128 +2676,63 @@ StringCmpCmd( * the expr string comparison in INST_EQ/INST_NEQ/INST_LT/...). */ - const char *string1, *string2; - int length1, length2, match, length, nocase, reqlength, status; - typedef int (*strCmpFn_t)(const char *, const char *, unsigned int); - strCmpFn_t strCmpFn; + int match, nocase, reqlength, status; + + if ((status = TclStringCmpOpts(interp, objc, objv, &nocase, &reqlength)) + != TCL_OK) { - if ((status = TclStringCmpOpts(interp, objc, objv, &reqlength, - (char **)&string2, &length2, &nocase)) != TCL_OK){ return status; } - /* - * From now on, we only access the two objects at the end of the argument - * array. - */ objv += objc-2; - - if ((reqlength == 0) || (objv[0] == objv[1])) { - /* - * Always match at 0 chars of if it is the same obj. - */ - - Tcl_SetObjResult(interp, Tcl_NewBooleanObj(0)); - return TCL_OK; - } - - if (!nocase && TclIsPureByteArray(objv[0]) && - TclIsPureByteArray(objv[1])) { - - string1 = (char *) Tcl_GetByteArrayFromObj(objv[0], &length1); - string2 = (char *) Tcl_GetByteArrayFromObj(objv[1], &length2); - strCmpFn = (strCmpFn_t) memcmp; - } else if ((objv[0]->typePtr == &tclStringType) - && (objv[1]->typePtr == &tclStringType)) { - /* - * Do a unicode-specific comparison if both of the args are of String - * type. In benchmark testing this proved the most efficient check - * between the unicode and string comparison operations. - */ - - string1 = (char *) Tcl_GetUnicodeFromObj(objv[0], &length1); - string2 = (char *) Tcl_GetUnicodeFromObj(objv[1], &length2); - strCmpFn = (strCmpFn_t) - (nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp); - } else { - /* - * As a catch-all we will work with UTF-8. We cannot use memcmp() as - * that is unsafe with any string containing NUL (\xC0\x80 in Tcl's - * utf rep). We can use the more efficient TclpUtfNcmp2 if we are - * case-sensitive and no specific length was requested. - */ - - string1 = (char *) TclGetStringFromObj(objv[0], &length1); - string2 = (char *) TclGetStringFromObj(objv[1], &length2); - if ((reqlength < 0) && !nocase) { - strCmpFn = (strCmpFn_t) TclpUtfNcmp2; - } else { - length1 = Tcl_NumUtfChars(string1, length1); - length2 = Tcl_NumUtfChars(string2, length2); - strCmpFn = (strCmpFn_t) (nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp); - } - } - - length = (length1 < length2) ? length1 : length2; - if (reqlength > 0 && reqlength < length) { - length = reqlength; - } else if (reqlength < 0) { - /* - * The requested length is negative, so we ignore it by setting it to - * length + 1 so we correct the match var. - */ - - reqlength = length + 1; - } - - match = strCmpFn(string1, string2, (unsigned) length); - if ((match == 0) && (reqlength > length)) { - match = length1 - length2; - } - - Tcl_SetObjResult(interp, - Tcl_NewIntObj((match > 0) ? 1 : (match < 0) ? -1 : 0)); + match = TclStringCmp (objv[0], objv[1], 0, nocase, reqlength); + Tcl_SetObjResult(interp, Tcl_NewIntObj(match)); return TCL_OK; } int TclStringCmp ( Tcl_Obj *value1Ptr, Tcl_Obj *value2Ptr, - int checkEq + int checkEq, /* comparison is only for equality */ + int nocase, /* comparison is not case sensitive */ + int reqlength /* requested length */ ) { - char *s1, *s2; - int empty, match, s1len, s2len; - memCmpFn_t memCmpFn; + char *s1, *s2; + int empty, length, match, s1len, s2len; + memCmpFn_t memCmpFn; + if ((reqlength == 0) || (value1Ptr == value2Ptr)) { /* - * When we have equal-length we can check only for (in)equality. - * We can use memcmp in all (n)eq cases because we - * don't need to worry about lexical LE/BE variance. + * Always match at 0 chars of if it is the same obj. */ + match = 0; + } else { - if (value1Ptr == value2Ptr) { - match = 0; - } else { - if (TclIsPureByteArray(value1Ptr) - && TclIsPureByteArray(value2Ptr)) { - /* - * Use binary versions of comparisons since that won't cause undue - * type conversions and it is much faster. Only do this if we're - * case-sensitive (which is all that really makes sense with byte - * arrays anyway, and we have no memcasecmp() for some reason... :^) - */ - s1 = (char *) Tcl_GetByteArrayFromObj(value1Ptr, &s1len); - s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len); - memCmpFn = memcmp; - } else if ((value1Ptr->typePtr == &tclStringType) - && (value2Ptr->typePtr == &tclStringType)) { - /* - * Do a unicode-specific comparison if both of the args are of - * String type. If the char length == byte length, we can do a - * memcmp. In benchmark testing this proved the most efficient - * check between the unicode and string comparison operations. - */ + if (!nocase && TclIsPureByteArray(value1Ptr) + && TclIsPureByteArray(value2Ptr)) { + /* + * Use binary versions of comparisons since that won't cause undue + * type conversions and it is much faster. Only do this if we're + * case-sensitive (which is all that really makes sense with byte + * arrays anyway, and we have no memcasecmp() for some reason... :^) + */ + s1 = (char *) Tcl_GetByteArrayFromObj(value1Ptr, &s1len); + s2 = (char *) Tcl_GetByteArrayFromObj(value2Ptr, &s2len); + memCmpFn = memcmp; + } else if ((value1Ptr->typePtr == &tclStringType) + && (value2Ptr->typePtr == &tclStringType)) { + /* + * Do a unicode-specific comparison if both of the args are of + * String type. If the char length == byte length, we can do a + * memcmp. In benchmark testing this proved the most efficient + * check between the unicode and string comparison operations. + */ + if (nocase) { + s1 = (char *) Tcl_GetUnicodeFromObj(value1Ptr, &s1len); + s2 = (char *) Tcl_GetUnicodeFromObj(value2Ptr, &s2len); + memCmpFn = (memCmpFn_t)Tcl_UniCharNcasecmp; + } else { s1len = Tcl_GetCharLength(value1Ptr); s2len = Tcl_GetCharLength(value2Ptr); if ((s1len == value1Ptr->length) @@ -2897,66 +2759,92 @@ int TclStringCmp ( memCmpFn = (memCmpFn_t) Tcl_UniCharNcmp; } } - } else { - /* - * In order to handle the special Tcl \xC0\x80 null encoding - * for utf-8, strcmp can't do a simple memcmp. - */ - - if ((empty = TclCheckEmptyString(value1Ptr)) > 0) { - switch (TclCheckEmptyString(value2Ptr)) { - case -1: - s1 = ""; - s1len = 0; - s2 = TclGetStringFromObj(value2Ptr, &s2len); - break; - case 0: - match = -1; - goto matchdone; - case 1: - match = 0; - goto matchdone; - } - } else if (TclCheckEmptyString(value2Ptr) > 0) { - switch (empty) { - case -1: - s2 = ""; - s2len = 0; - s1 = TclGetStringFromObj(value1Ptr, &s1len); - break; - case 0: - match = 1; - goto matchdone; - case 1: - match = 0; - goto matchdone; - } - } else { - s1 = TclGetStringFromObj(value1Ptr, &s1len); + } + } else { + if ((empty = TclCheckEmptyString(value1Ptr)) > 0) { + switch (TclCheckEmptyString(value2Ptr)) { + case -1: + s1 = ""; + s1len = 0; s2 = TclGetStringFromObj(value2Ptr, &s2len); + break; + case 0: + match = -1; + goto matchdone; + case 1: + match = 0; + goto matchdone; } - - if (checkEq) { - memCmpFn = memcmp; - } else { - memCmpFn = (memCmpFn_t) TclpUtfNcmp2; + } else if (TclCheckEmptyString(value2Ptr) > 0) { + switch (empty) { + case -1: + s2 = ""; + s2len = 0; + s1 = TclGetStringFromObj(value1Ptr, &s1len); + break; + case 0: + match = 1; + goto matchdone; + case 1: + match = 0; + goto matchdone; } + } else { + s1 = TclGetStringFromObj(value1Ptr, &s1len); + s2 = TclGetStringFromObj(value2Ptr, &s2len); } + if (!nocase && checkEq) { + /* + * When we have equal-length we can check only for (in)equality. + * We can use memcmp in all (n)eq cases because we + * don't need to worry about lexical LE/BE variance. + */ + memCmpFn = memcmp; + } else { - if (checkEq && (s1len != s2len)) { - match = 1; - } else { /* - * The comparison function should compare up to the minimum - * byte length only. + * As a catch-all we will work with UTF-8. We cannot use memcmp() as + * that is unsafe with any string containing NUL (\xC0\x80 in Tcl's + * utf rep). We can use the more efficient TclpUtfNcmp2 if we are + * case-sensitive and no specific length was requested. */ - match = memCmpFn(s1, s2, - (size_t) ((s1len < s2len) ? s1len : s2len)); - if (match == 0) { - match = s1len - s2len; + + if ((reqlength < 0) && !nocase) { + memCmpFn = (memCmpFn_t) TclpUtfNcmp2; + } else { + s1len = Tcl_NumUtfChars(s1, s1len); + s2len = Tcl_NumUtfChars(s2, s2len); + memCmpFn = (memCmpFn_t) (nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp); } } } + + length = (s1len < s2len) ? s1len : s2len; + if (reqlength > 0 && reqlength < length) { + length = reqlength; + } else if (reqlength < 0) { + /* + * The requested length is negative, so we ignore it by setting it to + * length + 1 so we correct the match var. + */ + + reqlength = length + 1; + } + + if (checkEq && (s1len != s2len)) { + match = 1; /* This will be reversed below. */ + } else { + /* + * The comparison function should compare up to the minimum + * byte length only. + */ + match = memCmpFn(s1, s2, (size_t) length); + } + if ((match == 0) && (reqlength > length)) { + match = s1len - s2len; + } + match = (match > 0) ? 1 : (match < 0) ? -1 : 0; + } matchdone: return match; } @@ -2965,15 +2853,12 @@ int TclStringCmpOpts ( Tcl_Interp *interp, /* Current interpreter. */ int objc, /* Number of arguments. */ Tcl_Obj *const objv[], /* Argument objects. */ - int *reqlength, - char **stringPtr, - int *length, - int *nocase - + int *nocase, + int *reqlength ) { - int i; - const char *string = *stringPtr; + int i, length; + const char *string; *reqlength = -1; *nocase = 0; @@ -2985,11 +2870,11 @@ int TclStringCmpOpts ( } for (i = 1; i < objc-2; i++) { - string = TclGetStringFromObj(objv[i], length); - if ((*length > 1) && !strncmp(string, "-nocase", (size_t)*length)) { + string = TclGetStringFromObj(objv[i], &length); + if ((length > 1) && !strncmp(string, "-nocase", (size_t)length)) { *nocase = 1; - } else if ((*length > 1) - && !strncmp(string, "-length", (size_t)*length)) { + } else if ((length > 1) + && !strncmp(string, "-length", (size_t)length)) { if (i+1 >= objc-2) { goto str_cmp_args; } diff --git a/generic/tclExecute.c b/generic/tclExecute.c index a76e686..4c14514 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5387,7 +5387,7 @@ TEBCresume( { int checkEq = ((*pc == INST_EQ) || (*pc == INST_NEQ) || (*pc == INST_STR_EQ) || (*pc == INST_STR_NEQ)); - match = TclStringCmp(valuePtr, value2Ptr, checkEq); + match = TclStringCmp(valuePtr, value2Ptr, checkEq, 0, -1); } /* diff --git a/generic/tclInt.h b/generic/tclInt.h index 67f53fd..0a3285f 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3158,9 +3158,9 @@ MODULE_SCOPE void * TclStackRealloc(Tcl_Interp *interp, void *ptr, typedef int (*memCmpFn_t)(const void*, const void*, size_t); MODULE_SCOPE int TclStringCmp (Tcl_Obj *value1Ptr, Tcl_Obj *value2Ptr, - int checkEq); + int checkEq, int nocase, int reqlength); MODULE_SCOPE int TclStringCmpOpts (Tcl_Interp *interp, int objc, Tcl_Obj *const objv[], - int *reqlength, char **stringPtr, int *length2, int *nocase); + int *nocase, int *reqlength); MODULE_SCOPE int TclStringMatch(const char *str, int strLen, const char *pattern, int ptnLen, int flags); MODULE_SCOPE int TclStringMatchObj(Tcl_Obj *stringObj, -- cgit v0.12 From 78ee8c697e423a5b4718fabb53eef1c5f6a2a2b1 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 8 May 2018 09:49:16 +0000 Subject: fixes [92564326a98b5510]: wrong x64-aligned handle from readdir64 by opendir/rewinddir/closedir, if HAVE_STRUCT_DIRENT64 used. --- unix/tclUnixFCmd.c | 14 +++++++------- unix/tclUnixFile.c | 4 ++-- unix/tclUnixPort.h | 6 ++++++ 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/unix/tclUnixFCmd.c b/unix/tclUnixFCmd.c index b5450b1..4377b77 100644 --- a/unix/tclUnixFCmd.c +++ b/unix/tclUnixFCmd.c @@ -333,7 +333,7 @@ DoRenameFile( if ((Realpath((char *) src, srcPath) != NULL) /* INTL: Native. */ && (Realpath((char *) dst, dstPath) != NULL) /* INTL: Native */ && (strncmp(srcPath, dstPath, strlen(srcPath)) != 0)) { - dirPtr = opendir(dst); /* INTL: Native. */ + dirPtr = TclOSopendir(dst); /* INTL: Native. */ if (dirPtr != NULL) { while (1) { dirEntPtr = TclOSreaddir(dirPtr); /* INTL: Native. */ @@ -343,11 +343,11 @@ DoRenameFile( if ((strcmp(dirEntPtr->d_name, ".") != 0) && (strcmp(dirEntPtr->d_name, "..") != 0)) { errno = EEXIST; - closedir(dirPtr); + TclOSclosedir(dirPtr); return TCL_ERROR; } } - closedir(dirPtr); + TclOSclosedir(dirPtr); } } errno = EINVAL; @@ -945,7 +945,7 @@ TraverseUnixTree( errorPtr); } #ifndef HAVE_FTS - dirPtr = opendir(source); /* INTL: Native. */ + dirPtr = TclOSopendir(source); /* INTL: Native. */ if (dirPtr == NULL) { /* * Can't read directory @@ -957,7 +957,7 @@ TraverseUnixTree( result = (*traverseProc)(sourcePtr, targetPtr, &statBuf, DOTREE_PRED, errorPtr); if (result != TCL_OK) { - closedir(dirPtr); + TclOSclosedir(dirPtr); return result; } @@ -1007,11 +1007,11 @@ TraverseUnixTree( * NULL-return that may a symptom of a buggy readdir. */ - rewinddir(dirPtr); + TclOSrewinddir(dirPtr); numProcessed = 0; } } - closedir(dirPtr); + TclOSclosedir(dirPtr); /* * Strip off the trailing slash we added diff --git a/unix/tclUnixFile.c b/unix/tclUnixFile.c index 0a2099c..1dc73ae 100644 --- a/unix/tclUnixFile.c +++ b/unix/tclUnixFile.c @@ -310,7 +310,7 @@ TclpMatchInDirectory( return TCL_OK; } - d = opendir(native); /* INTL: Native. */ + d = TclOSopendir(native); /* INTL: Native. */ if (d == NULL) { Tcl_DStringFree(&ds); if (interp != NULL) { @@ -383,7 +383,7 @@ TclpMatchInDirectory( } } - closedir(d); + TclOSclosedir(d); Tcl_DStringFree(&ds); Tcl_DStringFree(&dsOrig); Tcl_DecrRefCount(fileNamePtr); diff --git a/unix/tclUnixPort.h b/unix/tclUnixPort.h index 965014e..cc31bf9 100644 --- a/unix/tclUnixPort.h +++ b/unix/tclUnixPort.h @@ -60,9 +60,15 @@ #ifdef HAVE_STRUCT_DIRENT64 typedef struct dirent64 Tcl_DirEntry; # define TclOSreaddir readdir64 +# define TclOSopendir opendir64 +# define TclOSrewinddir rewinddir64 +# define TclOSclosedir closedir64 #else typedef struct dirent Tcl_DirEntry; # define TclOSreaddir readdir +# define TclOSopendir opendir +# define TclOSrewinddir rewinddir +# define TclOSclosedir closedir #endif #ifdef HAVE_TYPE_OFF64_T -- cgit v0.12 From 47e9dbdc37939a6e874afd15d6586864649f23c7 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 8 May 2018 10:16:26 +0000 Subject: prevents UB/segfault by unexpected return-code (not -1/0/1) and avoid warnings like: tclCmdMZ.c:2815:15: warning: `s1` may be used uninitialized in this function [-Wmaybe-uninitialized] s1len = Tcl_NumUtfChars(s1, s1len); ^~~~~~~~~~~~~~~~~~~~~~~~~~ --- generic/tclCmdMZ.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index bc798b7..8530719 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2772,6 +2772,7 @@ int TclStringCmp ( match = -1; goto matchdone; case 1: + default: /* avoid warn: `s2` may be used uninitialized */ match = 0; goto matchdone; } @@ -2786,6 +2787,7 @@ int TclStringCmp ( match = 1; goto matchdone; case 1: + default: /* avoid warn: `s1` may be used uninitialized */ match = 0; goto matchdone; } -- cgit v0.12