From 5f1098561b5067a8f429be371166ed3b1e6189e4 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 4 Nov 2016 21:29:30 +0000 Subject: First draft refactoring the [string first] functionality. --- generic/tclExecute.c | 4 +++ generic/tclInt.h | 2 ++ generic/tclStringObj.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 83b83f1..5ea199d 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5720,6 +5720,9 @@ TEBCresume( NEXT_INST_V(1, 3, 1); case INST_STR_FIND: +#if 1 + match = TclStringFind(OBJ_UNDER_TOS, OBJ_AT_TOS, 0); +#else ustring1 = Tcl_GetUnicodeFromObj(OBJ_AT_TOS, &length); /* Haystack */ ustring2 = Tcl_GetUnicodeFromObj(OBJ_UNDER_TOS, &length2);/* Needle */ @@ -5734,6 +5737,7 @@ TEBCresume( } } } +#endif TRACE(("%.20s %.20s => %d\n", O2S(OBJ_UNDER_TOS), O2S(OBJ_AT_TOS), match)); diff --git a/generic/tclInt.h b/generic/tclInt.h index 8a647f0..26592f9 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3138,6 +3138,8 @@ MODULE_SCOPE void * TclStackRealloc(Tcl_Interp *interp, void *ptr, MODULE_SCOPE int TclStringCatObjv(Tcl_Interp *interp, int inPlace, int objc, Tcl_Obj *const objv[], Tcl_Obj **objPtrPtr); +MODULE_SCOPE int TclStringFind(Tcl_Obj *needle, Tcl_Obj *haystack, + unsigned int start); MODULE_SCOPE int TclStringMatch(const char *str, int strLen, const char *pattern, int ptnLen, int flags); MODULE_SCOPE int TclStringMatchObj(Tcl_Obj *stringObj, diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index b486106..6e1529c 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -2841,6 +2841,90 @@ TclStringCatObjv( /* *--------------------------------------------------------------------------- * + * TclStringFind -- + * + * Implements the [string first] operation. + * + * Results: + * If needle is found as a substring of haystack, the index of the + * first instance of such a find is returned. If needle is not present + * as a substring of haystack, -1 is returned. + * + * Side effects: + * needle and haystack may have their Tcl_ObjType changed. + * + *--------------------------------------------------------------------------- + */ + +int +TclStringFind( + Tcl_Obj *needle, + Tcl_Obj *haystack, + unsigned int start) +{ + int ln, lh; + + if (TclIsPureByteArray(needle) && TclIsPureByteArray(haystack)) { + unsigned char *end, *try, *bh; + unsigned char *bn = Tcl_GetByteArrayFromObj(needle, &ln); + + if (ln == 0) { + /* + * We don't find empty substrings. Bizarre! + * + * TODO: When we one day make this a true substring + * finder, change this to "return 0" + */ + return -1; + } + + bh = Tcl_GetByteArrayFromObj(haystack, &lh); + end = bh + lh; + + try = bh + start; + while (try + ln <= end) { + try = memchr(try, bn[0], end - try); + + if (try == NULL) { + return -1; + } + if (0 == memcmp(try+1, bn+1, ln-1)) { + return (try - bh); + } + try++; + } + return -1; + } + + /* TODO: Detect and optimize case with single byte chars only */ + + { + Tcl_UniChar *try, *end, *uh; + Tcl_UniChar *un = Tcl_GetUnicodeFromObj(needle, &ln); + + if (ln == 0) { + /* See above */ + return -1; + } + + uh = Tcl_GetUnicodeFromObj(haystack, &lh); + end = uh + lh; + + try = uh + start; + while (try + ln <= end) { + if ((*try == *un) + && (0 == memcmp(try+1, un+1, (ln-1)*sizeof(Tcl_UniChar)))) { + return (try - uh); + } + try++; + } + return -1; + } +} + +/* + *--------------------------------------------------------------------------- + * * TclStringObjReverse -- * * Implements the [string reverse] operation. -- cgit v0.12 From a4cbbc51f40fedd5a20c14571c23b10d5e311c8a Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 7 Nov 2016 19:28:25 +0000 Subject: Implement direct eval [string first] with the refactored engine. --- generic/tclCmdMZ.c | 78 +++++++----------------------------------------------- 1 file changed, 9 insertions(+), 69 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 10c2ef3..be77b1f 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1176,8 +1176,7 @@ StringFirstCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - Tcl_UniChar *needleStr, *haystackStr; - int match, start, needleLen, haystackLen; + int start = 0; if (objc < 3 || objc > 4) { Tcl_WrongNumArgs(interp, 1, objv, @@ -1185,82 +1184,23 @@ StringFirstCmd( return TCL_ERROR; } - /* - * We are searching haystackStr for the sequence needleStr. - */ - - match = -1; - start = 0; - haystackLen = -1; - - needleStr = Tcl_GetUnicodeFromObj(objv[1], &needleLen); - haystackStr = Tcl_GetUnicodeFromObj(objv[2], &haystackLen); - if (objc == 4) { - /* - * If a startIndex is specified, we will need to fast forward to that - * point in the string before we think about a match. - */ + int size = Tcl_GetCharLength(objv[2]); - if (TclGetIntForIndexM(interp, objv[3], haystackLen-1, - &start) != TCL_OK){ + if (TCL_OK != TclGetIntForIndexM(interp, objv[3], size - 1, &start)) { return TCL_ERROR; } - /* - * Reread to prevent shimmering problems. - */ - - needleStr = Tcl_GetUnicodeFromObj(objv[1], &needleLen); - haystackStr = Tcl_GetUnicodeFromObj(objv[2], &haystackLen); - - if (start >= haystackLen) { - goto str_first_done; - } else if (start > 0) { - haystackStr += start; - haystackLen -= start; - } else if (start < 0) { - /* - * Invalid start index mapped to string start; Bug #423581 - */ - + if (start < 0) { start = 0; } - } - - /* - * If the length of the needle is more than the length of the haystack, it - * cannot be contained in there so we can avoid searching. [Bug 2960021] - */ - - if (needleLen > 0 && needleLen <= haystackLen) { - register Tcl_UniChar *p, *end; - - end = haystackStr + haystackLen - needleLen + 1; - for (p = haystackStr; p < end; p++) { - /* - * Scan forward to find the first character. - */ - - if ((*p == *needleStr) && (memcmp(needleStr, p, - sizeof(Tcl_UniChar) * (size_t)needleLen) == 0)) { - match = p - haystackStr; - break; - } + if (start >= size) { + Tcl_SetObjResult(interp, Tcl_NewIntObj(-1)); + return TCL_OK; } } - - /* - * Compute the character index of the matching string by counting the - * number of characters before the match. - */ - - if ((match != -1) && (objc == 4)) { - match += start; - } - - str_first_done: - Tcl_SetObjResult(interp, Tcl_NewIntObj(match)); + Tcl_SetObjResult(interp, Tcl_NewIntObj(TclStringFind(objv[1], + objv[2], start))); return TCL_OK; } -- cgit v0.12 From b2c104021f192aefe7b82d283176e941b7f0a01e Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 7 Nov 2016 19:41:15 +0000 Subject: Consolidate the "find empty string" cases. --- generic/tclStringObj.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 6e1529c..0c0121e 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -2862,22 +2862,22 @@ TclStringFind( Tcl_Obj *haystack, unsigned int start) { - int ln, lh; + int lh, ln = Tcl_GetCharLength(needle); + + if (ln == 0) { + /* + * We don't find empty substrings. Bizarre! + * + * TODO: When we one day make this a true substring + * finder, change this to "return 0" + */ + return -1; + } if (TclIsPureByteArray(needle) && TclIsPureByteArray(haystack)) { unsigned char *end, *try, *bh; unsigned char *bn = Tcl_GetByteArrayFromObj(needle, &ln); - if (ln == 0) { - /* - * We don't find empty substrings. Bizarre! - * - * TODO: When we one day make this a true substring - * finder, change this to "return 0" - */ - return -1; - } - bh = Tcl_GetByteArrayFromObj(haystack, &lh); end = bh + lh; @@ -2902,11 +2902,6 @@ TclStringFind( Tcl_UniChar *try, *end, *uh; Tcl_UniChar *un = Tcl_GetUnicodeFromObj(needle, &ln); - if (ln == 0) { - /* See above */ - return -1; - } - uh = Tcl_GetUnicodeFromObj(haystack, &lh); end = uh + lh; -- cgit v0.12 From 96b75029d111bab1f01acea88795c774b96b4f6e Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 7 Nov 2016 20:04:22 +0000 Subject: Optimize case of all single-byte chars. --- generic/tclStringObj.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 0c0121e..edba881 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -2896,9 +2896,27 @@ TclStringFind( return -1; } - /* TODO: Detect and optimize case with single byte chars only */ + lh = Tcl_GetCharLength(haystack); + if (haystack->bytes && (lh == haystack->length)) { + /* haystack is all single-byte chars */ - { + if (needle->bytes && (ln == needle->length)) { + /* needle is also all single-byte chars */ + char *found = strstr(haystack->bytes + start, needle->bytes); + + if (found) { + return (found - haystack->bytes); + } else { + return -1; + } + } else { + /* + * Cannot find substring with a multi-byte char inside + * a string with no multi-byte chars. + */ + return -1; + } + } else { Tcl_UniChar *try, *end, *uh; Tcl_UniChar *un = Tcl_GetUnicodeFromObj(needle, &ln); -- cgit v0.12