From 293d30f0a2a4756db591e83e93eae24681e0ec7a Mon Sep 17 00:00:00 2001 From: dkf Date: Sat, 11 Feb 2017 21:36:36 +0000 Subject: Proposed implementation of [regsub -command]. --- generic/tclCmdMZ.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++---- tests/regexp.test | 22 +++++++++++++- tests/regexpComp.test | 2 +- 3 files changed, 98 insertions(+), 8 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index ba1fc41..ffae8b2 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -487,26 +487,28 @@ Tcl_RegsubObjCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int idx, result, cflags, all, wlen, wsublen, numMatches, offset; - int start, end, subStart, subEnd, match; + int start, end, subStart, subEnd, match, command; Tcl_RegExp regExpr; Tcl_RegExpInfo info; Tcl_Obj *resultPtr, *subPtr, *objPtr, *startIndex = NULL; + Tcl_Obj **args = NULL, *parts[2], *cmdObj; Tcl_UniChar ch, *wsrc, *wfirstChar, *wstring, *wsubspec, *wend; static const char *const options[] = { - "-all", "-nocase", "-expanded", - "-line", "-linestop", "-lineanchor", "-start", + "-all", "-command", "-expanded", "-line", + "-linestop", "-lineanchor", "-nocase", "-start", "--", NULL }; enum options { - REGSUB_ALL, REGSUB_NOCASE, REGSUB_EXPANDED, - REGSUB_LINE, REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_START, + REGSUB_ALL, REGSUB_COMMAND, REGSUB_EXPANDED, REGSUB_LINE, + REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_NOCASE, REGSUB_START, REGSUB_LAST }; cflags = TCL_REG_ADVANCED; all = 0; offset = 0; + command = 0; resultPtr = NULL; for (idx = 1; idx < objc; idx++) { @@ -528,6 +530,9 @@ Tcl_RegsubObjCmd( case REGSUB_NOCASE: cflags |= TCL_REG_NOCASE; break; + case REGSUB_COMMAND: + command = 1; + break; case REGSUB_EXPANDED: cflags |= TCL_REG_EXPANDED; break; @@ -585,7 +590,7 @@ Tcl_RegsubObjCmd( } } - if (all && (offset == 0) + if (all && (offset == 0) && (command == 0) && (strpbrk(TclGetString(objv[2]), "&\\") == NULL) && (strpbrk(TclGetString(objv[0]), "*+?{}()[].\\|^$") == NULL)) { /* @@ -737,6 +742,68 @@ Tcl_RegsubObjCmd( Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, start); /* + * In -command mode, the substitutions are added as quoted arguments + * to the subSpec to form a command, that is then executed and the + * result used as the string to substitute in. + */ + + if (command) { + if (args == NULL) { + args = ckalloc(sizeof(Tcl_Obj*) * (info.nsubs + 1)); + } + + for (idx = 0 ; idx <= info.nsubs ; idx++) { + subStart = info.matches[idx].start; + subEnd = info.matches[idx].end; + if ((subStart >= 0) && (subEnd >= 0)) { + args[idx] = Tcl_NewUnicodeObj( + wstring + offset + subStart, subEnd - subStart); + } else { + args[idx] = Tcl_NewObj(); + } + } + parts[0] = subPtr; + parts[1] = Tcl_NewListObj(info.nsubs+1, args); + cmdObj = Tcl_ConcatObj(2, parts); + Tcl_IncrRefCount(cmdObj); + Tcl_DecrRefCount(parts[1]); + + result = Tcl_EvalObjEx(interp, cmdObj, TCL_EVAL_DIRECT); + Tcl_DecrRefCount(cmdObj); + if (result != TCL_OK) { + if (result == TCL_ERROR) { + Tcl_AppendObjToErrorInfo(interp, Tcl_ObjPrintf( + "\n (%s substitution computation script)", + options[REGSUB_COMMAND])); + } + goto done; + } + + Tcl_AppendObjToObj(resultPtr, Tcl_GetObjResult(interp)); + Tcl_ResetResult(interp); + + offset += end; + if (end == 0 || start == end) { + /* + * Always consume at least one character of the input string + * in order to prevent infinite loops, even when we + * technically matched the empty string; we must not match + * again at the same spot. + */ + + if (offset < wlen) { + Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1); + } + offset++; + } + if (all) { + continue; + } else { + break; + } + } + + /* * Append the subSpec argument to the variable, making appropriate * substitutions. This code is a bit hairy because of the backslash * conventions and because the code saves up ranges of characters in @@ -864,6 +931,9 @@ Tcl_RegsubObjCmd( if (subPtr && (objv[2] == objv[0])) { Tcl_DecrRefCount(subPtr); } + if (args) { + ckfree(args); + } if (resultPtr) { Tcl_DecrRefCount(resultPtr); } diff --git a/tests/regexp.test b/tests/regexp.test index 4ffdbdb..6c77b41 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -453,7 +453,7 @@ test regexp-11.4 {regsub errors} { } {1 {wrong # args: should be "regsub ?-option ...? exp string subSpec ?varName?"}} test regexp-11.5 {regsub errors} { list [catch {regsub -gorp a b c} msg] $msg -} {1 {bad option "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}} +} {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}} test regexp-11.6 {regsub errors} { list [catch {regsub -nocase a( b c d} msg] $msg } {1 {couldn't compile regular expression pattern: parentheses () not balanced}} @@ -1123,6 +1123,26 @@ test regexp-26.12 {regexp with -line option} { test regexp-26.13 {regexp without -line option} { regexp -all -inline -- {a*} "b\n" } {{} {}} + +test regexp-27.1 {regsub -command} { + regsub -command {.x.} {abcxdef} {string length} +} ab3ef +test regexp-27.2 {regsub -command} { + regsub -command {.x.} {abcxdefxghi} {string length} +} ab3efxghi +test regexp-27.3 {regsub -command} { + set x 0 + regsub -all -command {(?=.)} abcde "incr x;#" +} 1a2b3c4d5e +test regexp-27.4 {regsub -command} -body { + regsub -command {.x.} {abcxdef} error +} -returnCodes error -result cxd +test regexp-27.5 {regsub -command} { + regsub -command {(.)(.)} {abcdef} {list ,} +} {, ab a bcdef} +test regexp-27.6 {regsub -command} { + regsub -command -all {(.)(.)} {abcdef} {list ,} +} {, ab a b, cd c d, ef e f} # cleanup ::tcltest::cleanupTests diff --git a/tests/regexpComp.test b/tests/regexpComp.test index b8e64b6..fbf8012 100644 --- a/tests/regexpComp.test +++ b/tests/regexpComp.test @@ -587,7 +587,7 @@ test regexpComp-11.5 {regsub errors} { evalInProc { list [catch {regsub -gorp a b c} msg] $msg } -} {1 {bad option "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}} +} {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}} test regexpComp-11.6 {regsub errors} { evalInProc { list [catch {regsub -nocase a( b c d} msg] $msg -- cgit v0.12 From 203058f02d4fbfde79ae5a95f9648f61681a36fa Mon Sep 17 00:00:00 2001 From: dkf Date: Fri, 17 Feb 2017 06:08:18 +0000 Subject: Switch to using command prefixes properly. This is quite a bit faster. --- generic/tclCmdMZ.c | 63 +++++++++++++++++++++++++++++++++++++++--------------- tests/regexp.test | 2 +- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index ffae8b2..d6d0152 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -487,11 +487,11 @@ Tcl_RegsubObjCmd( Tcl_Obj *const objv[]) /* Argument objects. */ { int idx, result, cflags, all, wlen, wsublen, numMatches, offset; - int start, end, subStart, subEnd, match, command; + int start, end, subStart, subEnd, match, command, numParts, numArgs; Tcl_RegExp regExpr; Tcl_RegExpInfo info; Tcl_Obj *resultPtr, *subPtr, *objPtr, *startIndex = NULL; - Tcl_Obj **args = NULL, *parts[2], *cmdObj; + Tcl_Obj **args = NULL, **parts; Tcl_UniChar ch, *wsrc, *wfirstChar, *wstring, *wsubspec, *wend; static const char *const options[] = { @@ -666,6 +666,20 @@ Tcl_RegsubObjCmd( return TCL_ERROR; } + if (command) { + /* + * In command-prefix mode, we require that the third non-option + * argument be a list, so we enforce that here. Afterwards, we fetch + * the RE compilation again in case objv[0] and objv[2] are the same + * object. (If they aren't, that's cheap to do.) + */ + + if (Tcl_ListObjLength(interp, objv[2], &numParts) != TCL_OK) { + return TCL_ERROR; + } + regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags); + } + /* * Make sure to avoid problems where the objects are shared. This can * cause RegExpObj <> UnicodeObj shimmering that causes data corruption. @@ -683,7 +697,9 @@ Tcl_RegsubObjCmd( } else { subPtr = objv[2]; } - wsubspec = Tcl_GetUnicodeFromObj(subPtr, &wsublen); + if (!command) { + wsubspec = Tcl_GetUnicodeFromObj(subPtr, &wsublen); + } result = TCL_OK; @@ -742,34 +758,47 @@ Tcl_RegsubObjCmd( Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, start); /* - * In -command mode, the substitutions are added as quoted arguments - * to the subSpec to form a command, that is then executed and the - * result used as the string to substitute in. + * In command-prefix mode, the substitutions are added as quoted + * arguments to the subSpec to form a command, that is then executed + * and the result used as the string to substitute in. Actually, + * everything is passed through Tcl_EvalObjv, as that's much faster. */ if (command) { if (args == NULL) { - args = ckalloc(sizeof(Tcl_Obj*) * (info.nsubs + 1)); + Tcl_ListObjGetElements(interp, subPtr, &numParts, &parts); + numArgs = numParts + info.nsubs + 1; + args = ckalloc(sizeof(Tcl_Obj*) * numArgs); + memcpy(args, parts, sizeof(Tcl_Obj*) * numParts); } for (idx = 0 ; idx <= info.nsubs ; idx++) { subStart = info.matches[idx].start; subEnd = info.matches[idx].end; if ((subStart >= 0) && (subEnd >= 0)) { - args[idx] = Tcl_NewUnicodeObj( + args[idx + numParts] = Tcl_NewUnicodeObj( wstring + offset + subStart, subEnd - subStart); } else { - args[idx] = Tcl_NewObj(); + args[idx + numParts] = Tcl_NewObj(); } + Tcl_IncrRefCount(args[idx + numParts]); + } + + /* + * At this point, we're locally holding the references to the + * argument words we added for this time round the loop, and the + * subPtr is holding the references to the words that the user + * supplied directly. None are zero-refcount, which is important + * because Tcl_EvalObjv is "hairy monster" in terms of refcount + * handling, being able to optionally add references to any of its + * argument words. We'll drop the local refs immediately + * afterwarsds; subPtr is handled in the main exit stanza. + */ + + result = Tcl_EvalObjv(interp, numArgs, args, 0); + for (idx = 0 ; idx <= info.nsubs ; idx++) { + TclDecrRefCount(args[idx + numParts]); } - parts[0] = subPtr; - parts[1] = Tcl_NewListObj(info.nsubs+1, args); - cmdObj = Tcl_ConcatObj(2, parts); - Tcl_IncrRefCount(cmdObj); - Tcl_DecrRefCount(parts[1]); - - result = Tcl_EvalObjEx(interp, cmdObj, TCL_EVAL_DIRECT); - Tcl_DecrRefCount(cmdObj); if (result != TCL_OK) { if (result == TCL_ERROR) { Tcl_AppendObjToErrorInfo(interp, Tcl_ObjPrintf( diff --git a/tests/regexp.test b/tests/regexp.test index 6c77b41..6c3d774 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -1132,7 +1132,7 @@ test regexp-27.2 {regsub -command} { } ab3efxghi test regexp-27.3 {regsub -command} { set x 0 - regsub -all -command {(?=.)} abcde "incr x;#" + regsub -all -command {(?=.)} abcde {apply {args {incr ::x}}} } 1a2b3c4d5e test regexp-27.4 {regsub -command} -body { regsub -command {.x.} {abcxdef} error -- cgit v0.12 From 4d7b9162e578238f275688adcef5d56242b8ae7e Mon Sep 17 00:00:00 2001 From: dkf Date: Fri, 17 Feb 2017 09:20:49 +0000 Subject: Stop problems with representation smashes. --- generic/tclCmdMZ.c | 7 +++++++ tests/regexp.test | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d6d0152..110de4c 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -811,6 +811,13 @@ Tcl_RegsubObjCmd( Tcl_AppendObjToObj(resultPtr, Tcl_GetObjResult(interp)); Tcl_ResetResult(interp); + /* + * Refetch the unicode, in case the representation was smashed by + * the user code. + */ + + wstring = Tcl_GetUnicodeFromObj(objPtr, &wlen); + offset += end; if (end == 0 || start == end) { /* diff --git a/tests/regexp.test b/tests/regexp.test index 6c3d774..ad770fa 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -1143,6 +1143,12 @@ test regexp-27.5 {regsub -command} { test regexp-27.6 {regsub -command} { regsub -command -all {(.)(.)} {abcdef} {list ,} } {, ab a b, cd c d, ef e f} +test regexp-27.7 {regsub -command representation smash} { + set ::s {123=456 789} + regsub -command -all {\d+} $::s {apply {n { + expr {[llength $::s] + $n} + }}} +} {125=458 791} # cleanup ::tcltest::cleanupTests -- cgit v0.12 From 768fa857c25a31f96cfdfeeb34e8628f68ddb7ba Mon Sep 17 00:00:00 2001 From: dkf Date: Sat, 18 Feb 2017 14:26:34 +0000 Subject: Add more representation smashing testing and a memleak test. --- tests/regexp.test | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/regexp.test b/tests/regexp.test index ad770fa..f1be6eb 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -19,6 +19,20 @@ if {"::tcltest" ni [namespace children]} { unset -nocomplain foo testConstraint exec [llength [info commands exec]] + +# Used for constraining memory leak tests +testConstraint memory [llength [info commands memory]] +if {[testConstraint memory]} { + proc memtest script { + set end [lindex [split [memory info] \n] 3 3] + for {set i 0} {$i < 5} {incr i} { + uplevel 1 $script + set tmp $end + set end [lindex [split [memory info] \n] 3 3] + } + expr {$end - $tmp} + } +} test regexp-1.1 {basic regexp operation} { regexp ab*c abbbc @@ -1149,6 +1163,21 @@ test regexp-27.7 {regsub -command representation smash} { expr {[llength $::s] + $n} }}} } {125=458 791} +test regexp-27.8 {regsub -command representation smash} { + set ::t {apply {n { + expr {[llength [lindex $::t 1 1 1]] + $n} + }}} + regsub -command -all {\d+} "123=456 789" $::t +} {131=464 797} +test regexp-27.9 {regsub -command memory leak testing} memory { + set ::s "123=456 789" + set ::t {apply {n { + expr {[llength [lindex $::t 1 1 1]] + [llength $::s] + $n} + }}} + memtest { + regsub -command -all {\d+} $::s $::t + } +} 0 # cleanup ::tcltest::cleanupTests -- cgit v0.12 From fcf8e64d6b12e1682af90b9e25b364e22d04c7bf Mon Sep 17 00:00:00 2001 From: dkf Date: Sat, 18 Feb 2017 16:24:09 +0000 Subject: Testing for some error cases. --- generic/tclCmdMZ.c | 8 ++++++++ tests/regexp.test | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 110de4c..d5a6b01 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -677,6 +677,14 @@ Tcl_RegsubObjCmd( if (Tcl_ListObjLength(interp, objv[2], &numParts) != TCL_OK) { return TCL_ERROR; } + if (numParts < 1) { + Tcl_SetObjResult(interp, Tcl_NewStringObj( + "command prefix must be a list of at least one element", + -1)); + Tcl_SetErrorCode(interp, "TCL", "OPERATION", "REGSUB", + "CMDEMPTY", NULL); + return TCL_ERROR; + } regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags); } diff --git a/tests/regexp.test b/tests/regexp.test index f1be6eb..2686526 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -1178,6 +1178,12 @@ test regexp-27.9 {regsub -command memory leak testing} memory { regsub -command -all {\d+} $::s $::t } } 0 +test regexp-27.10 {regsub -command error cases} -returnCodes error -body { + regsub -command . abc "def \{ghi" +} -result {unmatched open brace in list} +test regexp-27.11 {regsub -command error cases} -returnCodes error -body { + regsub -command . abc {} +} -result {command prefix must be a list of at least one element} # cleanup ::tcltest::cleanupTests -- cgit v0.12 From 6aa0cc7188b6df1dac97b03bc0b9240aa780799b Mon Sep 17 00:00:00 2001 From: dkf Date: Sat, 18 Feb 2017 18:38:52 +0000 Subject: Add documentation of [regsub -command]. --- doc/regsub.n | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ generic/tclCmdMZ.c | 4 +-- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/doc/regsub.n b/doc/regsub.n index a5b79de..23bbff9 100644 --- a/doc/regsub.n +++ b/doc/regsub.n @@ -68,6 +68,31 @@ and sequences are handled for each substitution using the information from the corresponding match. .TP +\fB\-command\fR +.VS 8.7 +Changes the handling of the substitution string so that it no longer treats +.QW & +and +.QW \e +as special characters, but instead uses them as a non-empty list of words. +Each time a substitution is processed, another complete Tcl word is appended +to that list for each substitution value (the first such argument represents +the overall matched substring, the subsequent arguments will be one per +capturing sub-RE, much as are returned from \fBregexp\fR \fB\-inline\fR) and +the overall list is then evaluated as a Tcl command call. If the command +finishes successfully, the result of command call is substituted into the +resulting string. +.RS +.PP +If \fB\-all\fR is not also given, the command callback will be invoked at most +once (exactly when the regular expression matches). If \fB\-all\fR is given, +the command callback will be invoked for each matched location, in sequence. +The exact location indices that matched are not made available to the script. +.PP +See \fBEXAMPLES\fR below for illustrative cases. +.RE +.VE 8.7 +.TP \fB\-expanded\fR . Enables use of the expanded regular expression syntax where @@ -183,6 +208,53 @@ set substitution {[format \e\e\e\eu%04x [scan "\e\e&" %c]]} set quoted [subst [string map {\en {\e\eu000a}} \e [\fBregsub\fR -all $RE $string $substitution]]] .CE +.PP +.VS 8.7 +The above operation can be done using \fBregsub \-command\fR instead, which is +often faster. (A full pre-computed \fBstring map\fR would be faster still, but +the cost of computing the map for a transformation as complex as this can be +quite large.) +.PP +.CS +# This RE is just a character class for everything "bad" +set RE {[][{};#\e\e\e$\es\eu0080-\euffff]} + +# This encodes what the RE described above matches +proc encodeChar {ch} { + # newline is handled specially since backslash-newline is a + # special sequence. + if {$ch eq "\en"} { + return "\e\eu000a" + } + # No point in writing this as a one-liner + scan $ch %c charNumber + format "\e\eu%04x" $charNumber +} + +set quoted [\fBregsub\fR -all -command $RE $string encodeChar] +.CE +.PP +Decoding a URL-encoded string using \fBregsub \-command\fR, a lambda term and +the \fBapply\fR command. +.PP +.CS +# Match one of the sequences in a URL-encoded string that needs +# fixing, converting + to space and %XX to the right character +# (e.g., %7e becomes ~) +set RE {(\e+)|%([0-9A-Fa-f]{2})} + +# Note that -command uses a command prefix, not a command name +set decoded [\fBregsub\fR -all -command $RE $string {apply {{- p h} { + # + is a special case; handle directly + if {$p eq "+"} { + return " " + } + # convert hex to a char + scan $h %x charNumber + format %c $charNumber +}}}] +.CE +.VE 8.7 .SH "SEE ALSO" regexp(n), re_syntax(n), subst(n), string(n) .SH KEYWORDS diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d5a6b01..4178ba8 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -500,8 +500,8 @@ Tcl_RegsubObjCmd( "--", NULL }; enum options { - REGSUB_ALL, REGSUB_COMMAND, REGSUB_EXPANDED, REGSUB_LINE, - REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_NOCASE, REGSUB_START, + REGSUB_ALL, REGSUB_COMMAND, REGSUB_EXPANDED, REGSUB_LINE, + REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_NOCASE, REGSUB_START, REGSUB_LAST }; -- cgit v0.12