Implement TIP #463: Command-Driven Substitutions for regsub

author: dkf <donal.k.fellows@manchester.ac.uk> 2017-06-22 21:46:40 (GMT)
committer: dkf <donal.k.fellows@manchester.ac.uk> 2017-06-22 21:46:40 (GMT)
commit: f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff (patch)
tree: 318ce059db03075568b505a7caa2624fce436e13
parent: f57dacd4955b9dd474cfd9d4e4d55cac22990f6c (diff)
parent: 69201f94bdcef502012a231e2100bceef5062f90 (diff)
download: tcl-f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff.zip
tcl-f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff.tar.gz
tcl-f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff.tar.bz2
4 files changed, 256 insertions, 9 deletions
diff --git a/doc/regsub.n b/doc/regsub.n
index a5b79de..23bbff9 100644
--- a/doc/regsub.n
+++ b/doc/regsub.n
@@ -68,6 +68,31 @@ and
 sequences are handled for each substitution using the information
 from the corresponding match.
 .TP
+\fB\-command\fR
+.VS 8.7
+Changes the handling of the substitution string so that it no longer treats
+.QW &
+and
+.QW \e
+as special characters, but instead uses them as a non-empty list of words.
+Each time a substitution is processed, another complete Tcl word is appended
+to that list for each substitution value (the first such argument represents
+the overall matched substring, the subsequent arguments will be one per
+capturing sub-RE, much as are returned from \fBregexp\fR \fB\-inline\fR) and
+the overall list is then evaluated as a Tcl command call. If the command
+finishes successfully, the result of command call is substituted into the
+resulting string.
+.RS
+.PP
+If \fB\-all\fR is not also given, the command callback will be invoked at most
+once (exactly when the regular expression matches). If \fB\-all\fR is given,
+the command callback will be invoked for each matched location, in sequence.
+The exact location indices that matched are not made available to the script.
+.PP
+See \fBEXAMPLES\fR below for illustrative cases.
+.RE
+.VE 8.7
+.TP
 \fB\-expanded\fR
 .
 Enables use of the expanded regular expression syntax where
@@ -183,6 +208,53 @@ set substitution {[format \e\e\e\eu%04x [scan "\e\e&" %c]]}
 set quoted [subst [string map {\en {\e\eu000a}} \e
         [\fBregsub\fR -all $RE $string $substitution]]]
 .CE
+.PP
+.VS 8.7
+The above operation can be done using \fBregsub \-command\fR instead, which is
+often faster. (A full pre-computed \fBstring map\fR would be faster still, but
+the cost of computing the map for a transformation as complex as this can be
+quite large.)
+.PP
+.CS
+# This RE is just a character class for everything "bad"
+set RE {[][{};#\e\e\e$\es\eu0080-\euffff]}
+
+# This encodes what the RE described above matches
+proc encodeChar {ch} {
+    # newline is handled specially since backslash-newline is a
+    # special sequence.
+    if {$ch eq "\en"} {
+        return "\e\eu000a"
+    }
+    # No point in writing this as a one-liner
+    scan $ch %c charNumber
+    format "\e\eu%04x" $charNumber
+}
+
+set quoted [\fBregsub\fR -all -command $RE $string encodeChar]
+.CE
+.PP
+Decoding a URL-encoded string using \fBregsub \-command\fR, a lambda term and
+the \fBapply\fR command.
+.PP
+.CS
+# Match one of the sequences in a URL-encoded string that needs
+# fixing, converting + to space and %XX to the right character
+# (e.g., %7e becomes ~)
+set RE {(\e+)|%([0-9A-Fa-f]{2})}
+
+# Note that -command uses a command prefix, not a command name
+set decoded [\fBregsub\fR -all -command $RE $string {apply {{- p h} {
+    # + is a special case; handle directly
+    if {$p eq "+"} {
+        return " "
+    }
+    # convert hex to a char
+    scan $h %x charNumber
+    format %c $charNumber
+}}}]
+.CE
+.VE 8.7
 .SH "SEE ALSO"
 regexp(n), re_syntax(n), subst(n), string(n)
 .SH KEYWORDS
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index e24dd58..2c6b7bb 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -487,26 +487,28 @@ Tcl_RegsubObjCmd(
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
     int idx, result, cflags, all, wlen, wsublen, numMatches, offset;
-    int start, end, subStart, subEnd, match;
+    int start, end, subStart, subEnd, match, command, numParts, numArgs;
     Tcl_RegExp regExpr;
     Tcl_RegExpInfo info;
     Tcl_Obj *resultPtr, *subPtr, *objPtr, *startIndex = NULL;
+    Tcl_Obj **args = NULL, **parts;
     Tcl_UniChar ch, *wsrc, *wfirstChar, *wstring, *wsubspec, *wend;
 
     static const char *const options[] = {
-	"-all",		"-nocase",	"-expanded",
-	"-line",	"-linestop",	"-lineanchor",	"-start",
+	"-all",		"-command",	"-expanded",	"-line",
+	"-linestop",	"-lineanchor",	"-nocase",	"-start",
 	"--",		NULL
     };
     enum options {
-	REGSUB_ALL,	REGSUB_NOCASE,	REGSUB_EXPANDED,
-	REGSUB_LINE,	REGSUB_LINESTOP, REGSUB_LINEANCHOR,	REGSUB_START,
+	REGSUB_ALL,	 REGSUB_COMMAND,    REGSUB_EXPANDED, REGSUB_LINE,
+	REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_NOCASE,   REGSUB_START,
 	REGSUB_LAST
     };
 
     cflags = TCL_REG_ADVANCED;
     all = 0;
     offset = 0;
+    command = 0;
     resultPtr = NULL;
 
     for (idx = 1; idx < objc; idx++) {
@@ -528,6 +530,9 @@ Tcl_RegsubObjCmd(
 	case REGSUB_NOCASE:
 	    cflags |= TCL_REG_NOCASE;
 	    break;
+	case REGSUB_COMMAND:
+	    command = 1;
+	    break;
 	case REGSUB_EXPANDED:
 	    cflags |= TCL_REG_EXPANDED;
 	    break;
@@ -585,7 +590,7 @@ Tcl_RegsubObjCmd(
 	}
     }
 
-    if (all && (offset == 0)
+    if (all && (offset == 0) && (command == 0)
 	    && (strpbrk(TclGetString(objv[2]), "&\\") == NULL)
 	    && (strpbrk(TclGetString(objv[0]), "*+?{}()[].\\|^$") == NULL)) {
 	/*
@@ -661,6 +666,28 @@ Tcl_RegsubObjCmd(
 	return TCL_ERROR;
     }
 
+    if (command) {
+	/*
+	 * In command-prefix mode, we require that the third non-option
+	 * argument be a list, so we enforce that here. Afterwards, we fetch
+	 * the RE compilation again in case objv[0] and objv[2] are the same
+	 * object. (If they aren't, that's cheap to do.)
+	 */
+
+	if (Tcl_ListObjLength(interp, objv[2], &numParts) != TCL_OK) {
+	    return TCL_ERROR;
+	}
+	if (numParts < 1) {
+	    Tcl_SetObjResult(interp, Tcl_NewStringObj(
+		    "command prefix must be a list of at least one element",
+		    -1));
+	    Tcl_SetErrorCode(interp, "TCL", "OPERATION", "REGSUB",
+		    "CMDEMPTY", NULL);
+	    return TCL_ERROR;
+	}
+	regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags);
+    }
+
     /*
      * Make sure to avoid problems where the objects are shared. This can
      * cause RegExpObj <> UnicodeObj shimmering that causes data corruption.
@@ -678,7 +705,9 @@ Tcl_RegsubObjCmd(
     } else {
 	subPtr = objv[2];
     }
-    wsubspec = Tcl_GetUnicodeFromObj(subPtr, &wsublen);
+    if (!command) {
+	wsubspec = Tcl_GetUnicodeFromObj(subPtr, &wsublen);
+    }
 
     result = TCL_OK;
 
@@ -737,6 +766,88 @@ Tcl_RegsubObjCmd(
 	Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, start);
 
 	/*
+	 * In command-prefix mode, the substitutions are added as quoted
+	 * arguments to the subSpec to form a command, that is then executed
+	 * and the result used as the string to substitute in. Actually,
+	 * everything is passed through Tcl_EvalObjv, as that's much faster.
+	 */
+
+	if (command) {
+	    if (args == NULL) {
+		Tcl_ListObjGetElements(interp, subPtr, &numParts, &parts);
+		numArgs = numParts + info.nsubs + 1;
+		args = ckalloc(sizeof(Tcl_Obj*) * numArgs);
+		memcpy(args, parts, sizeof(Tcl_Obj*) * numParts);
+	    }
+
+	    for (idx = 0 ; idx <= info.nsubs ; idx++) {
+		subStart = info.matches[idx].start;
+		subEnd = info.matches[idx].end;
+		if ((subStart >= 0) && (subEnd >= 0)) {
+		    args[idx + numParts] = Tcl_NewUnicodeObj(
+			    wstring + offset + subStart, subEnd - subStart);
+		} else {
+		    args[idx + numParts] = Tcl_NewObj();
+		}
+		Tcl_IncrRefCount(args[idx + numParts]);
+	    }
+
+	    /*
+	     * At this point, we're locally holding the references to the
+	     * argument words we added for this time round the loop, and the
+	     * subPtr is holding the references to the words that the user
+	     * supplied directly. None are zero-refcount, which is important
+	     * because Tcl_EvalObjv is "hairy monster" in terms of refcount
+	     * handling, being able to optionally add references to any of its
+	     * argument words. We'll drop the local refs immediately
+	     * afterwarsds; subPtr is handled in the main exit stanza.
+	     */
+
+	    result = Tcl_EvalObjv(interp, numArgs, args, 0);
+	    for (idx = 0 ; idx <= info.nsubs ; idx++) {
+		TclDecrRefCount(args[idx + numParts]);
+	    }
+	    if (result != TCL_OK) {
+		if (result == TCL_ERROR) {
+		    Tcl_AppendObjToErrorInfo(interp, Tcl_ObjPrintf(
+			    "\n    (%s substitution computation script)",
+			    options[REGSUB_COMMAND]));
+		}
+		goto done;
+	    }
+
+	    Tcl_AppendObjToObj(resultPtr, Tcl_GetObjResult(interp));
+	    Tcl_ResetResult(interp);
+
+	    /*
+	     * Refetch the unicode, in case the representation was smashed by
+	     * the user code.
+	     */
+
+	    wstring = Tcl_GetUnicodeFromObj(objPtr, &wlen);
+
+	    offset += end;
+	    if (end == 0 || start == end) {
+		/*
+		 * Always consume at least one character of the input string
+		 * in order to prevent infinite loops, even when we
+		 * technically matched the empty string; we must not match
+		 * again at the same spot.
+		 */
+
+		if (offset < wlen) {
+		    Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1);
+		}
+		offset++;
+	    }
+	    if (all) {
+		continue;
+	    } else {
+		break;
+	    }
+	}
+
+	/*
 	 * Append the subSpec argument to the variable, making appropriate
 	 * substitutions. This code is a bit hairy because of the backslash
 	 * conventions and because the code saves up ranges of characters in
@@ -864,6 +975,9 @@ Tcl_RegsubObjCmd(
     if (subPtr && (objv[2] == objv[0])) {
 	Tcl_DecrRefCount(subPtr);
     }
+    if (args) {
+	ckfree(args);
+    }
     if (resultPtr) {
 	Tcl_DecrRefCount(resultPtr);
     }
diff --git a/tests/regexp.test b/tests/regexp.test
index 4ffdbdb..2686526 100644
--- a/tests/regexp.test
+++ b/tests/regexp.test
@@ -19,6 +19,20 @@ if {"::tcltest" ni [namespace children]} {
 unset -nocomplain foo
 
 testConstraint exec [llength [info commands exec]]
+
+# Used for constraining memory leak tests
+testConstraint memory [llength [info commands memory]]
+if {[testConstraint memory]} {
+    proc memtest script {
+	set end [lindex [split [memory info] \n] 3 3]
+	for {set i 0} {$i < 5} {incr i} {
+	    uplevel 1 $script
+	    set tmp $end
+	    set end [lindex [split [memory info] \n] 3 3]
+	}
+	expr {$end - $tmp}
+    }
+}
 
 test regexp-1.1 {basic regexp operation} {
     regexp ab*c abbbc
@@ -453,7 +467,7 @@ test regexp-11.4 {regsub errors} {
 } {1 {wrong # args: should be "regsub ?-option ...? exp string subSpec ?varName?"}}
 test regexp-11.5 {regsub errors} {
     list [catch {regsub -gorp a b c} msg] $msg
-} {1 {bad option "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}}
+} {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}}
 test regexp-11.6 {regsub errors} {
     list [catch {regsub -nocase a( b c d} msg] $msg
 } {1 {couldn't compile regular expression pattern: parentheses () not balanced}}
@@ -1123,6 +1137,53 @@ test regexp-26.12 {regexp with -line option} {
 test regexp-26.13 {regexp without -line option} {
     regexp -all -inline -- {a*} "b\n"
 } {{} {}}
+
+test regexp-27.1 {regsub -command} {
+    regsub -command {.x.} {abcxdef} {string length}
+} ab3ef
+test regexp-27.2 {regsub -command} {
+    regsub -command {.x.} {abcxdefxghi} {string length}
+} ab3efxghi
+test regexp-27.3 {regsub -command} {
+    set x 0
+    regsub -all -command {(?=.)} abcde {apply {args {incr ::x}}}
+} 1a2b3c4d5e
+test regexp-27.4 {regsub -command} -body {
+    regsub -command {.x.} {abcxdef} error
+} -returnCodes error -result cxd
+test regexp-27.5 {regsub -command} {
+    regsub -command {(.)(.)} {abcdef} {list ,}
+} {, ab a bcdef}
+test regexp-27.6 {regsub -command} {
+    regsub -command -all {(.)(.)} {abcdef} {list ,}
+} {, ab a b, cd c d, ef e f}
+test regexp-27.7 {regsub -command representation smash} {
+    set ::s {123=456 789}
+    regsub -command -all {\d+} $::s {apply {n {
+	expr {[llength $::s] + $n}
+    }}}
+} {125=458 791}
+test regexp-27.8 {regsub -command representation smash} {
+    set ::t {apply {n {
+	expr {[llength [lindex $::t 1 1 1]] + $n}
+    }}}
+    regsub -command -all {\d+} "123=456 789" $::t
+} {131=464 797}
+test regexp-27.9 {regsub -command memory leak testing} memory {
+    set ::s "123=456 789"
+    set ::t {apply {n {
+	expr {[llength [lindex $::t 1 1 1]] + [llength $::s] + $n}
+    }}}
+    memtest {
+	regsub -command -all {\d+} $::s $::t
+    }
+} 0
+test regexp-27.10 {regsub -command error cases} -returnCodes error -body {
+    regsub -command . abc "def \{ghi"
+} -result {unmatched open brace in list}
+test regexp-27.11 {regsub -command error cases} -returnCodes error -body {
+    regsub -command . abc {}
+} -result {command prefix must be a list of at least one element}
 
 # cleanup
 ::tcltest::cleanupTests
diff --git a/tests/regexpComp.test b/tests/regexpComp.test
index b8e64b6..fbf8012 100644
--- a/tests/regexpComp.test
+++ b/tests/regexpComp.test
@@ -587,7 +587,7 @@ test regexpComp-11.5 {regsub errors} {
     evalInProc {
 	list [catch {regsub -gorp a b c} msg] $msg
     }
-} {1 {bad option "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}}
+} {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}}
 test regexpComp-11.6 {regsub errors} {
     evalInProc {
 	list [catch {regsub -nocase a( b c d} msg] $msg
author	dkf <donal.k.fellows@manchester.ac.uk>	2017-06-22 21:46:40 (GMT)
committer	dkf <donal.k.fellows@manchester.ac.uk>	2017-06-22 21:46:40 (GMT)
commit	f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff (patch)
tree	318ce059db03075568b505a7caa2624fce436e13
parent	f57dacd4955b9dd474cfd9d4e4d55cac22990f6c (diff)
parent	69201f94bdcef502012a231e2100bceef5062f90 (diff)
download	tcl-f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff.zip tcl-f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff.tar.gz tcl-f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff.tar.bz2