summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordkf <donal.k.fellows@manchester.ac.uk>2017-06-22 21:46:40 (GMT)
committerdkf <donal.k.fellows@manchester.ac.uk>2017-06-22 21:46:40 (GMT)
commitf5cf6bbf990d8bb8c07e986c9f67c94f75c878ff (patch)
tree318ce059db03075568b505a7caa2624fce436e13
parentf57dacd4955b9dd474cfd9d4e4d55cac22990f6c (diff)
parent69201f94bdcef502012a231e2100bceef5062f90 (diff)
downloadtcl-f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff.zip
tcl-f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff.tar.gz
tcl-f5cf6bbf990d8bb8c07e986c9f67c94f75c878ff.tar.bz2
Implement TIP #463: Command-Driven Substitutions for regsub
-rw-r--r--doc/regsub.n72
-rw-r--r--generic/tclCmdMZ.c128
-rw-r--r--tests/regexp.test63
-rw-r--r--tests/regexpComp.test2
4 files changed, 256 insertions, 9 deletions
diff --git a/doc/regsub.n b/doc/regsub.n
index a5b79de..23bbff9 100644
--- a/doc/regsub.n
+++ b/doc/regsub.n
@@ -68,6 +68,31 @@ and
sequences are handled for each substitution using the information
from the corresponding match.
.TP
+\fB\-command\fR
+.VS 8.7
+Changes the handling of the substitution string so that it no longer treats
+.QW &
+and
+.QW \e
+as special characters, but instead uses them as a non-empty list of words.
+Each time a substitution is processed, another complete Tcl word is appended
+to that list for each substitution value (the first such argument represents
+the overall matched substring, the subsequent arguments will be one per
+capturing sub-RE, much as are returned from \fBregexp\fR \fB\-inline\fR) and
+the overall list is then evaluated as a Tcl command call. If the command
+finishes successfully, the result of command call is substituted into the
+resulting string.
+.RS
+.PP
+If \fB\-all\fR is not also given, the command callback will be invoked at most
+once (exactly when the regular expression matches). If \fB\-all\fR is given,
+the command callback will be invoked for each matched location, in sequence.
+The exact location indices that matched are not made available to the script.
+.PP
+See \fBEXAMPLES\fR below for illustrative cases.
+.RE
+.VE 8.7
+.TP
\fB\-expanded\fR
.
Enables use of the expanded regular expression syntax where
@@ -183,6 +208,53 @@ set substitution {[format \e\e\e\eu%04x [scan "\e\e&" %c]]}
set quoted [subst [string map {\en {\e\eu000a}} \e
[\fBregsub\fR -all $RE $string $substitution]]]
.CE
+.PP
+.VS 8.7
+The above operation can be done using \fBregsub \-command\fR instead, which is
+often faster. (A full pre-computed \fBstring map\fR would be faster still, but
+the cost of computing the map for a transformation as complex as this can be
+quite large.)
+.PP
+.CS
+# This RE is just a character class for everything "bad"
+set RE {[][{};#\e\e\e$\es\eu0080-\euffff]}
+
+# This encodes what the RE described above matches
+proc encodeChar {ch} {
+ # newline is handled specially since backslash-newline is a
+ # special sequence.
+ if {$ch eq "\en"} {
+ return "\e\eu000a"
+ }
+ # No point in writing this as a one-liner
+ scan $ch %c charNumber
+ format "\e\eu%04x" $charNumber
+}
+
+set quoted [\fBregsub\fR -all -command $RE $string encodeChar]
+.CE
+.PP
+Decoding a URL-encoded string using \fBregsub \-command\fR, a lambda term and
+the \fBapply\fR command.
+.PP
+.CS
+# Match one of the sequences in a URL-encoded string that needs
+# fixing, converting + to space and %XX to the right character
+# (e.g., %7e becomes ~)
+set RE {(\e+)|%([0-9A-Fa-f]{2})}
+
+# Note that -command uses a command prefix, not a command name
+set decoded [\fBregsub\fR -all -command $RE $string {apply {{- p h} {
+ # + is a special case; handle directly
+ if {$p eq "+"} {
+ return " "
+ }
+ # convert hex to a char
+ scan $h %x charNumber
+ format %c $charNumber
+}}}]
+.CE
+.VE 8.7
.SH "SEE ALSO"
regexp(n), re_syntax(n), subst(n), string(n)
.SH KEYWORDS
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index e24dd58..2c6b7bb 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -487,26 +487,28 @@ Tcl_RegsubObjCmd(
Tcl_Obj *const objv[]) /* Argument objects. */
{
int idx, result, cflags, all, wlen, wsublen, numMatches, offset;
- int start, end, subStart, subEnd, match;
+ int start, end, subStart, subEnd, match, command, numParts, numArgs;
Tcl_RegExp regExpr;
Tcl_RegExpInfo info;
Tcl_Obj *resultPtr, *subPtr, *objPtr, *startIndex = NULL;
+ Tcl_Obj **args = NULL, **parts;
Tcl_UniChar ch, *wsrc, *wfirstChar, *wstring, *wsubspec, *wend;
static const char *const options[] = {
- "-all", "-nocase", "-expanded",
- "-line", "-linestop", "-lineanchor", "-start",
+ "-all", "-command", "-expanded", "-line",
+ "-linestop", "-lineanchor", "-nocase", "-start",
"--", NULL
};
enum options {
- REGSUB_ALL, REGSUB_NOCASE, REGSUB_EXPANDED,
- REGSUB_LINE, REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_START,
+ REGSUB_ALL, REGSUB_COMMAND, REGSUB_EXPANDED, REGSUB_LINE,
+ REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_NOCASE, REGSUB_START,
REGSUB_LAST
};
cflags = TCL_REG_ADVANCED;
all = 0;
offset = 0;
+ command = 0;
resultPtr = NULL;
for (idx = 1; idx < objc; idx++) {
@@ -528,6 +530,9 @@ Tcl_RegsubObjCmd(
case REGSUB_NOCASE:
cflags |= TCL_REG_NOCASE;
break;
+ case REGSUB_COMMAND:
+ command = 1;
+ break;
case REGSUB_EXPANDED:
cflags |= TCL_REG_EXPANDED;
break;
@@ -585,7 +590,7 @@ Tcl_RegsubObjCmd(
}
}
- if (all && (offset == 0)
+ if (all && (offset == 0) && (command == 0)
&& (strpbrk(TclGetString(objv[2]), "&\\") == NULL)
&& (strpbrk(TclGetString(objv[0]), "*+?{}()[].\\|^$") == NULL)) {
/*
@@ -661,6 +666,28 @@ Tcl_RegsubObjCmd(
return TCL_ERROR;
}
+ if (command) {
+ /*
+ * In command-prefix mode, we require that the third non-option
+ * argument be a list, so we enforce that here. Afterwards, we fetch
+ * the RE compilation again in case objv[0] and objv[2] are the same
+ * object. (If they aren't, that's cheap to do.)
+ */
+
+ if (Tcl_ListObjLength(interp, objv[2], &numParts) != TCL_OK) {
+ return TCL_ERROR;
+ }
+ if (numParts < 1) {
+ Tcl_SetObjResult(interp, Tcl_NewStringObj(
+ "command prefix must be a list of at least one element",
+ -1));
+ Tcl_SetErrorCode(interp, "TCL", "OPERATION", "REGSUB",
+ "CMDEMPTY", NULL);
+ return TCL_ERROR;
+ }
+ regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags);
+ }
+
/*
* Make sure to avoid problems where the objects are shared. This can
* cause RegExpObj <> UnicodeObj shimmering that causes data corruption.
@@ -678,7 +705,9 @@ Tcl_RegsubObjCmd(
} else {
subPtr = objv[2];
}
- wsubspec = Tcl_GetUnicodeFromObj(subPtr, &wsublen);
+ if (!command) {
+ wsubspec = Tcl_GetUnicodeFromObj(subPtr, &wsublen);
+ }
result = TCL_OK;
@@ -737,6 +766,88 @@ Tcl_RegsubObjCmd(
Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, start);
/*
+ * In command-prefix mode, the substitutions are added as quoted
+ * arguments to the subSpec to form a command, that is then executed
+ * and the result used as the string to substitute in. Actually,
+ * everything is passed through Tcl_EvalObjv, as that's much faster.
+ */
+
+ if (command) {
+ if (args == NULL) {
+ Tcl_ListObjGetElements(interp, subPtr, &numParts, &parts);
+ numArgs = numParts + info.nsubs + 1;
+ args = ckalloc(sizeof(Tcl_Obj*) * numArgs);
+ memcpy(args, parts, sizeof(Tcl_Obj*) * numParts);
+ }
+
+ for (idx = 0 ; idx <= info.nsubs ; idx++) {
+ subStart = info.matches[idx].start;
+ subEnd = info.matches[idx].end;
+ if ((subStart >= 0) && (subEnd >= 0)) {
+ args[idx + numParts] = Tcl_NewUnicodeObj(
+ wstring + offset + subStart, subEnd - subStart);
+ } else {
+ args[idx + numParts] = Tcl_NewObj();
+ }
+ Tcl_IncrRefCount(args[idx + numParts]);
+ }
+
+ /*
+ * At this point, we're locally holding the references to the
+ * argument words we added for this time round the loop, and the
+ * subPtr is holding the references to the words that the user
+ * supplied directly. None are zero-refcount, which is important
+ * because Tcl_EvalObjv is "hairy monster" in terms of refcount
+ * handling, being able to optionally add references to any of its
+ * argument words. We'll drop the local refs immediately
+ * afterwarsds; subPtr is handled in the main exit stanza.
+ */
+
+ result = Tcl_EvalObjv(interp, numArgs, args, 0);
+ for (idx = 0 ; idx <= info.nsubs ; idx++) {
+ TclDecrRefCount(args[idx + numParts]);
+ }
+ if (result != TCL_OK) {
+ if (result == TCL_ERROR) {
+ Tcl_AppendObjToErrorInfo(interp, Tcl_ObjPrintf(
+ "\n (%s substitution computation script)",
+ options[REGSUB_COMMAND]));
+ }
+ goto done;
+ }
+
+ Tcl_AppendObjToObj(resultPtr, Tcl_GetObjResult(interp));
+ Tcl_ResetResult(interp);
+
+ /*
+ * Refetch the unicode, in case the representation was smashed by
+ * the user code.
+ */
+
+ wstring = Tcl_GetUnicodeFromObj(objPtr, &wlen);
+
+ offset += end;
+ if (end == 0 || start == end) {
+ /*
+ * Always consume at least one character of the input string
+ * in order to prevent infinite loops, even when we
+ * technically matched the empty string; we must not match
+ * again at the same spot.
+ */
+
+ if (offset < wlen) {
+ Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1);
+ }
+ offset++;
+ }
+ if (all) {
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ /*
* Append the subSpec argument to the variable, making appropriate
* substitutions. This code is a bit hairy because of the backslash
* conventions and because the code saves up ranges of characters in
@@ -864,6 +975,9 @@ Tcl_RegsubObjCmd(
if (subPtr && (objv[2] == objv[0])) {
Tcl_DecrRefCount(subPtr);
}
+ if (args) {
+ ckfree(args);
+ }
if (resultPtr) {
Tcl_DecrRefCount(resultPtr);
}
diff --git a/tests/regexp.test b/tests/regexp.test
index 4ffdbdb..2686526 100644
--- a/tests/regexp.test
+++ b/tests/regexp.test
@@ -19,6 +19,20 @@ if {"::tcltest" ni [namespace children]} {
unset -nocomplain foo
testConstraint exec [llength [info commands exec]]
+
+# Used for constraining memory leak tests
+testConstraint memory [llength [info commands memory]]
+if {[testConstraint memory]} {
+ proc memtest script {
+ set end [lindex [split [memory info] \n] 3 3]
+ for {set i 0} {$i < 5} {incr i} {
+ uplevel 1 $script
+ set tmp $end
+ set end [lindex [split [memory info] \n] 3 3]
+ }
+ expr {$end - $tmp}
+ }
+}
test regexp-1.1 {basic regexp operation} {
regexp ab*c abbbc
@@ -453,7 +467,7 @@ test regexp-11.4 {regsub errors} {
} {1 {wrong # args: should be "regsub ?-option ...? exp string subSpec ?varName?"}}
test regexp-11.5 {regsub errors} {
list [catch {regsub -gorp a b c} msg] $msg
-} {1 {bad option "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}}
+} {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}}
test regexp-11.6 {regsub errors} {
list [catch {regsub -nocase a( b c d} msg] $msg
} {1 {couldn't compile regular expression pattern: parentheses () not balanced}}
@@ -1123,6 +1137,53 @@ test regexp-26.12 {regexp with -line option} {
test regexp-26.13 {regexp without -line option} {
regexp -all -inline -- {a*} "b\n"
} {{} {}}
+
+test regexp-27.1 {regsub -command} {
+ regsub -command {.x.} {abcxdef} {string length}
+} ab3ef
+test regexp-27.2 {regsub -command} {
+ regsub -command {.x.} {abcxdefxghi} {string length}
+} ab3efxghi
+test regexp-27.3 {regsub -command} {
+ set x 0
+ regsub -all -command {(?=.)} abcde {apply {args {incr ::x}}}
+} 1a2b3c4d5e
+test regexp-27.4 {regsub -command} -body {
+ regsub -command {.x.} {abcxdef} error
+} -returnCodes error -result cxd
+test regexp-27.5 {regsub -command} {
+ regsub -command {(.)(.)} {abcdef} {list ,}
+} {, ab a bcdef}
+test regexp-27.6 {regsub -command} {
+ regsub -command -all {(.)(.)} {abcdef} {list ,}
+} {, ab a b, cd c d, ef e f}
+test regexp-27.7 {regsub -command representation smash} {
+ set ::s {123=456 789}
+ regsub -command -all {\d+} $::s {apply {n {
+ expr {[llength $::s] + $n}
+ }}}
+} {125=458 791}
+test regexp-27.8 {regsub -command representation smash} {
+ set ::t {apply {n {
+ expr {[llength [lindex $::t 1 1 1]] + $n}
+ }}}
+ regsub -command -all {\d+} "123=456 789" $::t
+} {131=464 797}
+test regexp-27.9 {regsub -command memory leak testing} memory {
+ set ::s "123=456 789"
+ set ::t {apply {n {
+ expr {[llength [lindex $::t 1 1 1]] + [llength $::s] + $n}
+ }}}
+ memtest {
+ regsub -command -all {\d+} $::s $::t
+ }
+} 0
+test regexp-27.10 {regsub -command error cases} -returnCodes error -body {
+ regsub -command . abc "def \{ghi"
+} -result {unmatched open brace in list}
+test regexp-27.11 {regsub -command error cases} -returnCodes error -body {
+ regsub -command . abc {}
+} -result {command prefix must be a list of at least one element}
# cleanup
::tcltest::cleanupTests
diff --git a/tests/regexpComp.test b/tests/regexpComp.test
index b8e64b6..fbf8012 100644
--- a/tests/regexpComp.test
+++ b/tests/regexpComp.test
@@ -587,7 +587,7 @@ test regexpComp-11.5 {regsub errors} {
evalInProc {
list [catch {regsub -gorp a b c} msg] $msg
}
-} {1 {bad option "-gorp": must be -all, -nocase, -expanded, -line, -linestop, -lineanchor, -start, or --}}
+} {1 {bad option "-gorp": must be -all, -command, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}}
test regexpComp-11.6 {regsub errors} {
evalInProc {
list [catch {regsub -nocase a( b c d} msg] $msg