From 6aa0cc7188b6df1dac97b03bc0b9240aa780799b Mon Sep 17 00:00:00 2001 From: dkf Date: Sat, 18 Feb 2017 18:38:52 +0000 Subject: Add documentation of [regsub -command]. --- doc/regsub.n | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ generic/tclCmdMZ.c | 4 +-- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/doc/regsub.n b/doc/regsub.n index a5b79de..23bbff9 100644 --- a/doc/regsub.n +++ b/doc/regsub.n @@ -68,6 +68,31 @@ and sequences are handled for each substitution using the information from the corresponding match. .TP +\fB\-command\fR +.VS 8.7 +Changes the handling of the substitution string so that it no longer treats +.QW & +and +.QW \e +as special characters, but instead uses it as a non-empty list of words. +Each time a substitution is processed, another complete Tcl word is appended +to that list for each substitution value (the first such argument represents +the overall matched substring, the subsequent arguments will be one per +capturing sub-RE, much as are returned from \fBregexp\fR \fB\-inline\fR) and +the overall list is then evaluated as a Tcl command call. If the command +finishes successfully, the result of the command call is substituted into the +resulting string. +.RS +.PP +If \fB\-all\fR is not also given, the command callback will be invoked at most +once (exactly when the regular expression matches). If \fB\-all\fR is given, +the command callback will be invoked for each matched location, in sequence. +The exact location indices that matched are not made available to the script. +.PP +See \fBEXAMPLES\fR below for illustrative cases. +.RE +.VE 8.7 +.TP \fB\-expanded\fR . Enables use of the expanded regular expression syntax where @@ -183,6 +208,53 @@ set substitution {[format \e\e\e\eu%04x [scan "\e\e&" %c]]} set quoted [subst [string map {\en {\e\eu000a}} \e [\fBregsub\fR -all $RE $string $substitution]]] .CE +.PP +.VS 8.7 +The above operation can be done using \fBregsub \-command\fR instead, which is +often faster. 
(A full pre-computed \fBstring map\fR would be faster still, but +the cost of computing the map for a transformation as complex as this can be +quite large.) +.PP +.CS +# This RE is just a character class for everything "bad" +set RE {[][{};#\e\e\e$\es\eu0080-\euffff]} + +# This encodes what the RE described above matches +proc encodeChar {ch} { + # newline is handled specially since backslash-newline is a + # special sequence. + if {$ch eq "\en"} { + return "\e\eu000a" + } + # No point in writing this as a one-liner + scan $ch %c charNumber + format "\e\eu%04x" $charNumber +} + +set quoted [\fBregsub\fR -all -command $RE $string encodeChar] +.CE +.PP +Decoding a URL-encoded string using \fBregsub \-command\fR, a lambda term and +the \fBapply\fR command. +.PP +.CS +# Match one of the sequences in a URL-encoded string that needs +# fixing, converting + to space and %XX to the right character +# (e.g., %7e becomes ~) +set RE {(\e+)|%([0-9A-Fa-f]{2})} + +# Note that -command uses a command prefix, not a command name +set decoded [\fBregsub\fR -all -command $RE $string {apply {{- p h} { + # + is a special case; handle directly + if {$p eq "+"} { + return " " + } + # convert hex to a char + scan $h %x charNumber + format %c $charNumber +}}}] +.CE +.VE 8.7 .SH "SEE ALSO" regexp(n), re_syntax(n), subst(n), string(n) .SH KEYWORDS diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index d5a6b01..4178ba8 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -500,8 +500,8 @@ Tcl_RegsubObjCmd( "--", NULL }; enum options { - REGSUB_ALL, REGSUB_COMMAND, REGSUB_EXPANDED, REGSUB_LINE, - REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_NOCASE, REGSUB_START, + REGSUB_ALL, REGSUB_COMMAND, REGSUB_EXPANDED, REGSUB_LINE, + REGSUB_LINESTOP, REGSUB_LINEANCHOR, REGSUB_NOCASE, REGSUB_START, REGSUB_LAST }; -- cgit v0.12