diff options
-rw-r--r-- | ChangeLog | 7 | ||||
-rw-r--r-- | doc/regexp.n | 56 | ||||
-rw-r--r-- | generic/tclCmdMZ.c | 224 | ||||
-rw-r--r-- | tests/regexp.test | 62 |
4 files changed, 245 insertions, 104 deletions
@@ -1,3 +1,10 @@ +2000-02-02 Jeff Hobbs <hobbs@scriptics.com> + + * tests/regexp.test: added tests for -all and -inline switches + * doc/regexp.n: added docs for -all and -inline switches + * generic/tclCmdMZ.c (Tcl_RegexpObjCmd): added extra comments for + new -all and -inline switches to regexp command + 2000-02-01 Eric Melski <ericm@scriptics.com> * library/init.tcl: Applied patch from rfe 1734 regarding diff --git a/doc/regexp.n b/doc/regexp.n index 3b19420..373691a 100644 --- a/doc/regexp.n +++ b/doc/regexp.n @@ -4,7 +4,7 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: regexp.n,v 1.7 1999/09/21 04:20:36 hobbs Exp $ +'\" RCS: @(#) $Id: regexp.n,v 1.8 2000/02/02 22:32:09 hobbs Exp $ '\" .so man.macros .TH regexp n 8.3 Tcl "Tcl Built-In Commands" @@ -20,7 +20,8 @@ regexp \- Match a regular expression against a string .SH DESCRIPTION .PP Determines whether the regular expression \fIexp\fR matches part or -all of \fIstring\fR and returns 1 if it does, 0 if it doesn't. +all of \fIstring\fR and returns 1 if it does, 0 if it doesn't, unless +\fB-inline\fR is specified (see below). (Regular expression matching is described in the \fBre_syntax\fR reference page.) .LP @@ -38,9 +39,17 @@ If the initial arguments to \fBregexp\fR start with \fB\-\fR then they are treated as switches. The following switches are currently supported: .TP 15 -\fB\-nocase\fR -Causes upper-case characters in \fIstring\fR to be treated as -lower case during the matching process. +\fB\-about\fR +Instead of attempting to match the regular expression, returns a list +containing information about the regular expression. The first +element of the list is a subexpression count. The second element is a +list of property names that describe various attributes of the regular +expression. This switch is primarily intended for debugging purposes. +.TP 15 +\fB\-expanded\fR +Enables use of the expanded regular expression syntax where +whitespace and comments are ignored. This is the same as specifying +the \fB(?x)\fR embedded option (see METASYNTAX, below). .TP 15 \fB\-indices\fR Changes what is stored in the \fIsubMatchVar\fRs. @@ -49,12 +58,6 @@ each variable will contain a list of two decimal strings giving the indices in \fIstring\fR of the first and last characters in the matching range of characters. -.VS 8.1 -.TP 15 -\fB\-expanded\fR -Enables use of the expanded regular expression syntax where -whitespace and comments are ignored. This is the same as specifying -the \fB(?x)\fR embedded option (see METASYNTAX, below). .TP 15 \fB\-line\fR Enables newline-sensitive matching. By default, newline is a @@ -76,15 +79,32 @@ Changes the behavior of `^' and `$' (the ``anchors'') so they match the beginning and end of a line respectively. This is the same as specifying the \fB(?w)\fR embedded option (see METASYNTAX, below). .TP 15 -\fB\-about\fR -Instead of attempting to match the regular expression, returns a list -containing information about the regular expression. The first -element of the list is a subexpression count. The second element is a -list of property names that describe various attributes of the regular -expression. This switch is primarily intended for debugging purposes. -.VE 8.1 +\fB\-nocase\fR +Causes upper-case characters in \fIstring\fR to be treated as +lower case during the matching process. .VS 8.3 .TP 15 +\fB\-all\fR +Causes the regular expression to be matched as many times as possible +in the string, returning the total number of matches found. If this +is specified with match variables, they will continue information for +the last match only. +.TP 15 +\fB\-inline\fR +Causes the command to return the data that would otherwise be placed +in match variables to be returned as a list. When using \fB-inline\fR, +match variables may not be specified. If used with \fB-all\fR, the +list will be concatenated at each iteration, such that a flat list is +always returned. For each match iteration, the command will append the +overall match data, plus one element for each subexpression in the +regular expression. Examples are: +.CS + regexp -inline -- {\\w(\\w)} " inlined " + => {in n} + regexp -all -inline -- {\\w(\\w)} " inlined " + => {in n li i ne e} +.CE +.TP 15 \fB\-start\fR \fIindex\fR Specifies a character index offset into the string to start matching the regular expression at. When using this switch, `^' diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index af3da2c..52ad4d8 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -13,7 +13,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclCmdMZ.c,v 1.23 1999/12/12 02:26:41 hobbs Exp $ + * RCS: @(#) $Id: tclCmdMZ.c,v 1.24 2000/02/02 22:32:11 hobbs Exp $ */ #include "tclInt.h" @@ -126,27 +126,29 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) int objc; /* Number of arguments. */ Tcl_Obj *CONST objv[]; /* Argument objects. */ { - int i, indices, match, about, offset; + int i, indices, match, about, offset, all, doinline, numMatchesSaved; int cflags, eflags; Tcl_RegExp regExpr; - Tcl_Obj *objPtr; + Tcl_Obj *objPtr, *resultPtr; Tcl_RegExpInfo info; static char *options[] = { - "-indices", "-nocase", "-about", "-expanded", - "-line", "-linestop", "-lineanchor", "-start", - "--", (char *) NULL + "-all", "-about", "-indices", "-inline", + "-expanded", "-line", "-linestop", "-lineanchor", + "-nocase", "-start", "--", (char *) NULL }; enum options { - REGEXP_INDICES, REGEXP_NOCASE, REGEXP_ABOUT, REGEXP_EXPANDED, - REGEXP_LINE, REGEXP_LINESTOP, REGEXP_LINEANCHOR, REGEXP_START, - REGEXP_LAST + REGEXP_ALL, REGEXP_ABOUT, REGEXP_INDICES, REGEXP_INLINE, + REGEXP_EXPANDED,REGEXP_LINE, REGEXP_LINESTOP,REGEXP_LINEANCHOR, + REGEXP_NOCASE, REGEXP_START, REGEXP_LAST }; - indices = 0; - about = 0; - cflags = TCL_REG_ADVANCED; - eflags = 0; - offset = 0; + indices = 0; + about = 0; + cflags = TCL_REG_ADVANCED; + eflags = 0; + offset = 0; + all = 0; + doinline = 0; for (i = 1; i < objc; i++) { char *name; @@ -161,10 +163,18 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) return TCL_ERROR; } switch ((enum options) index) { + case REGEXP_ALL: { + all = 1; + break; + } case REGEXP_INDICES: { indices = 1; break; } + case REGEXP_INLINE: { + doinline = 1; + break; + } case REGEXP_NOCASE: { cflags |= TCL_REG_NOCASE; break; @@ -209,14 +219,22 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) } endOfForLoop: - if (objc - i < 2 - about) { - Tcl_WrongNumArgs(interp, 1, objv, - "?switches? exp string ?matchVar? ?subMatchVar subMatchVar ...?"); + if ((objc - i) < (2 - about)) { + Tcl_WrongNumArgs(interp, 1, objv, "?switches? exp string ?matchVar? ?subMatchVar subMatchVar ...?"); return TCL_ERROR; } objc -= i; objv += i; + if (doinline && ((objc - 2) != 0)) { + /* + * User requested -inline, but specified match variables - a no-no. + */ + Tcl_AppendResult(interp, "regexp match variables not allowed", + " when using -inline", (char *) NULL); + return TCL_ERROR; + } + regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags); if (regExpr == NULL) { return TCL_ERROR; @@ -238,84 +256,142 @@ Tcl_RegexpObjCmd(dummy, interp, objc, objv) eflags |= TCL_REG_NOTBOL; } - match = Tcl_RegExpExecObj(interp, regExpr, objPtr, offset /* offset */, - objc-2 /* nmatches */, eflags); - - if (match < 0) { - return TCL_ERROR; - } + objc -= 2; + objv += 2; + resultPtr = Tcl_GetObjResult(interp); - if (match == 0) { + if (doinline) { /* - * Set the interpreter's object result to an integer object w/ - * value 0. + * Save all the subexpressions, as we will return them as a list */ - - Tcl_SetIntObj(Tcl_GetObjResult(interp), 0); - return TCL_OK; + numMatchesSaved = -1; + } else { + /* + * Save only enough subexpressions for matches we want to keep, + * expect in the case of -all, where we need to keep at least + * one to know where to move the offset. + */ + numMatchesSaved = (objc == 0) ? all : objc; } - /* - * If additional variable names have been specified, return - * index information in those variables. - */ - - objc -= 2; - objv += 2; - - Tcl_RegExpGetInfo(regExpr, &info); - for (i = 0; i < objc; i++) { - Tcl_Obj *varPtr, *valuePtr, *newPtr; - - varPtr = objv[i]; - if (indices) { - int start, end; - Tcl_Obj *objs[2]; + while (1) { + match = Tcl_RegExpExecObj(interp, regExpr, objPtr, + offset /* offset */, numMatchesSaved, eflags); - if (i <= info.nsubs) { - start = offset + info.matches[i].start; - end = offset + info.matches[i].end; + if (match < 0) { + return TCL_ERROR; + } + if (match == 0) { + /* + * We want to set the value of the intepreter result only when + * this is the first time through the loop. + */ + if (all <= 1) { /* - * Adjust index so it refers to the last character in the - * match instead of the first character after the match. + * If inlining, set the interpreter's object result to an + * empty list, otherwise set it to an integer object w/ + * value 0. */ - - if (end >= offset) { - end--; + if (doinline) { + Tcl_SetListObj(resultPtr, 0, NULL); + } else { + Tcl_SetIntObj(resultPtr, 0); } - } else { - start = -1; - end = -1; + return TCL_OK; } + break; + } - objs[0] = Tcl_NewLongObj(start); - objs[1] = Tcl_NewLongObj(end); + /* + * If additional variable names have been specified, return + * index information in those variables. + */ - newPtr = Tcl_NewListObj(2, objs); - } else { - if (i <= info.nsubs) { - newPtr = Tcl_GetRange(objPtr, offset + info.matches[i].start, - offset + info.matches[i].end - 1); + Tcl_RegExpGetInfo(regExpr, &info); + if (doinline) { + /* + * It's the number of substitutions, plus one for the matchVar + * at index 0 + */ + objc = info.nsubs + 1; + } + for (i = 0; i < objc; i++) { + Tcl_Obj *newPtr; + + if (indices) { + int start, end; + Tcl_Obj *objs[2]; + + if (i <= info.nsubs) { + start = offset + info.matches[i].start; + end = offset + info.matches[i].end; + + /* + * Adjust index so it refers to the last character in the + * match instead of the first character after the match. + */ + + if (end >= offset) { + end--; + } + } else { + start = -1; + end = -1; + } + + objs[0] = Tcl_NewLongObj(start); + objs[1] = Tcl_NewLongObj(end); + + newPtr = Tcl_NewListObj(2, objs); } else { - newPtr = Tcl_NewObj(); - + if (i <= info.nsubs) { + newPtr = Tcl_GetRange(objPtr, + offset + info.matches[i].start, + offset + info.matches[i].end - 1); + } else { + newPtr = Tcl_NewObj(); + } + } + if (doinline) { + if (Tcl_ListObjAppendElement(interp, resultPtr, newPtr) + != TCL_OK) { + Tcl_DecrRefCount(newPtr); + return TCL_ERROR; + } + } else { + Tcl_Obj *valuePtr; + valuePtr = Tcl_ObjSetVar2(interp, objv[i], NULL, newPtr, 0); + if (valuePtr == NULL) { + Tcl_DecrRefCount(newPtr); + Tcl_AppendResult(interp, "couldn't set variable \"", + Tcl_GetString(objv[i]), "\"", (char *) NULL); + return TCL_ERROR; + } } } - valuePtr = Tcl_ObjSetVar2(interp, varPtr, NULL, newPtr, 0); - if (valuePtr == NULL) { - Tcl_DecrRefCount(newPtr); - Tcl_AppendResult(interp, "couldn't set variable \"", - Tcl_GetString(varPtr), "\"", (char *) NULL); - return TCL_ERROR; + + if (all == 0) { + break; } + /* + * Adjust the offset to the character just after the last one + * in the matchVar and increment all to count how many times + * we are making a match + */ + offset += info.matches[0].end; + all++; } /* - * Set the interpreter's object result to an integer object w/ value 1. + * Set the interpreter's object result to an integer object + * with value 1 if -all wasn't specified, otherwise it's all-1 + * (the number of times through the while - 1). */ - - Tcl_SetIntObj(Tcl_GetObjResult(interp), 1); + + if (!doinline) { + Tcl_SetIntObj(resultPtr, (all ? all-1 : 1)); + } return TCL_OK; } diff --git a/tests/regexp.test b/tests/regexp.test index 6bff015..e8836bd 100644 --- a/tests/regexp.test +++ b/tests/regexp.test @@ -11,7 +11,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: regexp.test,v 1.10 1999/09/21 04:20:45 hobbs Exp $ +# RCS: @(#) $Id: regexp.test,v 1.11 2000/02/02 22:32:13 hobbs Exp $ if {[lsearch [namespace children] ::tcltest] == -1} { package require tcltest @@ -190,7 +190,7 @@ test regexp-6.2 {regexp errors} { } {1 {wrong # args: should be "regexp ?switches? exp string ?matchVar? ?subMatchVar subMatchVar ...?"}} test regexp-6.3 {regexp errors} { list [catch {regexp -gorp a} msg] $msg -} {1 {bad switch "-gorp": must be -indices, -nocase, -about, -expanded, -line, -linestop, -lineanchor, -start, or --}} +} {1 {bad switch "-gorp": must be -all, -about, -indices, -inline, -expanded, -line, -linestop, -lineanchor, -nocase, -start, or --}} test regexp-6.4 {regexp errors} { list [catch {regexp a( b} msg] $msg } {1 {couldn't compile regular expression pattern: parentheses () not balanced}} @@ -371,8 +371,9 @@ test regexp-11.8 {regsub errors, -start bad int check} { # This test crashes on the Mac unless you increase the Stack Space to about 1 # Meg. This is probably bigger than most users want... - -test regexp-12.1 {macCrash} {Tcl_RegExpExec: large number of subexpressions} { +# 8.2.3 regexp reduced stack space requirements, but this should be +# tested again +test regexp-12.1 {Tcl_RegExpExec: large number of subexpressions} {macCrash} { list [regexp (.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.) abcdefghijklmnopqrstuvwxyz all a b c d e f g h i j k l m n o p q r s t u v w x y z] $all $a $b $c $d $e $f $g $h $i $j $k $l $m $n $o $p $q $r $s $t $u $v $w $x $y $z } {1 abcdefghijklmnopqrstuvwxyz a b c d e f g h i j k l m n o p q r s t u v w x y z} @@ -382,10 +383,7 @@ test regexp-13.1 {regsub of a very large string} { # is in use. set line {BEGIN_TABLE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; END_TABLE} - set filedata "" - for {set i 1} {$i<200} {incr i} { - append filedata $line - } + set filedata [string repeat $line 200] for {set i 1} {$i<10} {incr i} { regsub -all "BEGIN_TABLE " $filedata "" newfiledata } @@ -415,7 +413,7 @@ test regexp-14.2 {CompileRegexp: regexp cache, different flags} { # There is no exec on the Mac ... -test regexp-14.3 {unixOrPc} {CompileRegexp: regexp cache, empty regexp and empty cache} { +test regexp-14.3 {CompileRegexp: regexp cache, empty regexp and empty cache} {unixOrPc} { makeFile {puts [regexp {} foo]} junk.tcl exec $::tcltest::tcltest junk.tcl } 1 @@ -462,9 +460,49 @@ test regexp-16.4 {regsub -start, \A behavior} { lappend out [regsub -start 2 -all {\A(\w)} {abcde} {/\1} x] $x } {5 /a/b/c/d/e 3 ab/c/d/e} -set x 1 -set y 2 -regexp "$x$y" 123 +test regexp-17.1 {regexp -inline} { + regexp -inline b ababa +} {b} +test regexp-17.2 {regexp -inline} { + regexp -inline (b) ababa +} {b b} +test regexp-17.3 {regexp -inline -indices} { + regexp -inline -indices (b) ababa +} {{1 1} {1 1}} +test regexp-17.4 {regexp -inline} { + regexp -inline {\w(\d+)\w} " hello 23 there456def " +} {e456d 456} +test regexp-17.5 {regexp -inline no matches} { + regexp -inline {\w(\d+)\w} "" +} {} +test regexp-17.6 {regexp -inline no matches} { + regexp -inline hello goodbye +} {} +test regexp-17.7 {regexp -inline, no matchvars allowed} { + list [catch {regexp -inline b abc match} msg] $msg +} {1 {regexp match variables not allowed when using -inline}} + +test regexp-18.1 {regexp -all} { + regexp -all b bbbbb +} {5} +test regexp-18.2 {regexp -all} { + regexp -all b abababbabaaaaaaaaaab +} {6} +test regexp-18.3 {regexp -all -inline} { + regexp -all -inline b abababbabaaaaaaaaaab +} {b b b b b b} +test regexp-18.4 {regexp -all -inline} { + regexp -all -inline {\w(\w)} abcdefg +} {ab b cd d ef f} +test regexp-18.5 {regexp -all -inline} { + regexp -all -inline {\w(\w)$} abcdefg +} {fg g} +test regexp-18.6 {regexp -all -inline} { + regexp -all -inline {\d+} 10:20:30:40 +} {10 20 30 40} +test regexp-18.7 {regexp -all -inline} { + list [catch {regexp -all -inline b abc match} msg] $msg +} {1 {regexp match variables not allowed when using -inline}} # cleanup ::tcltest::cleanupTests |