From 9f6f5c519abd3e4640e5fed701205a348b0bf006 Mon Sep 17 00:00:00 2001 From: jpeek Date: Thu, 24 Jun 1999 21:15:13 +0000 Subject: Moved description of regular expression syntax from regexp.n into a new re_syntax.n page. Modified other pages' references to regexp(n). Added a few new "see also" entries pointing to re_syntax(n). --- doc/CrtInterp.3 | 4 +- doc/RegExp.3 | 5 +- doc/library.n | 5 +- doc/lsearch.n | 5 +- doc/re_syntax.n | 926 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ doc/regexp.n | 906 +----------------------------------------------------- doc/regsub.n | 4 +- doc/switch.n | 4 +- 8 files changed, 948 insertions(+), 911 deletions(-) create mode 100644 doc/re_syntax.n diff --git a/doc/CrtInterp.3 b/doc/CrtInterp.3 index 62055d9..360ff77 100644 --- a/doc/CrtInterp.3 +++ b/doc/CrtInterp.3 @@ -5,7 +5,7 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: CrtInterp.3,v 1.2 1998/09/14 18:39:47 stanton Exp $ +'\" RCS: @(#) $Id: CrtInterp.3,v 1.3 1999/06/24 21:15:13 jpeek Exp $ '\" .so man.macros .TH Tcl_CreateInterp 3 7.5 Tcl "Tcl Library Procedures" @@ -36,7 +36,7 @@ a token for it. The token is required in calls to most other Tcl procedures, such as \fBTcl_CreateCommand\fR, \fBTcl_Eval\fR, and \fBTcl_DeleteInterp\fR. Clients are only allowed to access a few of the fields of -Tcl_Interp structures; see the Tcl_Interp +Tcl_Interp structures; see the \fBTcl_Interp\fR and \fBTcl_CreateCommand\fR man pages for details. The new interpreter is initialized with no defined variables and only the built-in Tcl commands. To bind in additional commands, call diff --git a/doc/RegExp.3 b/doc/RegExp.3 index 9fefdaf..c8e8887 100644 --- a/doc/RegExp.3 +++ b/doc/RegExp.3 @@ -6,7 +6,7 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: RegExp.3,v 1.4 1999/06/17 19:32:21 stanton Exp $ +'\" RCS: @(#) $Id: RegExp.3,v 1.5 1999/06/24 21:15:13 jpeek Exp $ '\" .so man.macros .TH Tcl_RegExpMatch 3 8.1 Tcl "Tcl Library Procedures" @@ -340,6 +340,7 @@ found, this will be the same as the beginning of the current match. If no match was found, then it indicates the earliest point at which a match might occur if additional text is appended to the string. .VE 8.1 - +.SH "SEE ALSO" +re_syntax(n) .SH KEYWORDS match, pattern, regular expression, string, subexpression, Tcl_RegExpIndices, Tcl_RegExpInfo diff --git a/doc/library.n b/doc/library.n index eb20351..44093bb 100644 --- a/doc/library.n +++ b/doc/library.n @@ -5,7 +5,7 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: library.n,v 1.6 1999/04/16 00:46:35 stanton Exp $ +'\" RCS: @(#) $Id: library.n,v 1.7 1999/06/24 21:15:13 jpeek Exp $ .so man.macros .TH library n "8.0" Tcl "Tcl Built-In Commands" .BS @@ -284,5 +284,8 @@ It is used to detect errors where \fBunknown\fR recurses on itself infinitely. The variable is unset before \fBunknown\fR returns. +.SH "SEE ALSO" +re_syntax(n) + .SH KEYWORDS auto-exec, auto-load, library, unknown, word, whitespace diff --git a/doc/lsearch.n b/doc/lsearch.n index bcede70..39eddac 100644 --- a/doc/lsearch.n +++ b/doc/lsearch.n @@ -5,7 +5,7 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: lsearch.n,v 1.2 1998/09/14 18:39:53 stanton Exp $ +'\" RCS: @(#) $Id: lsearch.n,v 1.3 1999/06/24 21:15:13 jpeek Exp $ '\" .so man.macros .TH lsearch n 7.0 Tcl "Tcl Built-In Commands" @@ -37,7 +37,8 @@ element using the same rules as the \fBstring match\fR command. .TP \fB\-regexp\fR \fIPattern\fR is treated as a regular expression and matched against -each list element using the same rules as the \fBregexp\fR command. +each list element using the rules described in the \fBre_syntax\fR +reference page. .PP If \fImode\fR is omitted then it defaults to \fB\-glob\fR. diff --git a/doc/re_syntax.n b/doc/re_syntax.n new file mode 100644 index 0000000..63eb76c --- /dev/null +++ b/doc/re_syntax.n @@ -0,0 +1,926 @@ +'\" +'\" Copyright (c) 1998 Sun Microsystems, Inc. +'\" Copyright (c) 1999 Scriptics Corporation +'\" +'\" See the file "license.terms" for information on usage and redistribution +'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. +'\" +'\" RCS: @(#) $Id: re_syntax.n,v 1.1 1999/06/24 21:15:13 jpeek Exp $ +'\" +.so man.macros +.TH re_syntax n "8.1" Tcl "Tcl Built-In Commands" +.BS +.SH NAME +re_syntax \- Syntax of Tcl regular expressions. +.BE + +.SH DESCRIPTION +.PP +A \fIregular expression\fR describes strings of characters. +It's a pattern that matches certain strings and doesn't match others. + +.SH "DIFFERENT FLAVORS OF REs" +.VS 8.1 +Regular expressions (``RE''s), as defined by POSIX, come in two +flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs''). +EREs are roughly those of the traditional \fIegrep\fR, while BREs are +roughly those of the traditional \fIed\fR. This implementation adds +a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with +some significant extensions. +.PP +This manual page primarily describes AREs. BREs mostly exist for +backward compatibility in some old programs; they will be discussed at +the end. POSIX EREs are almost an exact subset of AREs. Features of +AREs that are not present in EREs will be indicated. + +.SH "REGULAR EXPRESSION SYNTAX" +.PP +Tcl regular expressions are implemented using the package written by +Henry Spencer, based on the 1003.2 spec and some (not quite all) of +the Perl5 extensions (thanks, Henry!). Much of the description of +regular expressions below is copied verbatim from his manual entry. +.PP +An ARE is one or more \fIbranches\fR, +separated by `\fB|\fR', +matching anything that matches any of the branches. +.PP +A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR, +concatenated. +It matches a match for the first, followed by a match for the second, etc; +an empty branch matches the empty string. +.PP +A quantified atom is an \fIatom\fR possibly followed +by a single \fIquantifier\fR. +Without a quantifier, it matches a match for the atom. +The quantifiers, +and what a so-quantified atom matches, are: +.RS 2 +.TP 6 +\fB*\fR +a sequence of 0 or more matches of the atom +.TP +\fB+\fR +a sequence of 1 or more matches of the atom +.TP +\fB?\fR +a sequence of 0 or 1 matches of the atom +.TP +\fB{\fIm\fB}\fR +a sequence of exactly \fIm\fR matches of the atom +.TP +\fB{\fIm\fB,}\fR +a sequence of \fIm\fR or more matches of the atom +.TP +\fB{\fIm\fB,\fIn\fB}\fR +a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom; +\fIm\fR may not exceed \fIn\fR +.TP +\fB*? +? ?? {\fIm\fB}? {\fIm\fB,}? {\fIm\fB,\fIn\fB}?\fR +\fInon-greedy\fR quantifiers, +which match the same possibilities, +but prefer the smallest number rather than the largest number +of matches (see MATCHING) +.RE +.PP +The forms using +\fB{\fR and \fB}\fR +are known as \fIbound\fRs. +The numbers +\fIm\fR and \fIn\fR are unsigned decimal integers +with permissible values from 0 to 255 inclusive. +.PP +An atom is one of: +.RS 2 +.TP 6 +\fB(\fIre\fB)\fR +(where \fIre\fR is any regular expression) +matches a match for +\fIre\fR, with the match noted for possible reporting +.TP +\fB(?:\fIre\fB)\fR +as previous, +but does no reporting +(a ``non-capturing'' set of parentheses) +.TP +\fB()\fR +matches an empty string, +noted for possible reporting +.TP +\fB(?:)\fR +matches an empty string, +without reporting +.TP +\fB[\fIchars\fB]\fR +a \fIbracket expression\fR, +matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail) +.TP + \fB.\fR +matches any single character +.TP +\fB\e\fIk\fR +(where \fIk\fR is a non-alphanumeric character) +matches that character taken as an ordinary character, +e.g. \e\e matches a backslash character +.TP +\fB\e\fIc\fR +where \fIc\fR is alphanumeric +(possibly followed by other characters), +an \fIescape\fR (AREs only), +see ESCAPES below +.TP +\fB{\fR +when followed by a character other than a digit, +matches the left-brace character `\fB{\fR'; +when followed by a digit, it is the beginning of a +\fIbound\fR (see above) +.TP +\fIx\fR +where \fIx\fR is +a single character with no other significance, matches that character. +.RE +.PP +A \fIconstraint\fR matches an empty string when specific conditions +are met. +A constraint may not be followed by a quantifier. +The simple constraints are as follows; some more constraints are +described later, under ESCAPES. +.RS 2 +.TP 8 +\fB^\fR +matches at the beginning of a line +.TP +\fB$\fR +matches at the end of a line +.TP +\fB(?=\fIre\fB)\fR +\fIpositive lookahead\fR (AREs only), matches at any point +where a substring matching \fIre\fR begins +.TP +\fB(?!\fIre\fB)\fR +\fInegative lookahead\fR (AREs only), matches at any point +where no substring matching \fIre\fR begins +.RE +.PP +The lookahead constraints may not contain back references (see later), +and all parentheses within them are considered non-capturing. +.PP +An RE may not end with `\fB\e\fR'. + +.SH "BRACKET EXPRESSIONS" +A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'. +It normally matches any single character from the list (but see below). +If the list begins with `\fB^\fR', +it matches any single character +(but see below) \fInot\fR from the rest of the list. +.PP +If two characters in the list are separated by `\fB\-\fR', +this is shorthand +for the full \fIrange\fR of characters between those two (inclusive) in the +collating sequence, +e.g. +\fB[0\-9]\fR +in ASCII matches any decimal digit. +Two ranges may not share an +endpoint, so e.g. +\fBa\-c\-e\fR +is illegal. +Ranges are very collating-sequence-dependent, +and portable programs should avoid relying on them. +.PP +To include a literal +\fB]\fR +or +\fB\-\fR +in the list, +the simplest method is to +enclose it in +\fB[.\fR and \fB.]\fR +to make it a collating element (see below). +Alternatively, +make it the first character +(following a possible `\fB^\fR'), +or (AREs only) precede it with `\fB\e\fR'. +Alternatively, for `\fB\-\fR', +make it the last character, +or the second endpoint of a range. +To use a literal +\fB\-\fR +as the first endpoint of a range, +make it a collating element +or (AREs only) precede it with `\fB\e\fR'. +With the exception of these, some combinations using +\fB[\fR +(see next +paragraphs), and escapes, +all other special characters lose their +special significance within a bracket expression. +.PP +Within a bracket expression, a collating element (a character, +a multi-character sequence that collates as if it were a single character, +or a collating-sequence name for either) +enclosed in +\fB[.\fR and \fB.]\fR +stands for the +sequence of characters of that collating element. +The sequence is a single element of the bracket expression's list. +A bracket expression in a locale which has +multi-character collating elements +can thus match more than one character. +Most insidiously, if +\fB^\fR +is used, +this can happen even if no multi-character collating +elements appear in the bracket expression! +If the collating sequence includes a +\fBch\fR +multi-character collating element, +then the RE +\fB[[.ch.]]*c\fR +matches the first five characters +of `\fBchchcc\fR', +and the RE +\fB[^c]b\fR +matches all of `\fBchb\fR'. +.PP +Within a bracket expression, a collating element enclosed in +\fB[=\fR +and +\fB=]\fR +is an equivalence class, standing for the sequences of characters +of all collating elements equivalent to that one, including itself. +(If there are no other equivalent collating elements, +the treatment is as if the enclosing delimiters were `\fB[.\fR'\& +and `\fB.]\fR'.) +For example, if +\fBo\fR +and +\fB\o'o^'\fR +are the members of an equivalence class, +then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR', +and `\fB[o\o'o^']\fR'\& +are all synonymous. +An equivalence class may not be an endpoint +of a range. +.PP +Within a bracket expression, the name of a \fIcharacter class\fR enclosed +in +\fB[:\fR +and +\fB:]\fR +stands for the list of all characters +(not all collating elements!) +belonging to that +class. +Standard character classes are (*** CHECK THESE! ***): +.PP +.RS +.ne 5 +.nf +.ta 3c +\fBalpha\fR A letter. +\fBupper\fR An upper-case letter. +\fBlower\fR A lower-case letter. +\fBdigit\fR A decimal digit. +\fBxdigit\fR A hexadecimal digit. +\fBalnum\fR An alphanumeric (letter or digit). +\fBspace\fR A character producing white space in displayed text. +\fBpunct\fR A punctuation character. +\fBprint\fR A printing character. +\fBgraph\fR A character with a visible representation. +\fBcntrl\fR A control character. +.fi +.RE +.PP +A locale may provide others. (*** NOT ANYMORE, TRUE? ***) +A character class may not be used as an endpoint of a range. +.PP +There are two special cases of bracket expressions: +the bracket expressions +\fB[[:<:]]\fR +and +\fB[[:>:]]\fR +are constraints, matching empty strings at +the beginning and end of a word respectively. +'\" note, discussion of escapes below references this definition of word +A word is defined as a sequence of +word characters +which is neither preceded nor followed by +word characters. +A word character is an +\fIalnum\fR +character (as defined by +\fIctype\fR(3)) +or an underscore +(\fB_\fR). +These special bracket expressions are deprecated; +users of AREs should use constraint escapes instead (see below). +.SH ESCAPES +Escapes (AREs only), which begin with a +\fB\e\fR +followed by an alphanumeric character, +come in several varieties: +character entry, class shorthands, constraint escapes, and back references. +A +\fB\e\fR +followed by an alphanumeric character but not constituting +a valid escape is illegal in AREs. +In EREs, there are no escapes: +outside a bracket expression, +a +\fB\e\fR +followed by an alphanumeric character merely stands for that +character as an ordinary character, +and inside a bracket expression, +\fB\e\fR +is an ordinary character. +(The latter is the one actual incompatibility between EREs and AREs.) +.PP +Character-entry escapes (AREs only) exist to make it easier to specify +non-printing and otherwise inconvenient characters in REs: +.RS 2 +.TP 5 +\fB\ea\fR +alert, aka bell, character, as in C +.TP +\fB\eb\fR +backspace, as in C +.TP +\fB\eB\fR +synonym for +\fB\e\fR +to help reduce backslash doubling in some +applications where there are multiple levels of backslash processing +.TP +\fB\ec\fIX\fR +(where X is any character) the character whose +low-order 5 bits are the same as those of +\fIX\fR, +and whose other bits are all zero +.TP +\fB\ee\fR +the character whose collating-sequence name +is `\fBESC\fR', +or failing that, the character with octal value 033 +.TP +\fB\ef\fR +formfeed, as in C +.TP +\fB\en\fR +newline, as in C +.TP +\fB\er\fR +carriage return, as in C +.TP +\fB\et\fR +horizontal tab, as in C +.TP +\fB\eu\fIwxyz\fR +(where +\fIwxyz\fR +is exactly four hexadecimal digits) +the Unicode character +\fBU+\fIwxyz\fR +in the local byte ordering +.TP +\fB\eU\fIstuvwxyz\fR +(where +\fIstuvwxyz\fR +is exactly eight hexadecimal digits) +reserved for a somewhat-hypothetical Unicode extension to 32 bits +.TP +\fB\ev\fR +vertical tab, as in C +are all available. +.TP +\fB\ex\fIhhh\fR +(where +\fIhhh\fR +is any sequence of hexadecimal digits) +the character whose hexadecimal value is +\fB0x\fIhhh\fR +(a single character no matter how many hexadecimal digits are used). +.TP +\fB\e0\fR +the character whose value is +\fB0\fR +.TP +\fB\e\fIxy\fR +(where +\fIxy\fR +is exactly two octal digits, +and is not a +\fIback reference\fR (see below)) +the character whose octal value is +\fB0\fIxy\fR +.TP +\fB\e\fIxyz\fR +(where +\fIxyz\fR +is exactly three octal digits, +and is not a +back reference (see below)) +the character whose octal value is +\fB0\fIxyz\fR +.RE +.PP +Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR', +and `\fBA\fR'-`\fBF\fR'. +Octal digits are `\fB0\fR'-`\fB7\fR'. +.PP +The character-entry escapes are always taken as ordinary characters. +For example, +\fB\e135\fR +is +\fB]\fR +in ASCII, +but +\fB\e135\fR +does not terminate a bracket expression. +Beware, however, that some applications (e.g., C compilers) interpret +such sequences themselves before the regular-expression package +gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'. +.PP +Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used +character classes: +.RS 2 +.TP 10 +\fB\ed\fR +\fB[[:digit:]]\fR +.TP +\fB\es\fR +\fB[[:space:]]\fR +.TP +\fB\ew\fR +\fB[[:alnum:]_]\fR +(note underscore) +.TP +\fB\eD\fR +\fB[^[:digit:]]\fR +.TP +\fB\eS\fR +\fB[^[:space:]]\fR +.TP +\fB\eW\fR +\fB[^[:alnum:]_]\fR +(note underscore) +.RE +.PP +Within bracket expressions, `\fB\ed\fR', `\fB\es\fR', +and `\fB\ew\fR'\& +lose their outer brackets, +and `\fB\eD\fR', `\fB\eS\fR', +and `\fB\eW\fR'\& +are illegal. +.PP +A constraint escape (AREs only) is a constraint, +matching the empty string if specific conditions are met, +written as an escape: +.RS 2 +.TP 6 +\fB\eA\fR +matches only at the beginning of the string +(see MATCHING, below, for how this differs from `\fB^\fR') +.TP +\fB\em\fR +matches only at the beginning of a word +.TP +\fB\eM\fR +matches only at the end of a word +.TP +\fB\ey\fR +matches only at the beginning or end of a word +.TP +\fB\eY\fR +matches only at a point which is not the beginning or end of a word +.TP +\fB\eZ\fR +matches only at the end of the string +(see MATCHING, below, for how this differs from `\fB$\fR') +.TP +\fB\e\fIm\fR +(where +\fIm\fR +is a nonzero digit) a \fIback reference\fR, see below +.TP +\fB\e\fImnn\fR +(where +\fIm\fR +is a nonzero digit, and +\fInn\fR +is some more digits, +and the decimal value +\fImnn\fR +is not greater than the number of closing capturing parentheses seen so far) +a \fIback reference\fR, see below +.RE +.PP +A word is defined as in the specification of +\fB[[:<:]]\fR +and +\fB[[:>:]]\fR +above. +Constraint escapes are illegal within bracket expressions. +.PP +A back reference (AREs only) matches the same string matched by the parenthesized +subexpression specified by the number, +so that (e.g.) +\fB([bc])\e1\fR +matches +\fBbb\fR +or +\fBcc\fR +but not `\fBbc\fR'. +The subexpression must entirely precede the back reference in the RE. +Subexpressions are numbered in the order of their leading parentheses. +Non-capturing parentheses do not define subexpressions. +.PP +There is an inherent historical ambiguity between octal character-entry +escapes and back references, which is resolved by heuristics, +as hinted at above. +A leading zero always indicates an octal escape. +A single non-zero digit, not followed by another digit, +is always taken as a back reference. +A multi-digit sequence not starting with a zero is taken as a back +reference if it comes after a suitable subexpression +(i.e. the number is in the legal range for a back reference), +and otherwise is taken as octal. +.SH "METASYNTAX" +In addition to the main syntax described above, there are some special +forms and miscellaneous syntactic facilities available. +.PP +Normally the flavor of RE being used is specified by +application-dependent means. +However, this can be overridden by a \fIdirector\fR. +If an RE of any flavor begins with `\fB***:\fR', +the rest of the RE is an ARE. +If an RE of any flavor begins with `\fB***=\fR', +the rest of the RE is taken to be a literal string, +with all characters considered ordinary characters. +.PP +An ARE may begin with \fIembedded options\fR: +a sequence +\fB(?\fIxyz\fB)\fR +(where +\fIxyz\fR +is one or more alphabetic characters) +specifies options affecting the rest of the RE. +These supplement, and can override, +any options specified by the application. +The available option letters are: +.RS 2 +.TP 3 +\fBb\fR +rest of RE is a BRE +.TP 3 +\fBc\fR +case-sensitive matching (usual default) +.TP 3 +\fBe\fR +rest of RE is an ERE +.TP 3 +\fBi\fR +case-insensitive matching (see MATCHING, below) +.TP 3 +\fBm\fR +historical synonym for +\fBn\fR +.TP 3 +\fBn\fR +newline-sensitive matching (see MATCHING, below) +.TP 3 +\fBp\fR +partial newline-sensitive matching (see MATCHING, below) +.TP 3 +\fBq\fR +rest of RE is a literal (``quoted'') string, all ordinary characters +.TP 3 +\fBs\fR +non-newline-sensitive matching (usual default) +.TP 3 +\fBt\fR +tight syntax (usual default; see below) +.TP 3 +\fBw\fR +inverse partial newline-sensitive (``weird'') matching (see MATCHING, below) +.TP 3 +\fBx\fR +expanded syntax (see below) +.RE +.PP +Embedded options take effect at the +\fB)\fR +terminating the sequence. +They are available only at the start of an ARE, +and may not be used later within it. +.PP +In addition to the usual (\fItight\fR) RE syntax, in which all characters are +significant, there is an \fIexpanded\fR syntax, +available in all flavors of RE +with the \fB-expanded\fR switch, or in AREs with the embedded x option. +In the expanded syntax, +white-space characters are ignored +and all characters between a +\fB#\fR +and the following newline (or the end of the RE) are ignored, +permitting paragraphing and commenting a complex RE. +There are three exceptions to that basic rule: +.RS 2 +.PP +a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained +.PP +white space or `\fB#\fR' within a bracket expression is retained +.PP +white space and comments are illegal within multi-character symbols +like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR' +.RE +.PP +Expanded-syntax +white-space characters are blank, tab, newline, etc. (any character +defined as \fIspace\fR by +\fIctype\fR(3)). +Exactly how a multi-line expanded-syntax RE +can be entered interactively by a user, +if at all, is application-specific; +expanded syntax is primarily a scripting facility. +.PP +Finally, in an ARE, +outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR' +(where +\fIttt\fR +is any text not containing a `\fB)\fR') +is a comment, +completely ignored. +Again, this is not allowed between the characters of +multi-character symbols like `\fB(?:\fR'. +Such comments are more a historical artifact than a useful facility, +and their use is deprecated; +use the expanded syntax instead. +.PP +\fINone\fR of these metasyntax extensions is available if the application +(or an initial +\fB***=\fR +director) +has specified that the user's input be treated as a literal string +rather than as an RE. +.SH MATCHING +In the event that an RE could match more than one substring of a given +string, +the RE matches the one starting earliest in the string. +If the RE could match more than one substring starting at that point, +its choice is determined by its \fIpreference\fR: +either the longest substring, or the shortest. +.PP +Most atoms, and all constraints, have no preference. +A parenthesized RE has the same preference (possibly none) as the RE. +A quantified atom with quantifier +\fB{\fIm\fB}\fR +or +\fB{\fIm\fB}?\fR +has the same preference (possibly none) as the atom itself. +A quantified atom with other normal quantifiers (including +\fB{\fIm\fB,\fIn\fB}\fR +with +\fIm\fR +equal to +\fIn\fR) +prefers longest match. +A quantified atom with other non-greedy quantifiers (including +\fB{\fIm\fB,\fIn\fB}?\fR +with +\fIm\fR +equal to +\fIn\fR) +prefers shortest match. +A branch has the same preference as the first quantified atom in it +which has a preference. +An RE consisting of two or more branches connected by the +\fB|\fR +operator prefers longest match. +.PP +Subject to the constraints imposed by the rules for matching the whole RE, +subexpressions also match the longest or shortest possible substrings, +based on their preferences, +with subexpressions starting earlier in the RE taking priority over +ones starting later. +Note that outer subexpressions thus take priority over +their component subexpressions. +.PP +Note that the quantifiers +\fB{1,1}\fR +and +\fB{1,1}?\fR +can be used to force longest and shortest preference, respectively, +on a subexpression or a whole RE. +.PP +Match lengths are measured in characters, not collating elements. +An empty string is considered longer than no match at all. +For example, +\fBbb*\fR +matches the three middle characters of `\fBabbbc\fR', +\fB(week|wee)(night|knights)\fR +matches all ten characters of `\fBweeknights\fR', +when +\fB(.*).*\fR +is matched against +\fBabc\fR +the parenthesized subexpression +matches all three characters, and +when +\fB(a*)*\fR +is matched against +\fBbc\fR +both the whole RE and the parenthesized +subexpression match an empty string. +.PP +If case-independent matching is specified, +the effect is much as if all case distinctions had vanished from the +alphabet. +When an alphabetic that exists in multiple cases appears as an +ordinary character outside a bracket expression, it is effectively +transformed into a bracket expression containing both cases, +so that +\fBx\fR +becomes `\fB[xX]\fR'. +When it appears inside a bracket expression, all case counterparts +of it are added to the bracket expression, so that +\fB[x]\fR +becomes +\fB[xX]\fR +and +\fB[^x]\fR +becomes `\fB[^xX]\fR'. +.PP +If newline-sensitive matching is specified, \fB.\fR +and bracket expressions using +\fB^\fR +will never match the newline character +(so that matches will never cross newlines unless the RE +explicitly arranges it) +and +\fB^\fR +and +\fB$\fR +will match the empty string after and before a newline +respectively, in addition to matching at beginning and end of string +respectively. +ARE +\fB\eA\fR +and +\fB\eZ\fR +continue to match beginning or end of string \fIonly\fR. +.PP +If partial newline-sensitive matching is specified, +this affects \fB.\fR +and bracket expressions +as with newline-sensitive matching, but not +\fB^\fR +and `\fB$\fR'. +.PP +If inverse partial newline-sensitive matching is specified, +this affects +\fB^\fR +and +\fB$\fR +as with +newline-sensitive matching, +but not \fB.\fR +and bracket expressions. +This isn't very useful but is provided for symmetry. +.SH "LIMITS AND COMPATIBILITY" +No particular limit is imposed on the length of REs. +Programs intended to be highly portable should not employ REs longer +than 256 bytes, +as a POSIX-compliant implementation can refuse to accept such REs. +.PP +The only feature of AREs that is actually incompatible with +POSIX EREs is that +\fB\e\fR +does not lose its special +significance inside bracket expressions. +All other ARE features use syntax which is illegal or has +undefined or unspecified effects in POSIX EREs; +the +\fB***\fR +syntax of directors likewise is outside the POSIX +syntax for both BREs and EREs. +.PP +Many of the ARE extensions are borrowed from Perl, but some have +been changed to clean them up, and a few Perl extensions are not present. +Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR', +the lack of special treatment for a trailing newline, +the addition of complemented bracket expressions to the things +affected by newline-sensitive matching, +the restrictions on parentheses and back references in lookahead constraints, +and the longest/shortest-match (rather than first-match) matching semantics. +.PP +The matching rules for REs containing both normal and non-greedy quantifiers +have changed since early beta-test versions of this package. +(The new rules are much simpler and cleaner, +but don't work as hard at guessing the user's real intentions.) +.PP +Henry Spencer's original 1986 \fIregexp\fR package, +still in widespread use (e.g., in pre-8.1 releases of Tcl), +implemented an early version of today's EREs. +There are four incompatibilities between \fIregexp\fR's near-EREs +(`RREs' for short) and AREs. +In roughly increasing order of significance: +.PP +.RS +In AREs, +\fB\e\fR +followed by an alphanumeric character is either an +escape or an error, +while in RREs, it was just another way of writing the +alphanumeric. +This should not be a problem because there was no reason to write +such a sequence in RREs. +.PP +\fB{\fR +followed by a digit in an ARE is the beginning of a bound, +while in RREs, +\fB{\fR +was always an ordinary character. +Such sequences should be rare, +and will often result in an error because following characters +will not look like a valid bound. +.PP +In AREs, +\fB\e\fR +remains a special character within `\fB[\|]\fR', +so a literal +\fB\e\fR +within +\fB[\|]\fR +must be written `\fB\e\e\fR'. +\fB\e\e\fR +also gives a literal +\fB\e\fR +within +\fB[\|]\fR +in RREs, +but only truly paranoid programmers routinely doubled the backslash. +.PP +AREs report the longest/shortest match for the RE, +rather than the first found in a specified search order. +This may affect some RREs which were written in the expectation that +the first match would be reported. +(The careful crafting of RREs to optimize the search order for fast +matching is obsolete (AREs examine all possible matches +in parallel, and their performance is largely insensitive to their +complexity) but cases where the search order was exploited to deliberately +find a match which was \fInot\fR the longest/shortest will need rewriting.) +.RE + +.SH "BASIC REGULAR EXPRESSIONS" +BREs differ from EREs in several respects. `\fB|\fR', `\fB+\fR', +and +\fB?\fR +are ordinary characters and there is no equivalent +for their functionality. +The delimiters for bounds are +\fB\e{\fR +and `\fB\e}\fR', +with +\fB{\fR +and +\fB}\fR +by themselves ordinary characters. +The parentheses for nested subexpressions are +\fB\e(\fR +and `\fB\e)\fR', +with +\fB(\fR +and +\fB)\fR +by themselves ordinary characters. +\fB^\fR +is an ordinary character except at the beginning of the +RE or the beginning of a parenthesized subexpression, +\fB$\fR +is an ordinary character except at the end of the +RE or the end of a parenthesized subexpression, +and +\fB*\fR +is an ordinary character if it appears at the beginning of the +RE or the beginning of a parenthesized subexpression +(after a possible leading `\fB^\fR'). +Finally, +single-digit back references are available, +and +\fB\e<\fR +and +\fB\e>\fR +are synonyms for +\fB[[:<:]]\fR +and +\fB[[:>:]]\fR +respectively; +no other escapes are available. + +.VE 8.1 + +.SH "SEE ALSO" +RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n) + +.SH KEYWORDS +match, regular expression, string diff --git a/doc/regexp.n b/doc/regexp.n index 2f6c4b6..03010e3 100644 --- a/doc/regexp.n +++ b/doc/regexp.n @@ -4,7 +4,7 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: regexp.n,v 1.5 1999/05/06 19:14:42 stanton Exp $ +'\" RCS: @(#) $Id: regexp.n,v 1.6 1999/06/24 21:15:13 jpeek Exp $ '\" .so man.macros .TH regexp n 8.1 Tcl "Tcl Built-In Commands" @@ -21,6 +21,8 @@ regexp \- Match a regular expression against a string .PP Determines whether the regular expression \fIexp\fR matches part or all of \fIstring\fR and returns 1 if it does, 0 if it doesn't. +(Regular expression matching is described in the \fBre_syntax\fR +reference page.) .LP If additional arguments are specified after \fIstring\fR then they are treated as the names of variables in which to return @@ -93,906 +95,8 @@ portion of the expression that wasn't matched), then the corresponding \fIsubMatchVar\fR will be set to ``\fB\-1 \-1\fR'' if \fB\-indices\fR has been specified or to an empty string otherwise. -.SH "DIFFERENT FLAVORS OF REs" -.VS 8.1 -Regular expressions (``RE''s), as defined by POSIX, come in two -flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs''). -EREs are roughly those of the traditional \fIegrep\fR, while BREs are -roughly those of the traditional \fIed\fR . This implementation adds -a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with -some significant extensions. -.PP -This manual page primarily describes AREs. BREs mostly exist for -backward compatibility in some old programs; they will be discussed at -the end. POSIX EREs are almost an exact subset of AREs. Features of -AREs that are not present in EREs will be indicated. - -.SH "REGULAR EXPRESSION SYNTAX" -.PP -Tcl regular expressions are implemented using the package written by -Henry Spencer, based on the 1003.2 spec and some (not quite all) of -the Perl5 extensions (thanks, Henry!). Much of the description of -regular expressions below is copied verbatim from his manual entry. -.PP -An ARE is one or more \fIbranches\fR, -separated by `\fB|\fR', -matching anything that matches any of the branches. -.PP -A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR, -concatenated. -It matches a match for the first, followed by a match for the second, etc; -an empty branch matches the empty string. -.PP -A quantified atom is an \fIatom\fR possibly followed -by a single \fIquantifier\fR. -Without a quantifier, it matches a match for the atom. -The quantifiers, -and what a so-quantified atom matches, are: -.RS 2 -.TP 6 -\fB*\fR -a sequence of 0 or more matches of the atom -.TP -\fB+\fR -a sequence of 1 or more matches of the atom -.TP -\fB?\fR -a sequence of 0 or 1 matches of the atom -.TP -\fB{\fIm\fB}\fR -a sequence of exactly \fIm\fR matches of the atom -.TP -\fB{\fIm\fB,}\fR -a sequence of \fIm\fR or more matches of the atom -.TP -\fB{\fIm\fB,\fIn\fB}\fR -a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom; -\fIm\fR may not exceed \fIn\fR -.TP -\fB*? +? ?? {\fIm\fB}? {\fIm\fB,}? {\fIm\fB,\fIn\fB}?\fR -\fInon-greedy\fR quantifiers, -which match the same possibilities, -but prefer the smallest number rather than the largest number -of matches (see MATCHING) -.RE -.PP -The forms using -\fB{\fR and \fB}\fR -are known as \fIbound\fRs. -The numbers -\fIm\fR and \fIn\fR are unsigned decimal integers -with permissible values from 0 to 255 inclusive. -.PP -An atom is one of: -.RS 2 -.TP 6 -\fB(\fIre\fB)\fR -(where \fIre\fR is any regular expression) -matches a match for -\fIre\fR, with the match noted for possible reporting -.TP -\fB(?:\fIre\fB)\fR -as previous, -but does no reporting -(a ``non-capturing'' set of parentheses) -.TP -\fB()\fR -matches an empty string, -noted for possible reporting -.TP -\fB(?:)\fR -matches an empty string, -without reporting -.TP -\fB[\fIchars\fB]\fR -a \fIbracket expression\fR, -matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail) -.TP - \fB.\fR -matches any single character -.TP -\fB\e\fIk\fR -(where \fIk\fR is a non-alphanumeric character) -matches that character taken as an ordinary character, -e.g. \e\e matches a backslash character -.TP -\fB\e\fIc\fR -where \fIc\fR is alphanumeric -(possibly followed by other characters), -an \fIescape\fR (AREs only), -see ESCAPES below -.TP -\fB{\fR -when followed by a character other than a digit, -matches the left-brace character `\fB{\fR'; -when followed by a digit, it is the beginning of a -\fIbound\fR (see above) -.TP -\fIx\fR -where \fIx\fR is -a single character with no other significance, matches that character. -.RE -.PP -A \fIconstraint\fR matches an empty string when specific conditions -are met. -A constraint may not be followed by a quantifier. -The simple constraints are as follows; some more constraints are -described later, under ESCAPES. -.RS 2 -.TP 8 -\fB^\fR -matches at the beginning of a line -.TP -\fB$\fR -matches at the end of a line -.TP -\fB(?=\fIre\fB)\fR -\fIpositive lookahead\fR (AREs only), matches at any point -where a substring matching \fIre\fR begins -.TP -\fB(?!\fIre\fB)\fR -\fInegative lookahead\fR (AREs only), matches at any point -where no substring matching \fIre\fR begins -.RE -.PP -The lookahead constraints may not contain back references (see later), -and all parentheses within them are considered non-capturing. -.PP -An RE may not end with `\fB\e\fR'. - -.SH "BRACKET EXPRESSIONS" -A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'. -It normally matches any single character from the list (but see below). -If the list begins with `\fB^\fR', -it matches any single character -(but see below) \fInot\fR from the rest of the list. -.PP -If two characters in the list are separated by `\fB\-\fR', -this is shorthand -for the full \fIrange\fR of characters between those two (inclusive) in the -collating sequence, -e.g. -\fB[0\-9]\fR -in ASCII matches any decimal digit. -Two ranges may not share an -endpoint, so e.g. -\fBa\-c\-e\fR -is illegal. -Ranges are very collating-sequence-dependent, -and portable programs should avoid relying on them. -.PP -To include a literal -\fB]\fR -or -\fB\-\fR -in the list, -the simplest method is to -enclose it in -\fB[.\fR -and -\fB.]\fR -to make it a collating element (see below). -Alternatively, -make it the first character -(following a possible `\fB^\fR'), -or (AREs only) precede it with `\fB\e\fR'. -Alternatively, for `\fB\-\fR', -make it the last character, -or the second endpoint of a range. -To use a literal -\fB\-\fR -as the first endpoint of a range, -make it a collating element -or (AREs only) precede it with `\fB\e\fR'. -With the exception of these, some combinations using -\fB[\fR -(see next -paragraphs), and escapes, -all other special characters lose their -special significance within a bracket expression. -.PP -Within a bracket expression, a collating element (a character, -a multi-character sequence that collates as if it were a single character, -or a collating-sequence name for either) -enclosed in -\fB[.\fR -and -\fB.]\fR -stands for the -sequence of characters of that collating element. -The sequence is a single element of the bracket expression's list. -A bracket expression in a locale which has -multi-character collating elements -can thus match more than one character. -Most insidiously, if -\fB^\fR -is used, -this can happen even if no multi-character collating -elements appear in the bracket expression! -If the collating sequence includes a -\fBch\fR -multi-character collating element, -then the RE -\fB[[.ch.]]*c\fR -matches the first five characters -of `\fBchchcc\fR', -and the RE -\fB[^c]b\fR -matches all of `\fBchb\fR'. -.PP -Within a bracket expression, a collating element enclosed in -\fB[=\fR -and -\fB=]\fR -is an equivalence class, standing for the sequences of characters -of all collating elements equivalent to that one, including itself. -(If there are no other equivalent collating elements, -the treatment is as if the enclosing delimiters were `\fB[.\fR'\& -and `\fB.]\fR'.) -For example, if -\fBo\fR -and -\fB\o'o^'\fR -are the members of an equivalence class, -then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR', -and `\fB[o\o'o^']\fR'\& -are all synonymous. -An equivalence class may not be an endpoint -of a range. -.PP -Within a bracket expression, the name of a \fIcharacter class\fR enclosed -in -\fB[:\fR -and -\fB:]\fR -stands for the list of all characters -(not all collating elements!) -belonging to that -class. -Standard character class names are: -.PP -.RS -.ne 5 -.nf -.ta 3c 6c 9c -\fBalnum digit punct -alpha graph space -blank lower upper -cntrl print xdigit\fR -.fi -.RE -.PP -These stand for the character classes defined in -\fIctype\fR(3). -A locale may provide others. -A character class may not be used as an endpoint of a range. -.PP -There are two special cases of bracket expressions: -the bracket expressions -\fB[[:<:]]\fR -and -\fB[[:>:]]\fR -are constraints, matching empty strings at -the beginning and end of a word respectively. -'\" note, discussion of escapes below references this definition of word -A word is defined as a sequence of -word characters -which is neither preceded nor followed by -word characters. -A word character is an -\fIalnum\fR -character (as defined by -\fIctype\fR(3)) -or an underscore -(\fB_\fR). -These special bracket expressions are deprecated; -users of AREs should use constraint escapes instead (see below). -.SH ESCAPES -Escapes (AREs only), which begin with a -\fB\e\fR -followed by an alphanumeric character, -come in several varieties: -character entry, class shorthands, constraint escapes, and back references. -A -\fB\e\fR -followed by an alphanumeric character but not constituting -a valid escape is illegal in AREs. -In EREs, there are no escapes: -outside a bracket expression, -a -\fB\e\fR -followed by an alphanumeric character merely stands for that -character as an ordinary character, -and inside a bracket expression, -\fB\e\fR -is an ordinary character. -(The latter is the one actual incompatibility between EREs and AREs.) -.PP -Character-entry escapes (AREs only) exist to make it easier to specify -non-printing and otherwise inconvenient characters in REs: -.RS 2 -.TP 5 -\fB\ea\fR -alert, aka bell, character, as in C -.TP -\fB\eb\fR -backspace, as in C -.TP -\fB\eB\fR -synonym for -\fB\e\fR -to help reduce backslash doubling in some -applications where there are multiple levels of backslash processing -.TP -\fB\ec\fIX\fR -(where X is any character) the character whose -low-order 5 bits are the same as those of -\fIX\fR, -and whose other bits are all zero -.TP -\fB\ee\fR -the character whose collating-sequence name -is `\fBESC\fR', -or failing that, the character with octal value 033 -.TP -\fB\ef\fR -formfeed, as in C -.TP -\fB\en\fR -newline, as in C -.TP -\fB\er\fR -carriage return, as in C -.TP -\fB\et\fR -horizontal tab, as in C -.TP -\fB\eu\fIwxyz\fR -(where -\fIwxyz\fR -is exactly four hexadecimal digits) -the Unicode character -\fBU+\fIwxyz\fR -in the local byte ordering -.TP -\fB\eU\fIstuvwxyz\fR -(where -\fIstuvwxyz\fR -is exactly eight hexadecimal digits) -reserved for a somewhat-hypothetical Unicode extension to 32 bits -.TP -\fB\ev\fR -vertical tab, as in C -are all available. -.TP -\fB\ex\fIhhh\fR -(where -\fIhhh\fR -is any sequence of hexadecimal digits) -the character whose hexadecimal value is -\fB0x\fIhhh\fR -(a single character no matter how many hexadecimal digits are used). -.TP -\fB\e0\fR -the character whose value is -\fB0\fR -.TP -\fB\e\fIxy\fR -(where -\fIxy\fR -is exactly two octal digits, -and is not a -\fIback reference\fR (see below)) -the character whose octal value is -\fB0\fIxy\fR -.TP -\fB\e\fIxyz\fR -(where -\fIxyz\fR -is exactly three octal digits, -and is not a -back reference (see below)) -the character whose octal value is -\fB0\fIxyz\fR -.RE -.PP -Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR', -and `\fBA\fR'-`\fBF\fR'. -Octal digits are `\fB0\fR'-`\fB7\fR'. -.PP -The character-entry escapes are always taken as ordinary characters. -For example, -\fB\e135\fR -is -\fB]\fR -in ASCII, -but -\fB\e135\fR -does not terminate a bracket expression. -Beware, however, that some applications (e.g., C compilers) interpret -such sequences themselves before the regular-expression package -gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'. -.PP -Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used -character classes: -.RS 2 -.TP 10 -\fB\ed\fR -\fB[[:digit:]]\fR -.TP -\fB\es\fR -\fB[[:space:]]\fR -.TP -\fB\ew\fR -\fB[[:alnum:]_]\fR -(note underscore) -.TP -\fB\eD\fR -\fB[^[:digit:]]\fR -.TP -\fB\eS\fR -\fB[^[:space:]]\fR -.TP -\fB\eW\fR -\fB[^[:alnum:]_]\fR -(note underscore) -.RE -.PP -Within bracket expressions, `\fB\ed\fR', `\fB\es\fR', -and `\fB\ew\fR'\& -lose their outer brackets, -and `\fB\eD\fR', `\fB\eS\fR', -and `\fB\eW\fR'\& -are illegal. -.PP -A constraint escape (AREs only) is a constraint, -matching the empty string if specific conditions are met, -written as an escape: -.RS 2 -.TP 6 -\fB\eA\fR -matches only at the beginning of the string -(see MATCHING, below, for how this differs from `\fB^\fR') -.TP -\fB\em\fR -matches only at the beginning of a word -.TP -\fB\eM\fR -matches only at the end of a word -.TP -\fB\ey\fR -matches only at the beginning or end of a word -.TP -\fB\eY\fR -matches only at a point which is not the beginning or end of a word -.TP -\fB\eZ\fR -matches only at the end of the string -(see MATCHING, below, for how this differs from `\fB$\fR') -.TP -\fB\e\fIm\fR -(where -\fIm\fR -is a nonzero digit) a \fIback reference\fR, see below -.TP -\fB\e\fImnn\fR -(where -\fIm\fR -is a nonzero digit, and -\fInn\fR -is some more digits, -and the decimal value -\fImnn\fR -is not greater than the number of closing capturing parentheses seen so far) -a \fIback reference\fR, see below -.RE -.PP -A word is defined as in the specification of -\fB[[:<:]]\fR -and -\fB[[:>:]]\fR -above. -Constraint escapes are illegal within bracket expressions. -.PP -A back reference (AREs only) matches the same string matched by the parenthesized -subexpression specified by the number, -so that (e.g.) -\fB([bc])\e1\fR -matches -\fBbb\fR -or -\fBcc\fR -but not `\fBbc\fR'. -The subexpression must entirely precede the back reference in the RE. -Subexpressions are numbered in the order of their leading parentheses. -Non-capturing parentheses do not define subexpressions. -.PP -There is an inherent historical ambiguity between octal character-entry -escapes and back references, which is resolved by heuristics, -as hinted at above. -A leading zero always indicates an octal escape. -A single non-zero digit, not followed by another digit, -is always taken as a back reference. -A multi-digit sequence not starting with a zero is taken as a back -reference if it comes after a suitable subexpression -(i.e. the number is in the legal range for a back reference), -and otherwise is taken as octal. -.SH "METASYNTAX" -In addition to the main syntax described above, there are some special -forms and miscellaneous syntactic facilities available. -.PP -Normally the flavor of RE being used is specified by -application-dependent means. -However, this can be overridden by a \fIdirector\fR. -If an RE of any flavor begins with `\fB***:\fR', -the rest of the RE is an ARE. -If an RE of any flavor begins with `\fB***=\fR', -the rest of the RE is taken to be a literal string, -with all characters considered ordinary characters. -.PP -An ARE may begin with \fIembedded options\fR: -a sequence -\fB(?\fIxyz\fB)\fR -(where -\fIxyz\fR -is one or more alphabetic characters) -specifies options affecting the rest of the RE. -These supplement, and can override, -any options specified by the application. -The available option letters are: -.RS 2 -.TP 3 -\fBb\fR -rest of RE is a BRE -.TP 3 -\fBc\fR -case-sensitive matching (usual default) -.TP 3 -\fBe\fR -rest of RE is an ERE -.TP 3 -\fBi\fR -case-insensitive matching (see MATCHING, below) -.TP 3 -\fBm\fR -historical synonym for -\fBn\fR -.TP 3 -\fBn\fR -newline-sensitive matching (see MATCHING, below) -.TP 3 -\fBp\fR -partial newline-sensitive matching (see MATCHING, below) -.TP 3 -\fBq\fR -rest of RE is a literal (``quoted'') string, all ordinary characters -.TP 3 -\fBs\fR -non-newline-sensitive matching (usual default) -.TP 3 -\fBt\fR -tight syntax (usual default; see below) -.TP 3 -\fBw\fR -inverse partial newline-sensitive (``weird'') matching (see MATCHING, below) -.TP 3 -\fBx\fR -expanded syntax (see below) -.RE -.PP -Embedded options take effect at the -\fB)\fR -terminating the sequence. -They are available only at the start of an ARE, -and may not be used later within it. -.PP -In addition to the usual (\fItight\fR) RE syntax, in which all characters are -significant, there is an \fIexpanded\fR syntax, -available in all flavors of RE -with the \fB-expanded\fR switch, or in AREs with the embedded x option. -In the expanded syntax, -white-space characters are ignored -and all characters between a -\fB#\fR -and the following newline (or the end of the RE) are ignored, -permitting paragraphing and commenting a complex RE. -There are three exceptions to that basic rule: -.RS 2 -.PP -a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained -.PP -white space or `\fB#\fR' within a bracket expression is retained -.PP -white space and comments are illegal within multi-character symbols -like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR' -.RE -.PP -Expanded-syntax -white-space characters are blank, tab, newline, etc. (any character -defined as \fIspace\fR by -\fIctype\fR(3)). -Exactly how a multi-line expanded-syntax RE -can be entered interactively by a user, -if at all, is application-specific; -expanded syntax is primarily a scripting facility. -.PP -Finally, in an ARE, -outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR' -(where -\fIttt\fR -is any text not containing a `\fB)\fR') -is a comment, -completely ignored. -Again, this is not allowed between the characters of -multi-character symbols like `\fB(?:\fR'. -Such comments are more a historical artifact than a useful facility, -and their use is deprecated; -use the expanded syntax instead. -.PP -\fINone\fR of these metasyntax extensions is available if the application -(or an initial -\fB***=\fR -director) -has specified that the user's input be treated as a literal string -rather than as an RE. -.SH MATCHING -In the event that an RE could match more than one substring of a given -string, -the RE matches the one starting earliest in the string. -If the RE could match more than one substring starting at that point, -its choice is determined by its \fIpreference\fR: -either the longest substring, or the shortest. -.PP -Most atoms, and all constraints, have no preference. -A parenthesized RE has the same preference (possibly none) as the RE. -A quantified atom with quantifier -\fB{\fIm\fB}\fR -or -\fB{\fIm\fB}?\fR -has the same preference (possibly none) as the atom itself. -A quantified atom with other normal quantifiers (including -\fB{\fIm\fB,\fIn\fB}\fR -with -\fIm\fR -equal to -\fIn\fR) -prefers longest match. -A quantified atom with other non-greedy quantifiers (including -\fB{\fIm\fB,\fIn\fB}?\fR -with -\fIm\fR -equal to -\fIn\fR) -prefers shortest match. -A branch has the same preference as the first quantified atom in it -which has a preference. -An RE consisting of two or more branches connected by the -\fB|\fR -operator prefers longest match. -.PP -Subject to the constraints imposed by the rules for matching the whole RE, -subexpressions also match the longest or shortest possible substrings, -based on their preferences, -with subexpressions starting earlier in the RE taking priority over -ones starting later. -Note that outer subexpressions thus take priority over -their component subexpressions. -.PP -Note that the quantifiers -\fB{1,1}\fR -and -\fB{1,1}?\fR -can be used to force longest and shortest preference, respectively, -on a subexpression or a whole RE. -.PP -Match lengths are measured in characters, not collating elements. -An empty string is considered longer than no match at all. -For example, -\fBbb*\fR -matches the three middle characters of `\fBabbbc\fR', -\fB(week|wee)(night|knights)\fR -matches all ten characters of `\fBweeknights\fR', -when -\fB(.*).*\fR -is matched against -\fBabc\fR -the parenthesized subexpression -matches all three characters, and -when -\fB(a*)*\fR -is matched against -\fBbc\fR -both the whole RE and the parenthesized -subexpression match an empty string. -.PP -If case-independent matching is specified, -the effect is much as if all case distinctions had vanished from the -alphabet. -When an alphabetic that exists in multiple cases appears as an -ordinary character outside a bracket expression, it is effectively -transformed into a bracket expression containing both cases, -so that -\fBx\fR -becomes `\fB[xX]\fR'. -When it appears inside a bracket expression, all case counterparts -of it are added to the bracket expression, so that -\fB[x]\fR -becomes -\fB[xX]\fR -and -\fB[^x]\fR -becomes `\fB[^xX]\fR'. -.PP -If newline-sensitive matching is specified, -\fB.\fR -and bracket expressions using -\fB^\fR -will never match the newline character -(so that matches will never cross newlines unless the RE -explicitly arranges it) -and -\fB^\fR -and -\fB$\fR -will match the empty string after and before a newline -respectively, in addition to matching at beginning and end of string -respectively. -ARE -\fB\eA\fR -and -\fB\eZ\fR -continue to match beginning or end of string \fIonly\fR. -.PP -If partial newline-sensitive matching is specified, -this affects -\fB.\fR -and bracket expressions -as with newline-sensitive matching, but not -\fB^\fR -and `\fB$\fR'. -.PP -If inverse partial newline-sensitive matching is specified, -this affects -\fB^\fR -and -\fB$\fR -as with -newline-sensitive matching, -but not -\fB.\fR -and bracket expressions. -This isn't very useful but is provided for symmetry. -.SH "LIMITS AND COMPATIBILITY" -No particular limit is imposed on the length of REs. -Programs intended to be highly portable should not employ REs longer -than 256 bytes, -as a POSIX-compliant implementation can refuse to accept such REs. -.PP -The only feature of AREs that is actually incompatible with -POSIX EREs is that -\fB\e\fR -does not lose its special -significance inside bracket expressions. -All other ARE features use syntax which is illegal or has -undefined or unspecified effects in POSIX EREs; -the -\fB***\fR -syntax of directors likewise is outside the POSIX -syntax for both BREs and EREs. -.PP -Many of the ARE extensions are borrowed from Perl, but some have -been changed to clean them up, and a few Perl extensions are not present. -Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR', -the lack of special treatment for a trailing newline, -the addition of complemented bracket expressions to the things -affected by newline-sensitive matching, -the restrictions on parentheses and back references in lookahead constraints, -and the longest/shortest-match (rather than first-match) matching semantics. -.PP -The matching rules for REs containing both normal and non-greedy quantifiers -have changed since early beta-test versions of this package. -(The new rules are much simpler and cleaner, -but don't work as hard at guessing the user's real intentions.) -.PP -Henry Spencer's original 1986 \fIregexp\fR package, -still in widespread use (e.g., in pre-8.1 releases of Tcl), -implemented an early version of today's EREs. -There are four incompatibilities between \fIregexp\fR's near-EREs -(`RREs' for short) and AREs. -In roughly increasing order of significance: -.PP -.RS -In AREs, -\fB\e\fR -followed by an alphanumeric character is either an -escape or an error, -while in RREs, it was just another way of writing the -alphanumeric. -This should not be a problem because there was no reason to write -such a sequence in RREs. -.PP -\fB{\fR -followed by a digit in an ARE is the beginning of a bound, -while in RREs, -\fB{\fR -was always an ordinary character. -Such sequences should be rare, -and will often result in an error because following characters -will not look like a valid bound. -.PP -In AREs, -\fB\e\fR -remains a special character within `\fB[\|]\fR', -so a literal -\fB\e\fR -within -\fB[\|]\fR -must be written `\fB\e\e\fR'. -\fB\e\e\fR -also gives a literal -\fB\e\fR -within -\fB[\|]\fR -in RREs, -but only truly paranoid programmers routinely doubled the backslash. -.PP -AREs report the longest/shortest match for the RE, -rather than the first found in a specified search order. -This may affect some RREs which were written in the expectation that -the first match would be reported. -(The careful crafting of RREs to optimize the search order for fast -matching is obsolete (AREs examine all possible matches -in parallel, and their performance is largely insensitive to their -complexity) but cases where the search order was exploited to deliberately -find a match which was \fInot\fR the longest/shortest will need rewriting.) -.RE - -.SH "BASIC REGULAR EXPRESSIONS" -BREs differ from EREs in several respects. `\fB|\fR', `\fB+\fR', -and -\fB?\fR -are ordinary characters and there is no equivalent -for their functionality. -The delimiters for bounds are -\fB\e{\fR -and `\fB\e}\fR', -with -\fB{\fR -and -\fB}\fR -by themselves ordinary characters. -The parentheses for nested subexpressions are -\fB\e(\fR -and `\fB\e)\fR', -with -\fB(\fR -and -\fB)\fR -by themselves ordinary characters. -\fB^\fR -is an ordinary character except at the beginning of the -RE or the beginning of a parenthesized subexpression, -\fB$\fR -is an ordinary character except at the end of the -RE or the end of a parenthesized subexpression, -and -\fB*\fR -is an ordinary character if it appears at the beginning of the -RE or the beginning of a parenthesized subexpression -(after a possible leading `\fB^\fR'). -Finally, -single-digit back references are available, -and -\fB\e<\fR -and -\fB\e>\fR -are synonyms for -\fB[[:<:]]\fR -and -\fB[[:>:]]\fR -respectively; -no other escapes are available. +.SH "SEE ALSO" +re_syntax(n) -.VE 8.1 .SH KEYWORDS match, regular expression, string diff --git a/doc/regsub.n b/doc/regsub.n index 516a417..65c22a6 100644 --- a/doc/regsub.n +++ b/doc/regsub.n @@ -5,7 +5,7 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: regsub.n,v 1.2 1998/09/14 18:39:54 stanton Exp $ +'\" RCS: @(#) $Id: regsub.n,v 1.3 1999/06/24 21:15:13 jpeek Exp $ '\" .so man.macros .TH regsub n 7.4 Tcl "Tcl Built-In Commands" @@ -23,6 +23,8 @@ This command matches the regular expression \fIexp\fR against \fIstring\fR, and it copies \fIstring\fR to the variable whose name is given by \fIvarName\fR. +(Regular expression matching is described in the \fBre_syntax\fR +reference page.) If there is a match, then while copying \fIstring\fR to \fIvarName\fR the portion of \fIstring\fR that matched \fIexp\fR is replaced with \fIsubSpec\fR. diff --git a/doc/switch.n b/doc/switch.n index 503a5ba..fb7ac60 100644 --- a/doc/switch.n +++ b/doc/switch.n @@ -5,7 +5,7 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: switch.n,v 1.2 1998/09/14 18:39:55 stanton Exp $ +'\" RCS: @(#) $Id: switch.n,v 1.3 1999/06/24 21:15:14 jpeek Exp $ '\" .so man.macros .TH switch n 7.0 Tcl "Tcl Built-In Commands" @@ -47,7 +47,7 @@ When matching \fIstring\fR to the patterns, use glob-style matching \fB\-regexp\fR When matching \fIstring\fR to the patterns, use regular expression matching -(i.e. the same as implemented by the \fBregexp\fR command). +(as described in the \fBre_syntax\fR reference page). .TP 10 \fB\-\|\-\fR Marks the end of options. The argument following this one will -- cgit v0.12