diff options
author | William Joye <wjoye@cfa.harvard.edu> | 2016-12-21 22:56:22 (GMT) |
---|---|---|
committer | William Joye <wjoye@cfa.harvard.edu> | 2016-12-21 22:56:22 (GMT) |
commit | d1a6de55efc90f190dee42ab8c4fa9070834e77d (patch) | |
tree | ec633f5608ef498bee52a5f42c12c49493ec8bf8 /tcl8.6/generic/regc_lex.c | |
parent | 5514e37335c012cc70f5b9aee3cedfe3d57f583f (diff) | |
parent | 98acd3f494b28ddd8c345a2bb9311e41e2d56ddd (diff) | |
download | blt-d1a6de55efc90f190dee42ab8c4fa9070834e77d.zip blt-d1a6de55efc90f190dee42ab8c4fa9070834e77d.tar.gz blt-d1a6de55efc90f190dee42ab8c4fa9070834e77d.tar.bz2 |
Merge commit '98acd3f494b28ddd8c345a2bb9311e41e2d56ddd' as 'tcl8.6'
Diffstat (limited to 'tcl8.6/generic/regc_lex.c')
-rw-r--r-- | tcl8.6/generic/regc_lex.c | 1195 |
1 files changed, 1195 insertions, 0 deletions
diff --git a/tcl8.6/generic/regc_lex.c b/tcl8.6/generic/regc_lex.c new file mode 100644 index 0000000..affcb48 --- /dev/null +++ b/tcl8.6/generic/regc_lex.c @@ -0,0 +1,1195 @@ +/* + * lexical analyzer + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation of + * software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* scanning macros (know about v) */ +#define ATEOS() (v->now >= v->stop) +#define HAVE(n) (v->stop - v->now >= (n)) +#define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) +#define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) +#define NEXT3(a,b,c) \ + (HAVE(3) && *v->now == CHR(a) && \ + *(v->now+1) == CHR(b) && \ + *(v->now+2) == CHR(c)) +#define SET(c) (v->nexttype = (c)) +#define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) +#define RET(c) return (SET(c), 1) +#define RETV(c, n) return (SETV(c, n), 1) +#define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ +#define LASTTYPE(t) (v->lasttype == (t)) + +/* lexical contexts */ +#define L_ERE 1 /* mainline ERE/ARE */ +#define L_BRE 2 /* mainline BRE */ +#define L_Q 3 /* REG_QUOTE */ +#define L_EBND 4 /* ERE/ARE bound */ +#define L_BBND 5 /* BRE bound */ +#define L_BRACK 6 /* brackets */ +#define L_CEL 7 /* collating element */ +#define L_ECL 8 /* equivalence class */ +#define L_CCL 9 /* character class */ +#define INTOCON(c) (v->lexcon = (c)) +#define INCON(con) (v->lexcon == (con)) + +/* construct pointer past end of chr array */ +#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) + +/* + - lexstart - set up lexical stuff, scan leading options + ^ static void lexstart(struct vars *); + */ +static void +lexstart( + struct vars *v) +{ + prefixes(v); /* may turn on new type bits etc. */ + NOERR(); + + if (v->cflags®_QUOTE) { + assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))); + INTOCON(L_Q); + } else if (v->cflags®_EXTENDED) { + assert(!(v->cflags®_QUOTE)); + INTOCON(L_ERE); + } else { + assert(!(v->cflags&(REG_QUOTE|REG_ADVF))); + INTOCON(L_BRE); + } + + v->nexttype = EMPTY; /* remember we were at the start */ + next(v); /* set up the first token */ +} + +/* + - prefixes - implement various special prefixes + ^ static void prefixes(struct vars *); + */ +static void +prefixes( + struct vars *v) +{ + /* + * Literal string doesn't get any of this stuff. + */ + + if (v->cflags®_QUOTE) { + return; + } + + /* + * Initial "***" gets special things. + */ + + if (HAVE(4) && NEXT3('*', '*', '*')) { + switch (*(v->now + 3)) { + case CHR('?'): /* "***?" error, msg shows version */ + ERR(REG_BADPAT); + return; /* proceed no further */ + break; + case CHR('='): /* "***=" shifts to literal string */ + NOTE(REG_UNONPOSIX); + v->cflags |= REG_QUOTE; + v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE); + v->now += 4; + return; /* and there can be no more prefixes */ + break; + case CHR(':'): /* "***:" shifts to AREs */ + NOTE(REG_UNONPOSIX); + v->cflags |= REG_ADVANCED; + v->now += 4; + break; + default: /* otherwise *** is just an error */ + ERR(REG_BADRPT); + return; + break; + } + } + + /* + * BREs and EREs don't get embedded options. + */ + + if ((v->cflags®_ADVANCED) != REG_ADVANCED) { + return; + } + + /* + * Embedded options (AREs only). + */ + + if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) { + NOTE(REG_UNONPOSIX); + v->now += 2; + for (; !ATEOS() && iscalpha(*v->now); v->now++) { + switch (*v->now) { + case CHR('b'): /* BREs (but why???) */ + v->cflags &= ~(REG_ADVANCED|REG_QUOTE); + break; + case CHR('c'): /* case sensitive */ + v->cflags &= ~REG_ICASE; + break; + case CHR('e'): /* plain EREs */ + v->cflags |= REG_EXTENDED; + v->cflags &= ~(REG_ADVF|REG_QUOTE); + break; + case CHR('i'): /* case insensitive */ + v->cflags |= REG_ICASE; + break; + case CHR('m'): /* Perloid synonym for n */ + case CHR('n'): /* \n affects ^ $ . [^ */ + v->cflags |= REG_NEWLINE; + break; + case CHR('p'): /* ~Perl, \n affects . [^ */ + v->cflags |= REG_NLSTOP; + v->cflags &= ~REG_NLANCH; + break; + case CHR('q'): /* literal string */ + v->cflags |= REG_QUOTE; + v->cflags &= ~REG_ADVANCED; + break; + case CHR('s'): /* single line, \n ordinary */ + v->cflags &= ~REG_NEWLINE; + break; + case CHR('t'): /* tight syntax */ + v->cflags &= ~REG_EXPANDED; + break; + case CHR('w'): /* weird, \n affects ^ $ only */ + v->cflags &= ~REG_NLSTOP; + v->cflags |= REG_NLANCH; + break; + case CHR('x'): /* expanded syntax */ + v->cflags |= REG_EXPANDED; + break; + default: + ERR(REG_BADOPT); + return; + } + } + if (!NEXT1(')')) { + ERR(REG_BADOPT); + return; + } + v->now++; + if (v->cflags®_QUOTE) { + v->cflags &= ~(REG_EXPANDED|REG_NEWLINE); + } + } +} + +/* + - lexnest - "call a subroutine", interpolating string at the lexical level + * Note, this is not a very general facility. There are a number of + * implicit assumptions about what sorts of strings can be subroutines. + ^ static void lexnest(struct vars *, const chr *, const chr *); + */ +static void +lexnest( + struct vars *v, + const chr *beginp, /* start of interpolation */ + const chr *endp) /* one past end of interpolation */ +{ + assert(v->savenow == NULL); /* only one level of nesting */ + v->savenow = v->now; + v->savestop = v->stop; + v->now = beginp; + v->stop = endp; +} + +/* + * string constants to interpolate as expansions of things like \d + */ + +static const chr backd[] = { /* \d */ + CHR('['), CHR('['), CHR(':'), + CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), + CHR(':'), CHR(']'), CHR(']') +}; +static const chr backD[] = { /* \D */ + CHR('['), CHR('^'), CHR('['), CHR(':'), + CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), + CHR(':'), CHR(']'), CHR(']') +}; +static const chr brbackd[] = { /* \d within brackets */ + CHR('['), CHR(':'), + CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), + CHR(':'), CHR(']') +}; +static const chr backs[] = { /* \s */ + CHR('['), CHR('['), CHR(':'), + CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), + CHR(':'), CHR(']'), CHR(']') +}; +static const chr backS[] = { /* \S */ + CHR('['), CHR('^'), CHR('['), CHR(':'), + CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), + CHR(':'), CHR(']'), CHR(']') +}; +static const chr brbacks[] = { /* \s within brackets */ + CHR('['), CHR(':'), + CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), + CHR(':'), CHR(']') +}; + +#define PUNCT_CONN \ + CHR('_'), \ + 0x203f /* UNDERTIE */, \ + 0x2040 /* CHARACTER TIE */,\ + 0x2054 /* INVERTED UNDERTIE */,\ + 0xfe33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */, \ + 0xfe34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */, \ + 0xfe4d /* DASHED LOW LINE */, \ + 0xfe4e /* CENTRELINE LOW LINE */, \ + 0xfe4f /* WAVY LOW LINE */, \ + 0xff3f /* FULLWIDTH LOW LINE */ + +static const chr backw[] = { /* \w */ + CHR('['), CHR('['), CHR(':'), + CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), + CHR(':'), CHR(']'), PUNCT_CONN, CHR(']') +}; +static const chr backW[] = { /* \W */ + CHR('['), CHR('^'), CHR('['), CHR(':'), + CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), + CHR(':'), CHR(']'), PUNCT_CONN, CHR(']') +}; +static const chr brbackw[] = { /* \w within brackets */ + CHR('['), CHR(':'), + CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), + CHR(':'), CHR(']'), PUNCT_CONN +}; + +/* + - lexword - interpolate a bracket expression for word characters + * Possibly ought to inquire whether there is a "word" character class. + ^ static void lexword(struct vars *); + */ +static void +lexword( + struct vars *v) +{ + lexnest(v, backw, ENDOF(backw)); +} + +/* + - next - get next token + ^ static int next(struct vars *); + */ +static int /* 1 normal, 0 failure */ +next( + struct vars *v) +{ + chr c; + + /* + * Errors yield an infinite sequence of failures. + */ + + if (ISERR()) { + return 0; /* the error has set nexttype to EOS */ + } + + /* + * Remember flavor of last token. + */ + + v->lasttype = v->nexttype; + + /* + * REG_BOSONLY + */ + + if (v->nexttype == EMPTY && (v->cflags®_BOSONLY)) { + /* at start of a REG_BOSONLY RE */ + RETV(SBEGIN, 0); /* same as \A */ + } + + /* + * If we're nested and we've hit end, return to outer level. + */ + + if (v->savenow != NULL && ATEOS()) { + v->now = v->savenow; + v->stop = v->savestop; + v->savenow = v->savestop = NULL; + } + + /* + * Skip white space etc. if appropriate (not in literal or []) + */ + + if (v->cflags®_EXPANDED) { + switch (v->lexcon) { + case L_ERE: + case L_BRE: + case L_EBND: + case L_BBND: + skip(v); + break; + } + } + + /* + * Handle EOS, depending on context. + */ + + if (ATEOS()) { + switch (v->lexcon) { + case L_ERE: + case L_BRE: + case L_Q: + RET(EOS); + break; + case L_EBND: + case L_BBND: + FAILW(REG_EBRACE); + break; + case L_BRACK: + case L_CEL: + case L_ECL: + case L_CCL: + FAILW(REG_EBRACK); + break; + } + assert(NOTREACHED); + } + + /* + * Okay, time to actually get a character. + */ + + c = *v->now++; + + /* + * Deal with the easy contexts, punt EREs to code below. + */ + + switch (v->lexcon) { + case L_BRE: /* punt BREs to separate function */ + return brenext(v, c); + break; + case L_ERE: /* see below */ + break; + case L_Q: /* literal strings are easy */ + RETV(PLAIN, c); + break; + case L_BBND: /* bounds are fairly simple */ + case L_EBND: + switch (c) { + case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): + case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): + case CHR('8'): case CHR('9'): + RETV(DIGIT, (chr)DIGITVAL(c)); + break; + case CHR(','): + RET(','); + break; + case CHR('}'): /* ERE bound ends with } */ + if (INCON(L_EBND)) { + INTOCON(L_ERE); + if ((v->cflags®_ADVF) && NEXT1('?')) { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('}', 0); + } + RETV('}', 1); + } else { + FAILW(REG_BADBR); + } + break; + case CHR('\\'): /* BRE bound ends with \} */ + if (INCON(L_BBND) && NEXT1('}')) { + v->now++; + INTOCON(L_BRE); + RET('}'); + } else { + FAILW(REG_BADBR); + } + break; + default: + FAILW(REG_BADBR); + break; + } + assert(NOTREACHED); + break; + case L_BRACK: /* brackets are not too hard */ + switch (c) { + case CHR(']'): + if (LASTTYPE('[')) { + RETV(PLAIN, c); + } else { + INTOCON((v->cflags®_EXTENDED) ? L_ERE : L_BRE); + RET(']'); + } + break; + case CHR('\\'): + NOTE(REG_UBBS); + if (!(v->cflags®_ADVF)) { + RETV(PLAIN, c); + } + NOTE(REG_UNONPOSIX); + if (ATEOS()) { + FAILW(REG_EESCAPE); + } + (DISCARD)lexescape(v); + switch (v->nexttype) { /* not all escapes okay here */ + case PLAIN: + return 1; + break; + case CCLASS: + switch (v->nextvalue) { + case 'd': + lexnest(v, brbackd, ENDOF(brbackd)); + break; + case 's': + lexnest(v, brbacks, ENDOF(brbacks)); + break; + case 'w': + lexnest(v, brbackw, ENDOF(brbackw)); + break; + default: + FAILW(REG_EESCAPE); + break; + } + + /* + * lexnest() done, back up and try again. + */ + + v->nexttype = v->lasttype; + return next(v); + break; + } + + /* + * Not one of the acceptable escapes. + */ + + FAILW(REG_EESCAPE); + break; + case CHR('-'): + if (LASTTYPE('[') || NEXT1(']')) { + RETV(PLAIN, c); + } else { + RETV(RANGE, c); + } + break; + case CHR('['): + if (ATEOS()) { + FAILW(REG_EBRACK); + } + switch (*v->now++) { + case CHR('.'): + INTOCON(L_CEL); + + /* + * Might or might not be locale-specific. + */ + + RET(COLLEL); + break; + case CHR('='): + INTOCON(L_ECL); + NOTE(REG_ULOCALE); + RET(ECLASS); + break; + case CHR(':'): + INTOCON(L_CCL); + NOTE(REG_ULOCALE); + RET(CCLASS); + break; + default: /* oops */ + v->now--; + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + default: + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + case L_CEL: /* collating elements are easy */ + if (c == CHR('.') && NEXT1(']')) { + v->now++; + INTOCON(L_BRACK); + RETV(END, '.'); + } else { + RETV(PLAIN, c); + } + break; + case L_ECL: /* ditto equivalence classes */ + if (c == CHR('=') && NEXT1(']')) { + v->now++; + INTOCON(L_BRACK); + RETV(END, '='); + } else { + RETV(PLAIN, c); + } + break; + case L_CCL: /* ditto character classes */ + if (c == CHR(':') && NEXT1(']')) { + v->now++; + INTOCON(L_BRACK); + RETV(END, ':'); + } else { + RETV(PLAIN, c); + } + break; + default: + assert(NOTREACHED); + break; + } + + /* + * That got rid of everything except EREs and AREs. + */ + + assert(INCON(L_ERE)); + + /* + * Deal with EREs and AREs, except for backslashes. + */ + + switch (c) { + case CHR('|'): + RET('|'); + break; + case CHR('*'): + if ((v->cflags®_ADVF) && NEXT1('?')) { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('*', 0); + } + RETV('*', 1); + break; + case CHR('+'): + if ((v->cflags®_ADVF) && NEXT1('?')) { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('+', 0); + } + RETV('+', 1); + break; + case CHR('?'): + if ((v->cflags®_ADVF) && NEXT1('?')) { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('?', 0); + } + RETV('?', 1); + break; + case CHR('{'): /* bounds start or plain character */ + if (v->cflags®_EXPANDED) { + skip(v); + } + if (ATEOS() || !iscdigit(*v->now)) { + NOTE(REG_UBRACES); + NOTE(REG_UUNSPEC); + RETV(PLAIN, c); + } else { + NOTE(REG_UBOUNDS); + INTOCON(L_EBND); + RET('{'); + } + assert(NOTREACHED); + break; + case CHR('('): /* parenthesis, or advanced extension */ + if ((v->cflags®_ADVF) && NEXT1('?')) { + NOTE(REG_UNONPOSIX); + v->now++; + switch (*v->now++) { + case CHR(':'): /* non-capturing paren */ + RETV('(', 0); + break; + case CHR('#'): /* comment */ + while (!ATEOS() && *v->now != CHR(')')) { + v->now++; + } + if (!ATEOS()) { + v->now++; + } + assert(v->nexttype == v->lasttype); + return next(v); + break; + case CHR('='): /* positive lookahead */ + NOTE(REG_ULOOKAHEAD); + RETV(LACON, 1); + break; + case CHR('!'): /* negative lookahead */ + NOTE(REG_ULOOKAHEAD); + RETV(LACON, 0); + break; + default: + FAILW(REG_BADRPT); + break; + } + assert(NOTREACHED); + } + if (v->cflags®_NOSUB) { + RETV('(', 0); /* all parens non-capturing */ + } else { + RETV('(', 1); + } + break; + case CHR(')'): + if (LASTTYPE('(')) { + NOTE(REG_UUNSPEC); + } + RETV(')', c); + break; + case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ + if (HAVE(6) && *(v->now+0) == CHR('[') && + *(v->now+1) == CHR(':') && + (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && + *(v->now+3) == CHR(':') && + *(v->now+4) == CHR(']') && + *(v->now+5) == CHR(']')) { + c = *(v->now+2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == CHR('<')) ? '<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + RET('^'); + break; + case CHR('$'): + RET('$'); + break; + case CHR('\\'): /* mostly punt backslashes to code below */ + if (ATEOS()) { + FAILW(REG_EESCAPE); + } + break; + default: /* ordinary character */ + RETV(PLAIN, c); + break; + } + + /* + * ERE/ARE backslash handling; backslash already eaten. + */ + + assert(!ATEOS()); + if (!(v->cflags®_ADVF)) {/* only AREs have non-trivial escapes */ + if (iscalnum(*v->now)) { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, *v->now++); + } + (DISCARD)lexescape(v); + if (ISERR()) { + FAILW(REG_EESCAPE); + } + if (v->nexttype == CCLASS) {/* fudge at lexical level */ + switch (v->nextvalue) { + case 'd': lexnest(v, backd, ENDOF(backd)); break; + case 'D': lexnest(v, backD, ENDOF(backD)); break; + case 's': lexnest(v, backs, ENDOF(backs)); break; + case 'S': lexnest(v, backS, ENDOF(backS)); break; + case 'w': lexnest(v, backw, ENDOF(backw)); break; + case 'W': lexnest(v, backW, ENDOF(backW)); break; + default: + assert(NOTREACHED); + FAILW(REG_ASSERT); + break; + } + /* lexnest done, back up and try again */ + v->nexttype = v->lasttype; + return next(v); + } + + /* + * Otherwise, lexescape has already done the work. + */ + + return !ISERR(); +} + +/* + - lexescape - parse an ARE backslash escape (backslash already eaten) + * Note slightly nonstandard use of the CCLASS type code. + ^ static int lexescape(struct vars *); + */ +static int /* not actually used, but convenient for RETV */ +lexescape( + struct vars *v) +{ + chr c; + int i; + static const chr alert[] = { + CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') + }; + static const chr esc[] = { + CHR('E'), CHR('S'), CHR('C') + }; + const chr *save; + + assert(v->cflags®_ADVF); + + assert(!ATEOS()); + c = *v->now++; + if (!iscalnum(c)) { + RETV(PLAIN, c); + } + + NOTE(REG_UNONPOSIX); + switch (c) { + case CHR('a'): + RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); + break; + case CHR('A'): + RETV(SBEGIN, 0); + break; + case CHR('b'): + RETV(PLAIN, CHR('\b')); + break; + case CHR('B'): + RETV(PLAIN, CHR('\\')); + break; + case CHR('c'): + NOTE(REG_UUNPORT); + if (ATEOS()) { + FAILW(REG_EESCAPE); + } + RETV(PLAIN, (chr)(*v->now++ & 037)); + break; + case CHR('d'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'd'); + break; + case CHR('D'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'D'); + break; + case CHR('e'): + NOTE(REG_UUNPORT); + RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); + break; + case CHR('f'): + RETV(PLAIN, CHR('\f')); + break; + case CHR('m'): + RET('<'); + break; + case CHR('M'): + RET('>'); + break; + case CHR('n'): + RETV(PLAIN, CHR('\n')); + break; + case CHR('r'): + RETV(PLAIN, CHR('\r')); + break; + case CHR('s'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 's'); + break; + case CHR('S'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'S'); + break; + case CHR('t'): + RETV(PLAIN, CHR('\t')); + break; + case CHR('u'): + c = (uchr) lexdigits(v, 16, 1, 4); + if (ISERR()) { + FAILW(REG_EESCAPE); + } + RETV(PLAIN, c); + break; + case CHR('U'): + i = lexdigits(v, 16, 1, 8); + if (ISERR()) { + FAILW(REG_EESCAPE); + } + if (i > 0xFFFF) { + /* TODO: output a Surrogate pair + */ + i = 0xFFFD; + } + RETV(PLAIN, (uchr) i); + break; + case CHR('v'): + RETV(PLAIN, CHR('\v')); + break; + case CHR('w'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'w'); + break; + case CHR('W'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'W'); + break; + case CHR('x'): + NOTE(REG_UUNPORT); + c = (uchr) lexdigits(v, 16, 1, 2); + if (ISERR()) { + FAILW(REG_EESCAPE); + } + RETV(PLAIN, c); + break; + case CHR('y'): + NOTE(REG_ULOCALE); + RETV(WBDRY, 0); + break; + case CHR('Y'): + NOTE(REG_ULOCALE); + RETV(NWBDRY, 0); + break; + case CHR('Z'): + RETV(SEND, 0); + break; + case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): + case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): + case CHR('9'): + save = v->now; + v->now--; /* put first digit back */ + c = (uchr) lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ + if (ISERR()) { + FAILW(REG_EESCAPE); + } + + /* + * Ugly heuristic (first test is "exactly 1 digit?") + */ + + if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) { + NOTE(REG_UBACKREF); + RETV(BACKREF, (chr)c); + } + + /* + * Oops, doesn't look like it's a backref after all... + */ + + v->now = save; + + /* + * And fall through into octal number. + */ + + case CHR('0'): + NOTE(REG_UUNPORT); + v->now--; /* put first digit back */ + c = (uchr) lexdigits(v, 8, 1, 3); + if (ISERR()) { + FAILW(REG_EESCAPE); + } + if (c > 0xff) { + /* out of range, so we handled one digit too much */ + v->now--; + c >>= 3; + } + RETV(PLAIN, c); + break; + default: + assert(iscalpha(c)); + FAILW(REG_EESCAPE); /* unknown alphabetic escape */ + break; + } + assert(NOTREACHED); +} + +/* + - lexdigits - slurp up digits and return chr value + ^ static int lexdigits(struct vars *, int, int, int); + */ +static int /* chr value; errors signalled via ERR */ +lexdigits( + struct vars *v, + int base, + int minlen, + int maxlen) +{ + int n; + int len; + chr c; + int d; + const uchr ub = (uchr) base; + + n = 0; + for (len = 0; len < maxlen && !ATEOS(); len++) { + if (n > 0x10fff) { + /* Stop when continuing would otherwise overflow */ + break; + } + c = *v->now++; + switch (c) { + case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): + case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): + case CHR('8'): case CHR('9'): + d = DIGITVAL(c); + break; + case CHR('a'): case CHR('A'): d = 10; break; + case CHR('b'): case CHR('B'): d = 11; break; + case CHR('c'): case CHR('C'): d = 12; break; + case CHR('d'): case CHR('D'): d = 13; break; + case CHR('e'): case CHR('E'): d = 14; break; + case CHR('f'): case CHR('F'): d = 15; break; + default: + v->now--; /* oops, not a digit at all */ + d = -1; + break; + } + + if (d >= base) { /* not a plausible digit */ + v->now--; + d = -1; + } + if (d < 0) { + break; /* NOTE BREAK OUT */ + } + n = n*ub + (uchr)d; + } + if (len < minlen) { + ERR(REG_EESCAPE); + } + + return n; +} + +/* + - brenext - get next BRE token + * This is much like EREs except for all the stupid backslashes and the + * context-dependency of some things. + ^ static int brenext(struct vars *, pchr); + */ +static int /* 1 normal, 0 failure */ +brenext( + struct vars *v, + pchr pc) +{ + chr c = (chr)pc; + + switch (c) { + case CHR('*'): + if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) { + RETV(PLAIN, c); + } + RET('*'); + break; + case CHR('['): + if (HAVE(6) && *(v->now+0) == CHR('[') && + *(v->now+1) == CHR(':') && + (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && + *(v->now+3) == CHR(':') && + *(v->now+4) == CHR(']') && + *(v->now+5) == CHR(']')) { + c = *(v->now+2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == CHR('<')) ? '<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + if (LASTTYPE(EMPTY)) { + RET('^'); + } + if (LASTTYPE('(')) { + NOTE(REG_UUNSPEC); + RET('^'); + } + RETV(PLAIN, c); + break; + case CHR('$'): + if (v->cflags®_EXPANDED) { + skip(v); + } + if (ATEOS()) { + RET('$'); + } + if (NEXT2('\\', ')')) { + NOTE(REG_UUNSPEC); + RET('$'); + } + RETV(PLAIN, c); + break; + case CHR('\\'): + break; /* see below */ + default: + RETV(PLAIN, c); + break; + } + + assert(c == CHR('\\')); + + if (ATEOS()) { + FAILW(REG_EESCAPE); + } + + c = *v->now++; + switch (c) { + case CHR('{'): + INTOCON(L_BBND); + NOTE(REG_UBOUNDS); + RET('{'); + break; + case CHR('('): + RETV('(', 1); + break; + case CHR(')'): + RETV(')', c); + break; + case CHR('<'): + NOTE(REG_UNONPOSIX); + RET('<'); + break; + case CHR('>'): + NOTE(REG_UNONPOSIX); + RET('>'); + break; + case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): + case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): + case CHR('9'): + NOTE(REG_UBACKREF); + RETV(BACKREF, (chr)DIGITVAL(c)); + break; + default: + if (iscalnum(c)) { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, c); + break; + } + + assert(NOTREACHED); +} + +/* + - skip - skip white space and comments in expanded form + ^ static void skip(struct vars *); + */ +static void +skip( + struct vars *v) +{ + const chr *start = v->now; + + assert(v->cflags®_EXPANDED); + + for (;;) { + while (!ATEOS() && iscspace(*v->now)) { + v->now++; + } + if (ATEOS() || *v->now != CHR('#')) { + break; /* NOTE BREAK OUT */ + } + assert(NEXT1('#')); + while (!ATEOS() && *v->now != CHR('\n')) { + v->now++; + } + + /* + * Leave the newline to be picked up by the iscspace loop. + */ + } + + if (v->now != start) { + NOTE(REG_UNONPOSIX); + } +} + +/* + - newline - return the chr for a newline + * This helps confine use of CHR to this source file. + ^ static chr newline(NOPARMS); + */ +static chr +newline(void) +{ + return CHR('\n'); +} + +/* + - chrnamed - return the chr known by a given (chr string) name + * The code is a bit clumsy, but this routine gets only such specialized + * use that it hardly matters. + ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr); + */ +static chr +chrnamed( + struct vars *v, + const chr *startp, /* start of name */ + const chr *endp, /* just past end of name */ + pchr lastresort) /* what to return if name lookup fails */ +{ + celt c; + int errsave; + int e; + struct cvec *cv; + + errsave = v->err; + v->err = 0; + c = element(v, startp, endp); + e = v->err; + v->err = errsave; + + if (e != 0) { + return (chr)lastresort; + } + + cv = range(v, c, c, 0); + if (cv->nchrs == 0) { + return (chr)lastresort; + } + return cv->chrs[0]; +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ |