/* * lexical analyzer * This file is #included by regcomp.c. * * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. * * Development of this software was funded, in part, by Cray Research Inc., * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics * Corporation, none of whom are responsible for the results. The author * thanks all of them. * * Redistribution and use in source and binary forms -- with or without * modification -- are permitted for any purpose, provided that * redistributions in source form retain this entire copyright notice and * indicate the origin and nature of any modifications. * * I'd appreciate being given credit for this package in the documentation of * software which uses it, but that is not a requirement. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* scanning macros (know about v) */ #define ATEOS() (v->now >= v->stop) #define HAVE(n) (v->stop - v->now >= (n)) #define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) #define NEXT3(a,b,c) \ (HAVE(3) && *v->now == CHR(a) && \ *(v->now+1) == CHR(b) && \ *(v->now+2) == CHR(c)) #define SET(c) (v->nexttype = (c)) #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) #define RET(c) return (SET(c), 1) #define RETV(c, n) return (SETV(c, n), 1) #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ #define LASTTYPE(t) (v->lasttype == (t)) /* lexical contexts */ #define L_ERE 1 /* mainline ERE/ARE */ #define L_BRE 2 /* mainline BRE */ #define L_Q 3 /* REG_QUOTE */ #define L_EBND 4 /* ERE/ARE bound */ #define L_BBND 5 /* BRE bound */ #define L_BRACK 6 /* brackets */ #define L_CEL 7 /* collating element */ #define L_ECL 8 /* equivalence class */ #define L_CCL 9 /* character class */ #define INTOCON(c) (v->lexcon = (c)) #define INCON(con) (v->lexcon == (con)) /* construct pointer past end of chr array */ #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) /* - lexstart - set up lexical stuff, scan leading options ^ static void lexstart(struct vars *); */ static void lexstart( struct vars *v) { prefixes(v); /* may turn on new type bits etc. */ NOERR(); if (v->cflags®_QUOTE) { assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))); INTOCON(L_Q); } else if (v->cflags®_EXTENDED) { assert(!(v->cflags®_QUOTE)); INTOCON(L_ERE); } else { assert(!(v->cflags&(REG_QUOTE|REG_ADVF))); INTOCON(L_BRE); } v->nexttype = EMPTY; /* remember we were at the start */ next(v); /* set up the first token */ } /* - prefixes - implement various special prefixes ^ static void prefixes(struct vars *); */ static void prefixes( struct vars *v) { /* * Literal string doesn't get any of this stuff. */ if (v->cflags®_QUOTE) { return; } /* * Initial "***" gets special things. */ if (HAVE(4) && NEXT3('*', '*', '*')) { switch (*(v->now + 3)) { case CHR('?'): /* "***?" error, msg shows version */ ERR(REG_BADPAT); return; /* proceed no further */ break; case CHR('='): /* "***=" shifts to literal string */ NOTE(REG_UNONPOSIX); v->cflags |= REG_QUOTE; v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE); v->now += 4; return; /* and there can be no more prefixes */ break; case CHR(':'): /* "***:" shifts to AREs */ NOTE(REG_UNONPOSIX); v->cflags |= REG_ADVANCED; v->now += 4; break; default: /* otherwise *** is just an error */ ERR(REG_BADRPT); return; break; } } /* * BREs and EREs don't get embedded options. */ if ((v->cflags®_ADVANCED) != REG_ADVANCED) { return; } /* * Embedded options (AREs only). */ if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) { NOTE(REG_UNONPOSIX); v->now += 2; for (; !ATEOS() && iscalpha(*v->now); v->now++) { switch (*v->now) { case CHR('b'): /* BREs (but why???) */ v->cflags &= ~(REG_ADVANCED|REG_QUOTE); break; case CHR('c'): /* case sensitive */ v->cflags &= ~REG_ICASE; break; case CHR('e'): /* plain EREs */ v->cflags |= REG_EXTENDED; v->cflags &= ~(REG_ADVF|REG_QUOTE); break; case CHR('i'): /* case insensitive */ v->cflags |= REG_ICASE; break; case CHR('m'): /* Perloid synonym for n */ case CHR('n'): /* \n affects ^ $ . [^ */ v->cflags |= REG_NEWLINE; break; case CHR('p'): /* ~Perl, \n affects . [^ */ v->cflags |= REG_NLSTOP; v->cflags &= ~REG_NLANCH; break; case CHR('q'): /* literal string */ v->cflags |= REG_QUOTE; v->cflags &= ~REG_ADVANCED; break; case CHR('s'): /* single line, \n ordinary */ v->cflags &= ~REG_NEWLINE; break; case CHR('t'): /* tight syntax */ v->cflags &= ~REG_EXPANDED; break; case CHR('w'): /* weird, \n affects ^ $ only */ v->cflags &= ~REG_NLSTOP; v->cflags |= REG_NLANCH; break; case CHR('x'): /* expanded syntax */ v->cflags |= REG_EXPANDED; break; default: ERR(REG_BADOPT); return; } } if (!NEXT1(')')) { ERR(REG_BADOPT); return; } v->now++; if (v->cflags®_QUOTE) { v->cflags &= ~(REG_EXPANDED|REG_NEWLINE); } } } /* - lexnest - "call a subroutine", interpolating string at the lexical level * Note, this is not a very general facility. There are a number of * implicit assumptions about what sorts of strings can be subroutines. ^ static void lexnest(struct vars *, const chr *, const chr *); */ static void lexnest( struct vars *v, const chr *beginp, /* start of interpolation */ const chr *endp) /* one past end of interpolation */ { assert(v->savenow == NULL); /* only one level of nesting */ v->savenow = v->now; v->savestop = v->stop; v->now = beginp; v->stop = endp; } /* * string constants to interpolate as expansions of things like \d */ static const chr backd[] = { /* \d */ CHR('['), CHR('['), CHR(':'), CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), CHR(':'), CHR(']'), CHR(']') }; static const chr backD[] = { /* \D */ CHR('['), CHR('^'), CHR('['), CHR(':'), CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), CHR(':'), CHR(']'), CHR(']') }; static const chr brbackd[] = { /* \d within brackets */ CHR('['), CHR(':'), CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), CHR(':'), CHR(']') }; static const chr backs[] = { /* \s */ CHR('['), CHR('['), CHR(':'), CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), CHR(':'), CHR(']'), CHR(']') }; static const chr backS[] = { /* \S */ CHR('['), CHR('^'), CHR('['), CHR(':'), CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), CHR(':'), CHR(']'), CHR(']') }; static const chr brbacks[] = { /* \s within brackets */ CHR('['), CHR(':'), CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), CHR(':'), CHR(']') }; #define PUNCT_CONN \ CHR('_'), \ 0x203f /* UNDERTIE */, \ 0x2040 /* CHARACTER TIE */,\ 0x2054 /* INVERTED UNDERTIE */,\ 0xfe33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */, \ 0xfe34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */, \ 0xfe4d /* DASHED LOW LINE */, \ 0xfe4e /* CENTRELINE LOW LINE */, \ 0xfe4f /* WAVY LOW LINE */, \ 0xff3f /* FULLWIDTH LOW LINE */ static const chr backw[] = { /* \w */ CHR('['), CHR('['), CHR(':'), CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), CHR(':'), CHR(']'), PUNCT_CONN, CHR(']') }; static const chr backW[] = { /* \W */ CHR('['), CHR('^'), CHR('['), CHR(':'), CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), CHR(':'), CHR(']'), PUNCT_CONN, CHR(']') }; static const chr brbackw[] = { /* \w within brackets */ CHR('['), CHR(':'), CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), CHR(':'), CHR(']'), PUNCT_CONN }; /* - lexword - interpolate a bracket expression for word characters * Possibly ought to inquire whether there is a "word" character class. ^ static void lexword(struct vars *); */ static void lexword( struct vars *v) { lexnest(v, backw, ENDOF(backw)); } /* - next - get next token ^ static int next(struct vars *); */ static int /* 1 normal, 0 failure */ next( struct vars *v) { chr c; /* * Errors yield an infinite sequence of failures. */ if (ISERR()) { return 0; /* the error has set nexttype to EOS */ } /* * Remember flavor of last token. */ v->lasttype = v->nexttype; /* * REG_BOSONLY */ if (v->nexttype == EMPTY && (v->cflags®_BOSONLY)) { /* at start of a REG_BOSONLY RE */ RETV(SBEGIN, 0); /* same as \A */ } /* * If we're nested and we've hit end, return to outer level. */ if (v->savenow != NULL && ATEOS()) { v->now = v->savenow; v->stop = v->savestop; v->savenow = v->savestop = NULL; } /* * Skip white space etc. if appropriate (not in literal or []) */ if (v->cflags®_EXPANDED) { switch (v->lexcon) { case L_ERE: case L_BRE: case L_EBND: case L_BBND: skip(v); break; } } /* * Handle EOS, depending on context. */ if (ATEOS()) { switch (v->lexcon) { case L_ERE: case L_BRE: case L_Q: RET(EOS); break; case L_EBND: case L_BBND: FAILW(REG_EBRACE); break; case L_BRACK: case L_CEL: case L_ECL: case L_CCL: FAILW(REG_EBRACK); break; } assert(NOTREACHED); } /* * Okay, time to actually get a character. */ c = *v->now++; /* * Deal with the easy contexts, punt EREs to code below. */ switch (v->lexcon) { case L_BRE: /* punt BREs to separate function */ return brenext(v, c); break; case L_ERE: /* see below */ break; case L_Q: /* literal strings are easy */ RETV(PLAIN, c); break; case L_BBND: /* bounds are fairly simple */ case L_EBND: switch (c) { case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): case CHR('9'): RETV(DIGIT, (chr)DIGITVAL(c)); break; case CHR(','): RET(','); break; case CHR('}'): /* ERE bound ends with } */ if (INCON(L_EBND)) { INTOCON(L_ERE); if ((v->cflags®_ADVF) && NEXT1('?')) { v->now++; NOTE(REG_UNONPOSIX); RETV('}', 0); } RETV('}', 1); } else { FAILW(REG_BADBR); } break; case CHR('\\'): /* BRE bound ends with \} */ if (INCON(L_BBND) && NEXT1('}')) { v->now++; INTOCON(L_BRE); RET('}'); } else { FAILW(REG_BADBR); } break; default: FAILW(REG_BADBR); break; } assert(NOTREACHED); break; case L_BRACK: /* brackets are not too hard */ switch (c) { case CHR(']'): if (LASTTYPE('[')) { RETV(PLAIN, c); } else { INTOCON((v->cflags®_EXTENDED) ? L_ERE : L_BRE); RET(']'); } break; case CHR('\\'): NOTE(REG_UBBS); if (!(v->cflags®_ADVF)) { RETV(PLAIN, c); } NOTE(REG_UNONPOSIX); if (ATEOS()) { FAILW(REG_EESCAPE); } (void)lexescape(v); switch (v->nexttype) { /* not all escapes okay here */ case PLAIN: return 1; break; case CCLASS: switch (v->nextvalue) { case 'd': lexnest(v, brbackd, ENDOF(brbackd)); break; case 's': lexnest(v, brbacks, ENDOF(brbacks)); break; case 'w': lexnest(v, brbackw, ENDOF(brbackw)); break; default: FAILW(REG_EESCAPE); break; } /* * lexnest() done, back up and try again. */ v->nexttype = v->lasttype; return next(v); break; } /* * Not one of the acceptable escapes. */ FAILW(REG_EESCAPE); break; case CHR('-'): if (LASTTYPE('[') || NEXT1(']')) { RETV(PLAIN, c); } else { RETV(RANGE, c); } break; case CHR('['): if (ATEOS()) { FAILW(REG_EBRACK); } switch (*v->now++) { case CHR('.'): INTOCON(L_CEL); /* * Might or might not be locale-specific. */ RET(COLLEL); break; case CHR('='): INTOCON(L_ECL); NOTE(REG_ULOCALE); RET(ECLASS); break; case CHR(':'): INTOCON(L_CCL); NOTE(REG_ULOCALE); RET(CCLASS); break; default: /* oops */ v->now--; RETV(PLAIN, c); break; } assert(NOTREACHED); break; default: RETV(PLAIN, c); break; } assert(NOTREACHED); break; case L_CEL: /* collating elements are easy */ if (c == CHR('.') && NEXT1(']')) { v->now++; INTOCON(L_BRACK); RETV(END, '.'); } else { RETV(PLAIN, c); } break; case L_ECL: /* ditto equivalence classes */ if (c == CHR('=') && NEXT1(']')) { v->now++; INTOCON(L_BRACK); RETV(END, '='); } else { RETV(PLAIN, c); } break; case L_CCL: /* ditto character classes */ if (c == CHR(':') && NEXT1(']')) { v->now++; INTOCON(L_BRACK); RETV(END, ':'); } else { RETV(PLAIN, c); } break; default: assert(NOTREACHED); break; } /* * That got rid of everything except EREs and AREs. */ assert(INCON(L_ERE)); /* * Deal with EREs and AREs, except for backslashes. */ switch (c) { case CHR('|'): RET('|'); break; case CHR('*'): if ((v->cflags®_ADVF) && NEXT1('?')) { v->now++; NOTE(REG_UNONPOSIX); RETV('*', 0); } RETV('*', 1); break; case CHR('+'): if ((v->cflags®_ADVF) && NEXT1('?')) { v->now++; NOTE(REG_UNONPOSIX); RETV('+', 0); } RETV('+', 1); break; case CHR('?'): if ((v->cflags®_ADVF) && NEXT1('?')) { v->now++; NOTE(REG_UNONPOSIX); RETV('?', 0); } RETV('?', 1); break; case CHR('{'): /* bounds start or plain character */ if (v->cflags®_EXPANDED) { skip(v); } if (ATEOS() || !iscdigit(*v->now)) { NOTE(REG_UBRACES); NOTE(REG_UUNSPEC); RETV(PLAIN, c); } else { NOTE(REG_UBOUNDS); INTOCON(L_EBND); RET('{'); } assert(NOTREACHED); break; case CHR('('): /* parenthesis, or advanced extension */ if ((v->cflags®_ADVF) && NEXT1('?')) { NOTE(REG_UNONPOSIX); v->now++; switch (*v->now++) { case CHR(':'): /* non-capturing paren */ RETV('(', 0); break; case CHR('#'): /* comment */ while (!ATEOS() && *v->now != CHR(')')) { v->now++; } if (!ATEOS()) { v->now++; } assert(v->nexttype == v->lasttype); return next(v); break; case CHR('='): /* positive lookahead */ NOTE(REG_ULOOKAHEAD); RETV(LACON, 1); break; case CHR('!'): /* negative lookahead */ NOTE(REG_ULOOKAHEAD); RETV(LACON, 0); break; default: FAILW(REG_BADRPT); break; } assert(NOTREACHED); } if (v->cflags®_NOSUB) { RETV('(', 0); /* all parens non-capturing */ } else { RETV('(', 1); } break; case CHR(')'): if (LASTTYPE('(')) { NOTE(REG_UUNSPEC); } RETV(')', c); break; case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ if (HAVE(6) && *(v->now+0) == CHR('[') && *(v->now+1) == CHR(':') && (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && *(v->now+3) == CHR(':') && *(v->now+4) == CHR(']') && *(v->now+5) == CHR(']')) { c = *(v->now+2); v->now += 6; NOTE(REG_UNONPOSIX); RET((c == CHR('<')) ? '<' : '>'); } INTOCON(L_BRACK); if (NEXT1('^')) { v->now++; RETV('[', 0); } RETV('[', 1); break; case CHR('.'): RET('.'); break; case CHR('^'): RET('^'); break; case CHR('$'): RET('$'); break; case CHR('\\'): /* mostly punt backslashes to code below */ if (ATEOS()) { FAILW(REG_EESCAPE); } break; default: /* ordinary character */ RETV(PLAIN, c); break; } /* * ERE/ARE backslash handling; backslash already eaten. */ assert(!ATEOS()); if (!(v->cflags®_ADVF)) {/* only AREs have non-trivial escapes */ if (iscalnum(*v->now)) { NOTE(REG_UBSALNUM); NOTE(REG_UUNSPEC); } RETV(PLAIN, *v->now++); } (void)lexescape(v); if (ISERR()) { FAILW(REG_EESCAPE); } if (v->nexttype == CCLASS) {/* fudge at lexical level */ switch (v->nextvalue) { case 'd': lexnest(v, backd, ENDOF(backd)); break; case 'D': lexnest(v, backD, ENDOF(backD)); break; case 's': lexnest(v, backs, ENDOF(backs)); break; case 'S': lexnest(v, backS, ENDOF(backS)); break; case 'w': lexnest(v, backw, ENDOF(backw)); break; case 'W': lexnest(v, backW, ENDOF(backW)); break; default: assert(NOTREACHED); FAILW(REG_ASSERT); break; } /* lexnest done, back up and try again */ v->nexttype = v->lasttype; return next(v); } /* * Otherwise, lexescape has already done the work. */ return !ISERR(); } /* - lexescape - parse an ARE backslash escape (backslash already eaten) * Note slightly nonstandard use of the CCLASS type code. ^ static int lexescape(struct vars *); */ static int /* not actually used, but convenient for RETV */ lexescape( struct vars *v) { chr c; int i; static const chr alert[] = { CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') }; static const chr esc[] = { CHR('E'), CHR('S'), CHR('C') }; const chr *save; assert(v->cflags®_ADVF); assert(!ATEOS()); c = *v->now++; if (!iscalnum(c)) { RETV(PLAIN, c); } NOTE(REG_UNONPOSIX); switch (c) { case CHR('a'): RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); break; case CHR('A'): RETV(SBEGIN, 0); break; case CHR('b'): RETV(PLAIN, CHR('\b')); break; case CHR('B'): RETV(PLAIN, CHR('\\')); break; case CHR('c'): NOTE(REG_UUNPORT); if (ATEOS()) { FAILW(REG_EESCAPE); } RETV(PLAIN, (chr)(*v->now++ & 037)); break; case CHR('d'): NOTE(REG_ULOCALE); RETV(CCLASS, 'd'); break; case CHR('D'): NOTE(REG_ULOCALE); RETV(CCLASS, 'D'); break; case CHR('e'): NOTE(REG_UUNPORT); RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); break; case CHR('f'): RETV(PLAIN, CHR('\f')); break; case CHR('m'): RET('<'); break; case CHR('M'): RET('>'); break; case CHR('n'): RETV(PLAIN, CHR('\n')); break; case CHR('r'): RETV(PLAIN, CHR('\r')); break; case CHR('s'): NOTE(REG_ULOCALE); RETV(CCLASS, 's'); break; case CHR('S'): NOTE(REG_ULOCALE); RETV(CCLASS, 'S'); break; case CHR('t'): RETV(PLAIN, CHR('\t')); break; case CHR('u'): c = (uchr) lexdigits(v, 16, 1, 4); if (ISERR()) { FAILW(REG_EESCAPE); } RETV(PLAIN, c); break; case CHR('U'): i = lexdigits(v, 16, 1, 8); if (ISERR()) { FAILW(REG_EESCAPE); } if (i > 0xFFFF) { /* TODO: output a Surrogate pair */ i = 0xFFFD; } RETV(PLAIN, (uchr) i); break; case CHR('v'): RETV(PLAIN, CHR('\v')); break; case CHR('w'): NOTE(REG_ULOCALE); RETV(CCLASS, 'w'); break; case CHR('W'): NOTE(REG_ULOCALE); RETV(CCLASS, 'W'); break; case CHR('x'): NOTE(REG_UUNPORT); c = (uchr) lexdigits(v, 16, 1, 2); if (ISERR()) { FAILW(REG_EESCAPE); } RETV(PLAIN, c); break; case CHR('y'): NOTE(REG_ULOCALE); RETV(WBDRY, 0); break; case CHR('Y'): NOTE(REG_ULOCALE); RETV(NWBDRY, 0); break; case CHR('Z'): RETV(SEND, 0); break; case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): case CHR('9'): save = v->now; v->now--; /* put first digit back */ c = (uchr) lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ if (ISERR()) { FAILW(REG_EESCAPE); } /* * Ugly heuristic (first test is "exactly 1 digit?") */ if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) { NOTE(REG_UBACKREF); RETV(BACKREF, (chr)c); } /* * Oops, doesn't look like it's a backref after all... */ v->now = save; /* * And fall through into octal number. */ case CHR('0'): NOTE(REG_UUNPORT); v->now--; /* put first digit back */ c = (uchr) lexdigits(v, 8, 1, 3); if (ISERR()) { FAILW(REG_EESCAPE); } if (c > 0xff) { /* out of range, so we handled one digit too much */ v->now--; c >>= 3; } RETV(PLAIN, c); break; default: assert(iscalpha(c)); FAILW(REG_EESCAPE); /* unknown alphabetic escape */ break; } assert(NOTREACHED); } /* - lexdigits - slurp up digits and return chr value ^ static int lexdigits(struct vars *, int, int, int); */ static int /* chr value; errors signalled via ERR */ lexdigits( struct vars *v, int base, int minlen, int maxlen) { int n; int len; chr c; int d; const uchr ub = (uchr) base; n = 0; for (len = 0; len < maxlen && !ATEOS(); len++) { if (n > 0x10fff) { /* Stop when continuing would otherwise overflow */ break; } c = *v->now++; switch (c) { case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): case CHR('9'): d = DIGITVAL(c); break; case CHR('a'): case CHR('A'): d = 10; break; case CHR('b'): case CHR('B'): d = 11; break; case CHR('c'): case CHR('C'): d = 12; break; case CHR('d'): case CHR('D'): d = 13; break; case CHR('e'): case CHR('E'): d = 14; break; case CHR('f'): case CHR('F'): d = 15; break; default: v->now--; /* oops, not a digit at all */ d = -1; break; } if (d >= base) { /* not a plausible digit */ v->now--; d = -1; } if (d < 0) { break; /* NOTE BREAK OUT */ } n = n*ub + (uchr)d; } if (len < minlen) { ERR(REG_EESCAPE); } return n; } /* - brenext - get next BRE token * This is much like EREs except for all the stupid backslashes and the * context-dependency of some things. ^ static int brenext(struct vars *, pchr); */ static int /* 1 normal, 0 failure */ brenext( struct vars *v, pchr pc) { chr c = (chr)pc; switch (c) { case CHR('*'): if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) { RETV(PLAIN, c); } RET('*'); break; case CHR('['): if (HAVE(6) && *(v->now+0) == CHR('[') && *(v->now+1) == CHR(':') && (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && *(v->now+3) == CHR(':') && *(v->now+4) == CHR(']') && *(v->now+5) == CHR(']')) { c = *(v->now+2); v->now += 6; NOTE(REG_UNONPOSIX); RET((c == CHR('<')) ? '<' : '>'); } INTOCON(L_BRACK); if (NEXT1('^')) { v->now++; RETV('[', 0); } RETV('[', 1); break; case CHR('.'): RET('.'); break; case CHR('^'): if (LASTTYPE(EMPTY)) { RET('^'); } if (LASTTYPE('(')) { NOTE(REG_UUNSPEC); RET('^'); } RETV(PLAIN, c); break; case CHR('$'): if (v->cflags®_EXPANDED) { skip(v); } if (ATEOS()) { RET('$'); } if (NEXT2('\\', ')')) { NOTE(REG_UUNSPEC); RET('$'); } RETV(PLAIN, c); break; case CHR('\\'): break; /* see below */ default: RETV(PLAIN, c); break; } assert(c == CHR('\\')); if (ATEOS()) { FAILW(REG_EESCAPE); } c = *v->now++; switch (c) { case CHR('{'): INTOCON(L_BBND); NOTE(REG_UBOUNDS); RET('{'); break; case CHR('('): RETV('(', 1); break; case CHR(')'): RETV(')', c); break; case CHR('<'): NOTE(REG_UNONPOSIX); RET('<'); break; case CHR('>'): NOTE(REG_UNONPOSIX); RET('>'); break; case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): case CHR('9'): NOTE(REG_UBACKREF); RETV(BACKREF, (chr)DIGITVAL(c)); break; default: if (iscalnum(c)) { NOTE(REG_UBSALNUM); NOTE(REG_UUNSPEC); } RETV(PLAIN, c); break; } assert(NOTREACHED); } /* - skip - skip white space and comments in expanded form ^ static void skip(struct vars *); */ static void skip( struct vars *v) { const chr *start = v->now; assert(v->cflags®_EXPANDED); for (;;) { while (!ATEOS() && iscspace(*v->now)) { v->now++; } if (ATEOS() || *v->now != CHR('#')) { break; /* NOTE BREAK OUT */ } assert(NEXT1('#')); while (!ATEOS() && *v->now != CHR('\n')) { v->now++; } /* * Leave the newline to be picked up by the iscspace loop. */ } if (v->now != start) { NOTE(REG_UNONPOSIX); } } /* - newline - return the chr for a newline * This helps confine use of CHR to this source file. ^ static chr newline(void); */ static chr newline(void) { return CHR('\n'); } /* - chrnamed - return the chr known by a given (chr string) name * The code is a bit clumsy, but this routine gets only such specialized * use that it hardly matters. ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr); */ static chr chrnamed( struct vars *v, const chr *startp, /* start of name */ const chr *endp, /* just past end of name */ pchr lastresort) /* what to return if name lookup fails */ { celt c; int errsave; int e; struct cvec *cv; errsave = v->err; v->err = 0; c = element(v, startp, endp); e = v->err; v->err = errsave; if (e != 0) { return (chr)lastresort; } cv = range(v, c, c, 0); if (cv->nchrs == 0) { return (chr)lastresort; } return cv->chrs[0]; } /* * Local Variables: * mode: c * c-basic-offset: 4 * fill-column: 78 * End: */