diff options
Diffstat (limited to 'generic/regc_lex.c')
| -rw-r--r-- | generic/regc_lex.c | 128 |
1 files changed, 69 insertions, 59 deletions
diff --git a/generic/regc_lex.c b/generic/regc_lex.c index cc02e9d..4c8f15f 100644 --- a/generic/regc_lex.c +++ b/generic/regc_lex.c @@ -63,7 +63,7 @@ /* - lexstart - set up lexical stuff, scan leading options - ^ static VOID lexstart(struct vars *); + ^ static void lexstart(struct vars *); */ static void lexstart( @@ -89,7 +89,7 @@ lexstart( /* - prefixes - implement various special prefixes - ^ static VOID prefixes(struct vars *); + ^ static void prefixes(struct vars *); */ static void prefixes( @@ -207,13 +207,13 @@ prefixes( - lexnest - "call a subroutine", interpolating string at the lexical level * Note, this is not a very general facility. There are a number of * implicit assumptions about what sorts of strings can be subroutines. - ^ static VOID lexnest(struct vars *, chr *, chr *); + ^ static void lexnest(struct vars *, const chr *, const chr *); */ static void lexnest( struct vars *v, - chr *beginp, /* start of interpolation */ - chr *endp) /* one past end of interpolation */ + const chr *beginp, /* start of interpolation */ + const chr *endp) /* one past end of interpolation */ { assert(v->savenow == NULL); /* only one level of nesting */ v->savenow = v->now; @@ -226,56 +226,69 @@ lexnest( * string constants to interpolate as expansions of things like \d */ -static chr backd[] = { /* \d */ +static const chr backd[] = { /* \d */ CHR('['), CHR('['), CHR(':'), CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), CHR(':'), CHR(']'), CHR(']') }; -static chr backD[] = { /* \D */ +static const chr backD[] = { /* \D */ CHR('['), CHR('^'), CHR('['), CHR(':'), CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), CHR(':'), CHR(']'), CHR(']') }; -static chr brbackd[] = { /* \d within brackets */ +static const chr brbackd[] = { /* \d within brackets */ CHR('['), CHR(':'), CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), CHR(':'), CHR(']') }; -static chr backs[] = { /* \s */ +static const chr backs[] = { /* \s */ CHR('['), CHR('['), CHR(':'), CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), CHR(':'), CHR(']'), CHR(']') }; -static chr backS[] = { /* \S */ +static const chr backS[] = { /* \S */ CHR('['), CHR('^'), CHR('['), CHR(':'), CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), CHR(':'), CHR(']'), CHR(']') }; -static chr brbacks[] = { /* \s within brackets */ +static const chr brbacks[] = { /* \s within brackets */ CHR('['), CHR(':'), CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), CHR(':'), CHR(']') }; -static chr backw[] = { /* \w */ + +#define PUNCT_CONN \ + CHR('_'), \ + 0x203f /* UNDERTIE */, \ + 0x2040 /* CHARACTER TIE */,\ + 0x2054 /* INVERTED UNDERTIE */,\ + 0xfe33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */, \ + 0xfe34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */, \ + 0xfe4d /* DASHED LOW LINE */, \ + 0xfe4e /* CENTRELINE LOW LINE */, \ + 0xfe4f /* WAVY LOW LINE */, \ + 0xff3f /* FULLWIDTH LOW LINE */ + +static const chr backw[] = { /* \w */ CHR('['), CHR('['), CHR(':'), CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') + CHR(':'), CHR(']'), PUNCT_CONN, CHR(']') }; -static chr backW[] = { /* \W */ +static const chr backW[] = { /* \W */ CHR('['), CHR('^'), CHR('['), CHR(':'), CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') + CHR(':'), CHR(']'), PUNCT_CONN, CHR(']') }; -static chr brbackw[] = { /* \w within brackets */ +static const chr brbackw[] = { /* \w within brackets */ CHR('['), CHR(':'), CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_') + CHR(':'), CHR(']'), PUNCT_CONN }; /* - lexword - interpolate a bracket expression for word characters * Possibly ought to inquire whether there is a "word" character class. - ^ static VOID lexword(struct vars *); + ^ static void lexword(struct vars *); */ static void lexword( @@ -444,7 +457,7 @@ next( if (ATEOS()) { FAILW(REG_EESCAPE); } - (DISCARD)lexescape(v); + (void)lexescape(v); switch (v->nexttype) { /* not all escapes okay here */ case PLAIN: return 1; @@ -703,7 +716,7 @@ next( } RETV(PLAIN, *v->now++); } - (DISCARD)lexescape(v); + (void)lexescape(v); if (ISERR()) { FAILW(REG_EESCAPE); } @@ -742,13 +755,14 @@ lexescape( struct vars *v) { chr c; - static chr alert[] = { + int i; + static const chr alert[] = { CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') }; - static chr esc[] = { + static const chr esc[] = { CHR('E'), CHR('S'), CHR('C') }; - chr *save; + const chr *save; assert(v->cflags®_ADVF); @@ -818,18 +832,23 @@ lexescape( RETV(PLAIN, CHR('\t')); break; case CHR('u'): - c = lexdigits(v, 16, 4, 4); + c = (uchr) lexdigits(v, 16, 1, 4); if (ISERR()) { FAILW(REG_EESCAPE); } RETV(PLAIN, c); break; case CHR('U'): - c = lexdigits(v, 16, 8, 8); + i = lexdigits(v, 16, 1, 8); if (ISERR()) { FAILW(REG_EESCAPE); } - RETV(PLAIN, c); + if (i > 0xFFFF) { + /* TODO: output a Surrogate pair + */ + i = 0xFFFD; + } + RETV(PLAIN, (uchr) i); break; case CHR('v'): RETV(PLAIN, CHR('\v')); @@ -844,7 +863,7 @@ lexescape( break; case CHR('x'): NOTE(REG_UUNPORT); - c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ + c = (uchr) lexdigits(v, 16, 1, 2); if (ISERR()) { FAILW(REG_EESCAPE); } @@ -866,7 +885,7 @@ lexescape( case CHR('9'): save = v->now; v->now--; /* put first digit back */ - c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ + c = (uchr) lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ if (ISERR()) { FAILW(REG_EESCAPE); } @@ -875,7 +894,7 @@ lexescape( * Ugly heuristic (first test is "exactly 1 digit?") */ - if (v->now - save == 0 || (int)c <= v->nsubexp) { + if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) { NOTE(REG_UBACKREF); RETV(BACKREF, (chr)c); } @@ -893,10 +912,15 @@ lexescape( case CHR('0'): NOTE(REG_UUNPORT); v->now--; /* put first digit back */ - c = lexdigits(v, 8, 1, 3); + c = (uchr) lexdigits(v, 8, 1, 3); if (ISERR()) { FAILW(REG_EESCAPE); } + if (c > 0xff) { + /* out of range, so we handled one digit too much */ + v->now--; + c >>= 3; + } RETV(PLAIN, c); break; default: @@ -909,23 +933,27 @@ lexescape( /* - lexdigits - slurp up digits and return chr value - ^ static chr lexdigits(struct vars *, int, int, int); + ^ static int lexdigits(struct vars *, int, int, int); */ -static chr /* chr value; errors signalled via ERR */ +static int /* chr value; errors signalled via ERR */ lexdigits( struct vars *v, int base, int minlen, int maxlen) { - uchr n; /* unsigned to avoid overflow misbehavior */ + int n; int len; chr c; int d; - CONST uchr ub = (uchr) base; + const uchr ub = (uchr) base; n = 0; for (len = 0; len < maxlen && !ATEOS(); len++) { + if (n > 0x10fff) { + /* Stop when continuing would otherwise overflow */ + break; + } c = *v->now++; switch (c) { case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): @@ -958,7 +986,7 @@ lexdigits( ERR(REG_EESCAPE); } - return (chr)n; + return n; } /* @@ -1080,13 +1108,13 @@ brenext( /* - skip - skip white space and comments in expanded form - ^ static VOID skip(struct vars *); + ^ static void skip(struct vars *); */ static void skip( struct vars *v) { - chr *start = v->now; + const chr *start = v->now; assert(v->cflags®_EXPANDED); @@ -1115,7 +1143,7 @@ skip( /* - newline - return the chr for a newline * This helps confine use of CHR to this source file. - ^ static chr newline(NOPARMS); + ^ static chr newline(void); */ static chr newline(void) @@ -1124,34 +1152,16 @@ newline(void) } /* - - ch - return the chr sequence for regc_locale.c's fake collating element ch - * This helps confine use of CHR to this source file. Beware that the caller - * knows how long the sequence is. - ^ #ifdef REG_DEBUG - ^ static chr *ch(NOPARMS); - ^ #endif - */ -#ifdef REG_DEBUG -static chr * -ch(void) -{ - static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') }; - - return chstr; -} -#endif - -/* - chrnamed - return the chr known by a given (chr string) name * The code is a bit clumsy, but this routine gets only such specialized * use that it hardly matters. - ^ static chr chrnamed(struct vars *, chr *, chr *, pchr); + ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr); */ static chr chrnamed( struct vars *v, - chr *startp, /* start of name */ - chr *endp, /* just past end of name */ + const chr *startp, /* start of name */ + const chr *endp, /* just past end of name */ pchr lastresort) /* what to return if name lookup fails */ { celt c; |
