summaryrefslogtreecommitdiffstats
path: root/generic/regc_lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/regc_lex.c')
-rw-r--r--generic/regc_lex.c128
1 files changed, 69 insertions, 59 deletions
diff --git a/generic/regc_lex.c b/generic/regc_lex.c
index cc02e9d..4c8f15f 100644
--- a/generic/regc_lex.c
+++ b/generic/regc_lex.c
@@ -63,7 +63,7 @@
/*
- lexstart - set up lexical stuff, scan leading options
- ^ static VOID lexstart(struct vars *);
+ ^ static void lexstart(struct vars *);
*/
static void
lexstart(
@@ -89,7 +89,7 @@ lexstart(
/*
- prefixes - implement various special prefixes
- ^ static VOID prefixes(struct vars *);
+ ^ static void prefixes(struct vars *);
*/
static void
prefixes(
@@ -207,13 +207,13 @@ prefixes(
- lexnest - "call a subroutine", interpolating string at the lexical level
* Note, this is not a very general facility. There are a number of
* implicit assumptions about what sorts of strings can be subroutines.
- ^ static VOID lexnest(struct vars *, chr *, chr *);
+ ^ static void lexnest(struct vars *, const chr *, const chr *);
*/
static void
lexnest(
struct vars *v,
- chr *beginp, /* start of interpolation */
- chr *endp) /* one past end of interpolation */
+ const chr *beginp, /* start of interpolation */
+ const chr *endp) /* one past end of interpolation */
{
assert(v->savenow == NULL); /* only one level of nesting */
v->savenow = v->now;
@@ -226,56 +226,69 @@ lexnest(
* string constants to interpolate as expansions of things like \d
*/
-static chr backd[] = { /* \d */
+static const chr backd[] = { /* \d */
CHR('['), CHR('['), CHR(':'),
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
CHR(':'), CHR(']'), CHR(']')
};
-static chr backD[] = { /* \D */
+static const chr backD[] = { /* \D */
CHR('['), CHR('^'), CHR('['), CHR(':'),
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
CHR(':'), CHR(']'), CHR(']')
};
-static chr brbackd[] = { /* \d within brackets */
+static const chr brbackd[] = { /* \d within brackets */
CHR('['), CHR(':'),
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
CHR(':'), CHR(']')
};
-static chr backs[] = { /* \s */
+static const chr backs[] = { /* \s */
CHR('['), CHR('['), CHR(':'),
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
CHR(':'), CHR(']'), CHR(']')
};
-static chr backS[] = { /* \S */
+static const chr backS[] = { /* \S */
CHR('['), CHR('^'), CHR('['), CHR(':'),
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
CHR(':'), CHR(']'), CHR(']')
};
-static chr brbacks[] = { /* \s within brackets */
+static const chr brbacks[] = { /* \s within brackets */
CHR('['), CHR(':'),
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
CHR(':'), CHR(']')
};
-static chr backw[] = { /* \w */
+
+#define PUNCT_CONN \
+ CHR('_'), \
+ 0x203f /* UNDERTIE */, \
+ 0x2040 /* CHARACTER TIE */,\
+ 0x2054 /* INVERTED UNDERTIE */,\
+ 0xfe33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */, \
+ 0xfe34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */, \
+ 0xfe4d /* DASHED LOW LINE */, \
+ 0xfe4e /* CENTRELINE LOW LINE */, \
+ 0xfe4f /* WAVY LOW LINE */, \
+ 0xff3f /* FULLWIDTH LOW LINE */
+
+static const chr backw[] = { /* \w */
CHR('['), CHR('['), CHR(':'),
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
- CHR(':'), CHR(']'), CHR('_'), CHR(']')
+ CHR(':'), CHR(']'), PUNCT_CONN, CHR(']')
};
-static chr backW[] = { /* \W */
+static const chr backW[] = { /* \W */
CHR('['), CHR('^'), CHR('['), CHR(':'),
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
- CHR(':'), CHR(']'), CHR('_'), CHR(']')
+ CHR(':'), CHR(']'), PUNCT_CONN, CHR(']')
};
-static chr brbackw[] = { /* \w within brackets */
+static const chr brbackw[] = { /* \w within brackets */
CHR('['), CHR(':'),
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
- CHR(':'), CHR(']'), CHR('_')
+ CHR(':'), CHR(']'), PUNCT_CONN
};
/*
- lexword - interpolate a bracket expression for word characters
* Possibly ought to inquire whether there is a "word" character class.
- ^ static VOID lexword(struct vars *);
+ ^ static void lexword(struct vars *);
*/
static void
lexword(
@@ -444,7 +457,7 @@ next(
if (ATEOS()) {
FAILW(REG_EESCAPE);
}
- (DISCARD)lexescape(v);
+ (void)lexescape(v);
switch (v->nexttype) { /* not all escapes okay here */
case PLAIN:
return 1;
@@ -703,7 +716,7 @@ next(
}
RETV(PLAIN, *v->now++);
}
- (DISCARD)lexescape(v);
+ (void)lexescape(v);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
@@ -742,13 +755,14 @@ lexescape(
struct vars *v)
{
chr c;
- static chr alert[] = {
+ int i;
+ static const chr alert[] = {
CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
};
- static chr esc[] = {
+ static const chr esc[] = {
CHR('E'), CHR('S'), CHR('C')
};
- chr *save;
+ const chr *save;
assert(v->cflags&REG_ADVF);
@@ -818,18 +832,23 @@ lexescape(
RETV(PLAIN, CHR('\t'));
break;
case CHR('u'):
- c = lexdigits(v, 16, 4, 4);
+ c = (uchr) lexdigits(v, 16, 1, 4);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
RETV(PLAIN, c);
break;
case CHR('U'):
- c = lexdigits(v, 16, 8, 8);
+ i = lexdigits(v, 16, 1, 8);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
- RETV(PLAIN, c);
+ if (i > 0xFFFF) {
+ /* TODO: output a Surrogate pair
+ */
+ i = 0xFFFD;
+ }
+ RETV(PLAIN, (uchr) i);
break;
case CHR('v'):
RETV(PLAIN, CHR('\v'));
@@ -844,7 +863,7 @@ lexescape(
break;
case CHR('x'):
NOTE(REG_UUNPORT);
- c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
+ c = (uchr) lexdigits(v, 16, 1, 2);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
@@ -866,7 +885,7 @@ lexescape(
case CHR('9'):
save = v->now;
v->now--; /* put first digit back */
- c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
+ c = (uchr) lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
if (ISERR()) {
FAILW(REG_EESCAPE);
}
@@ -875,7 +894,7 @@ lexescape(
* Ugly heuristic (first test is "exactly 1 digit?")
*/
- if (v->now - save == 0 || (int)c <= v->nsubexp) {
+ if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) {
NOTE(REG_UBACKREF);
RETV(BACKREF, (chr)c);
}
@@ -893,10 +912,15 @@ lexescape(
case CHR('0'):
NOTE(REG_UUNPORT);
v->now--; /* put first digit back */
- c = lexdigits(v, 8, 1, 3);
+ c = (uchr) lexdigits(v, 8, 1, 3);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
+ if (c > 0xff) {
+ /* out of range, so we handled one digit too much */
+ v->now--;
+ c >>= 3;
+ }
RETV(PLAIN, c);
break;
default:
@@ -909,23 +933,27 @@ lexescape(
/*
- lexdigits - slurp up digits and return chr value
- ^ static chr lexdigits(struct vars *, int, int, int);
+ ^ static int lexdigits(struct vars *, int, int, int);
*/
-static chr /* chr value; errors signalled via ERR */
+static int /* chr value; errors signalled via ERR */
lexdigits(
struct vars *v,
int base,
int minlen,
int maxlen)
{
- uchr n; /* unsigned to avoid overflow misbehavior */
+ int n;
int len;
chr c;
int d;
- CONST uchr ub = (uchr) base;
+ const uchr ub = (uchr) base;
n = 0;
for (len = 0; len < maxlen && !ATEOS(); len++) {
+ if (n > 0x10fff) {
+ /* Stop when continuing would otherwise overflow */
+ break;
+ }
c = *v->now++;
switch (c) {
case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
@@ -958,7 +986,7 @@ lexdigits(
ERR(REG_EESCAPE);
}
- return (chr)n;
+ return n;
}
/*
@@ -1080,13 +1108,13 @@ brenext(
/*
- skip - skip white space and comments in expanded form
- ^ static VOID skip(struct vars *);
+ ^ static void skip(struct vars *);
*/
static void
skip(
struct vars *v)
{
- chr *start = v->now;
+ const chr *start = v->now;
assert(v->cflags&REG_EXPANDED);
@@ -1115,7 +1143,7 @@ skip(
/*
- newline - return the chr for a newline
* This helps confine use of CHR to this source file.
- ^ static chr newline(NOPARMS);
+ ^ static chr newline(void);
*/
static chr
newline(void)
@@ -1124,34 +1152,16 @@ newline(void)
}
/*
- - ch - return the chr sequence for regc_locale.c's fake collating element ch
- * This helps confine use of CHR to this source file. Beware that the caller
- * knows how long the sequence is.
- ^ #ifdef REG_DEBUG
- ^ static chr *ch(NOPARMS);
- ^ #endif
- */
-#ifdef REG_DEBUG
-static chr *
-ch(void)
-{
- static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') };
-
- return chstr;
-}
-#endif
-
-/*
- chrnamed - return the chr known by a given (chr string) name
* The code is a bit clumsy, but this routine gets only such specialized
* use that it hardly matters.
- ^ static chr chrnamed(struct vars *, chr *, chr *, pchr);
+ ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
*/
static chr
chrnamed(
struct vars *v,
- chr *startp, /* start of name */
- chr *endp, /* just past end of name */
+ const chr *startp, /* start of name */
+ const chr *endp, /* just past end of name */
pchr lastresort) /* what to return if name lookup fails */
{
celt c;