1 files changed, 69 insertions, 59 deletions
diff --git a/generic/regc_lex.c b/generic/regc_lex.c
index cc02e9d..4c8f15f 100644
--- a/generic/regc_lex.c
+++ b/generic/regc_lex.c
@@ -63,7 +63,7 @@
 
 /*
  - lexstart - set up lexical stuff, scan leading options
- ^ static VOID lexstart(struct vars *);
+ ^ static void lexstart(struct vars *);
  */
 static void
 lexstart(
@@ -89,7 +89,7 @@ lexstart(
 
 /*
  - prefixes - implement various special prefixes
- ^ static VOID prefixes(struct vars *);
+ ^ static void prefixes(struct vars *);
  */
 static void
 prefixes(
@@ -207,13 +207,13 @@ prefixes(
  - lexnest - "call a subroutine", interpolating string at the lexical level
  * Note, this is not a very general facility.  There are a number of
  * implicit assumptions about what sorts of strings can be subroutines.
- ^ static VOID lexnest(struct vars *, chr *, chr *);
+ ^ static void lexnest(struct vars *, const chr *, const chr *);
  */
 static void
 lexnest(
     struct vars *v,
-    chr *beginp,		/* start of interpolation */
-    chr *endp)			/* one past end of interpolation */
+    const chr *beginp,		/* start of interpolation */
+    const chr *endp)		/* one past end of interpolation */
 {
     assert(v->savenow == NULL);	/* only one level of nesting */
     v->savenow = v->now;
@@ -226,56 +226,69 @@ lexnest(
  * string constants to interpolate as expansions of things like \d
  */
 
-static chr backd[] = {		/* \d */
+static const chr backd[] = {	/* \d */
     CHR('['), CHR('['), CHR(':'),
     CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
     CHR(':'), CHR(']'), CHR(']')
 };
-static chr backD[] = {		/* \D */
+static const chr backD[] = {	/* \D */
     CHR('['), CHR('^'), CHR('['), CHR(':'),
     CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
     CHR(':'), CHR(']'), CHR(']')
 };
-static chr brbackd[] = {	/* \d within brackets */
+static const chr brbackd[] = {	/* \d within brackets */
     CHR('['), CHR(':'),
     CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
     CHR(':'), CHR(']')
 };
-static chr backs[] = {		/* \s */
+static const chr backs[] = {	/* \s */
     CHR('['), CHR('['), CHR(':'),
     CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
     CHR(':'), CHR(']'), CHR(']')
 };
-static chr backS[] = {		/* \S */
+static const chr backS[] = {	/* \S */
     CHR('['), CHR('^'), CHR('['), CHR(':'),
     CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
     CHR(':'), CHR(']'), CHR(']')
 };
-static chr brbacks[] = {	/* \s within brackets */
+static const chr brbacks[] = {	/* \s within brackets */
     CHR('['), CHR(':'),
     CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
     CHR(':'), CHR(']')
 };
-static chr backw[] = {		/* \w */
+
+#define PUNCT_CONN \
+	CHR('_'), \
+	0x203f /* UNDERTIE */, \
+	0x2040 /* CHARACTER TIE */,\
+	0x2054 /* INVERTED UNDERTIE */,\
+	0xfe33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */, \
+	0xfe34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */, \
+	0xfe4d /* DASHED LOW LINE */, \
+	0xfe4e /* CENTRELINE LOW LINE */, \
+	0xfe4f /* WAVY LOW LINE */, \
+	0xff3f /* FULLWIDTH LOW LINE */
+
+static const chr backw[] = {	/* \w */
     CHR('['), CHR('['), CHR(':'),
     CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
-    CHR(':'), CHR(']'), CHR('_'), CHR(']')
+    CHR(':'), CHR(']'), PUNCT_CONN, CHR(']')
 };
-static chr backW[] = {		/* \W */
+static const chr backW[] = {	/* \W */
     CHR('['), CHR('^'), CHR('['), CHR(':'),
     CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
-    CHR(':'), CHR(']'), CHR('_'), CHR(']')
+    CHR(':'), CHR(']'), PUNCT_CONN, CHR(']')
 };
-static chr brbackw[] = {	/* \w within brackets */
+static const chr brbackw[] = {	/* \w within brackets */
     CHR('['), CHR(':'),
     CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
-    CHR(':'), CHR(']'), CHR('_')
+    CHR(':'), CHR(']'), PUNCT_CONN
 };
 
 /*
  - lexword - interpolate a bracket expression for word characters
  * Possibly ought to inquire whether there is a "word" character class.
- ^ static VOID lexword(struct vars *);
+ ^ static void lexword(struct vars *);
  */
 static void
 lexword(
@@ -444,7 +457,7 @@ next(
 	    if (ATEOS()) {
 		FAILW(REG_EESCAPE);
 	    }
-	    (DISCARD)lexescape(v);
+	    (void)lexescape(v);
 	    switch (v->nexttype) {	/* not all escapes okay here */
 	    case PLAIN:
 		return 1;
@@ -703,7 +716,7 @@ next(
 	}
 	RETV(PLAIN, *v->now++);
     }
-    (DISCARD)lexescape(v);
+    (void)lexescape(v);
     if (ISERR()) {
 	FAILW(REG_EESCAPE);
     }
@@ -742,13 +755,14 @@ lexescape(
     struct vars *v)
 {
     chr c;
-    static chr alert[] = {
+    int i;
+    static const chr alert[] = {
 	CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
     };
-    static chr esc[] = {
+    static const chr esc[] = {
 	CHR('E'), CHR('S'), CHR('C')
     };
-    chr *save;
+    const chr *save;
 
     assert(v->cflags&REG_ADVF);
 
@@ -818,18 +832,23 @@ lexescape(
 	RETV(PLAIN, CHR('\t'));
 	break;
     case CHR('u'):
-	c = lexdigits(v, 16, 4, 4);
+	c = (uchr) lexdigits(v, 16, 1, 4);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
 	RETV(PLAIN, c);
 	break;
     case CHR('U'):
-	c = lexdigits(v, 16, 8, 8);
+	i = lexdigits(v, 16, 1, 8);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
-	RETV(PLAIN, c);
+	if (i > 0xFFFF) {
+	    /* TODO: output a surrogate pair; until then, substitute U+FFFD
+	     */
+	    i = 0xFFFD;
+	}
+	RETV(PLAIN, (uchr) i);
 	break;
     case CHR('v'):
 	RETV(PLAIN, CHR('\v'));
@@ -844,7 +863,7 @@ lexescape(
 	break;
     case CHR('x'):
 	NOTE(REG_UUNPORT);
-	c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
+	c = (uchr) lexdigits(v, 16, 1, 2);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
@@ -866,7 +885,7 @@ lexescape(
     case CHR('9'):
 	save = v->now;
 	v->now--;		/* put first digit back */
-	c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
+	c = (uchr) lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
@@ -875,7 +894,7 @@ lexescape(
 	 * Ugly heuristic (first test is "exactly 1 digit?")
 	 */
 
-	if (v->now - save == 0 || (int)c <= v->nsubexp) {
+	if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) {
 	    NOTE(REG_UBACKREF);
 	    RETV(BACKREF, (chr)c);
 	}
@@ -893,10 +912,15 @@ lexescape(
     case CHR('0'):
 	NOTE(REG_UUNPORT);
 	v->now--;		/* put first digit back */
-	c = lexdigits(v, 8, 1, 3);
+	c = (uchr) lexdigits(v, 8, 1, 3);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
+	if (c > 0xff) {
+	    /* out of range, so we consumed one digit too many */
+	    v->now--;
+	    c >>= 3;
+	}
 	RETV(PLAIN, c);
 	break;
     default:
@@ -909,23 +933,27 @@ lexescape(
 
 /*
  - lexdigits - slurp up digits and return chr value
- ^ static chr lexdigits(struct vars *, int, int, int);
+ ^ static int lexdigits(struct vars *, int, int, int);
  */
-static chr			/* chr value; errors signalled via ERR */
+static int			/* chr value; errors signalled via ERR */
 lexdigits(
     struct vars *v,
     int base,
     int minlen,
     int maxlen)
 {
-    uchr n;			/* unsigned to avoid overflow misbehavior */
+    int n;
     int len;
     chr c;
     int d;
-    CONST uchr ub = (uchr) base;
+    const uchr ub = (uchr) base;
 
     n = 0;
     for (len = 0; len < maxlen && !ATEOS(); len++) {
+	if (n > 0x10fff) {
+	    /* Stop when continuing would otherwise overflow */
+	    break;
+	}
 	c = *v->now++;
 	switch (c) {
 	case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
@@ -958,7 +986,7 @@ lexdigits(
 	ERR(REG_EESCAPE);
     }
 
-    return (chr)n;
+    return n;
 }
 
 /*
@@ -1080,13 +1108,13 @@ brenext(
 
 /*
  - skip - skip white space and comments in expanded form
- ^ static VOID skip(struct vars *);
+ ^ static void skip(struct vars *);
  */
 static void
 skip(
     struct vars *v)
 {
-    chr *start = v->now;
+    const chr *start = v->now;
 
     assert(v->cflags&REG_EXPANDED);
 
@@ -1115,7 +1143,7 @@ skip(
 /*
  - newline - return the chr for a newline
  * This helps confine use of CHR to this source file.
- ^ static chr newline(NOPARMS);
+ ^ static chr newline(void);
  */
 static chr
 newline(void)
@@ -1124,34 +1152,16 @@ newline(void)
 }
 
 /*
- - ch - return the chr sequence for regc_locale.c's fake collating element ch
- * This helps confine use of CHR to this source file.  Beware that the caller
- * knows how long the sequence is.
- ^ #ifdef REG_DEBUG
- ^ static chr *ch(NOPARMS);
- ^ #endif
- */
-#ifdef REG_DEBUG
-static chr *
-ch(void)
-{
-    static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') };
-
-    return chstr;
-}
-#endif
-
-/*
  - chrnamed - return the chr known by a given (chr string) name
  * The code is a bit clumsy, but this routine gets only such specialized
  * use that it hardly matters.
- ^ static chr chrnamed(struct vars *, chr *, chr *, pchr);
+ ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
  */
 static chr
 chrnamed(
     struct vars *v,
-    chr *startp,		/* start of name */
-    chr *endp,			/* just past end of name */
+    const chr *startp,		/* start of name */
+    const chr *endp,		/* just past end of name */
     pchr lastresort)		/* what to return if name lookup fails */
 {
     celt c;