1 files changed, 31 insertions, 16 deletions
diff --git a/generic/regc_lex.c b/generic/regc_lex.c
index 8d07c59..132e757 100644
--- a/generic/regc_lex.c
+++ b/generic/regc_lex.c
@@ -63,7 +63,7 @@
 
 /*
  - lexstart - set up lexical stuff, scan leading options
- ^ static VOID lexstart(struct vars *);
+ ^ static void lexstart(struct vars *);
  */
 static void
 lexstart(
@@ -89,7 +89,7 @@ lexstart(
 
 /*
  - prefixes - implement various special prefixes
- ^ static VOID prefixes(struct vars *);
+ ^ static void prefixes(struct vars *);
  */
 static void
 prefixes(
@@ -207,7 +207,7 @@ prefixes(
  - lexnest - "call a subroutine", interpolating string at the lexical level
  * Note, this is not a very general facility.  There are a number of
  * implicit assumptions about what sorts of strings can be subroutines.
- ^ static VOID lexnest(struct vars *, const chr *, const chr *);
+ ^ static void lexnest(struct vars *, const chr *, const chr *);
  */
 static void
 lexnest(
@@ -275,7 +275,7 @@ static const chr brbackw[] = {	/* \w within brackets */
 /*
  - lexword - interpolate a bracket expression for word characters
  * Possibly ought to inquire whether there is a "word" character class.
- ^ static VOID lexword(struct vars *);
+ ^ static void lexword(struct vars *);
  */
 static void
 lexword(
@@ -742,6 +742,7 @@ lexescape(
     struct vars *v)
 {
     chr c;
+    int i;
     static const chr alert[] = {
 	CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
     };
@@ -818,18 +819,23 @@ lexescape(
 	RETV(PLAIN, CHR('\t'));
 	break;
     case CHR('u'):
-	c = lexdigits(v, 16, 4, 4);
+	c = (uchr) lexdigits(v, 16, 1, 4);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
 	RETV(PLAIN, c);
 	break;
     case CHR('U'):
-	c = lexdigits(v, 16, 8, 8);
+	i = lexdigits(v, 16, 1, 8);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
-	RETV(PLAIN, c);
+	if (i > 0xFFFF) {
+	    /* TODO: output a Surrogate pair
+	     */
+	    i = 0xFFFD;
+	}
+	RETV(PLAIN, (uchr) i);
 	break;
     case CHR('v'):
 	RETV(PLAIN, CHR('\v'));
@@ -844,7 +850,7 @@ lexescape(
 	break;
     case CHR('x'):
 	NOTE(REG_UUNPORT);
-	c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
+	c = (uchr) lexdigits(v, 16, 1, 2);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
@@ -866,7 +872,7 @@ lexescape(
     case CHR('9'):
 	save = v->now;
 	v->now--;		/* put first digit back */
-	c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
+	c = (uchr) lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
@@ -893,10 +899,15 @@ lexescape(
     case CHR('0'):
 	NOTE(REG_UUNPORT);
 	v->now--;		/* put first digit back */
-	c = lexdigits(v, 8, 1, 3);
+	c = (uchr) lexdigits(v, 8, 1, 3);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
+	if (c > 0xff) {
+	    /* out of range, so we handled one digit too much */
+	    v->now--;
+	    c >>= 3;
+	}
 	RETV(PLAIN, c);
 	break;
     default:
@@ -909,23 +920,27 @@ lexescape(
 
 /*
  - lexdigits - slurp up digits and return chr value
- ^ static chr lexdigits(struct vars *, int, int, int);
+ ^ static int lexdigits(struct vars *, int, int, int);
  */
-static chr			/* chr value; errors signalled via ERR */
+static int			/* chr value; errors signalled via ERR */
 lexdigits(
     struct vars *v,
     int base,
     int minlen,
     int maxlen)
 {
-    uchr n;			/* unsigned to avoid overflow misbehavior */
+    int n;
     int len;
     chr c;
     int d;
-    CONST uchr ub = (uchr) base;
+    const uchr ub = (uchr) base;
 
     n = 0;
     for (len = 0; len < maxlen && !ATEOS(); len++) {
+	if (n > 0x10fff) {
+	    /* Stop when continuing would otherwise overflow */
+	    break;
+	}
 	c = *v->now++;
 	switch (c) {
 	case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
@@ -958,7 +973,7 @@ lexdigits(
 	ERR(REG_EESCAPE);
     }
 
-    return (chr)n;
+    return n;
 }
 
 /*
@@ -1080,7 +1095,7 @@ brenext(
 
 /*
  - skip - skip white space and comments in expanded form
- ^ static VOID skip(struct vars *);
+ ^ static void skip(struct vars *);
  */
 static void
 skip(