From 860e6b5e43e0ac7e673218dd929d425c5d206014 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Fri, 12 Aug 2011 08:02:23 +0000
Subject: TIP 388 implementation

---
 doc/Tcl.n           | 30 ++++++++++++++++++++----------
 doc/re_syntax.n     | 29 ++++++++++++++++-------------
 generic/regc_lex.c  | 35 +++++++++++++++++++++++++----------
 generic/regcomp.c   |  2 +-
 generic/regcustom.h |  2 +-
 generic/tcl.h       | 14 +++++++-------
 generic/tclParse.c  | 15 ++++++++++++---
 tests/reg.test      | 15 ++++++++++++---
 tests/utf.test      | 14 +++++++++++++-
 9 files changed, 107 insertions(+), 49 deletions(-)

diff --git a/doc/Tcl.n b/doc/Tcl.n
index f56c82c..c14c4dc 100644
--- a/doc/Tcl.n
+++ b/doc/Tcl.n
@@ -6,7 +6,7 @@
 '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 '\"
 .so man.macros
-.TH Tcl n "8.5" Tcl "Tcl Built-In Commands"
+.TH Tcl n "8.6" Tcl "Tcl Built-In Commands"
 .BS
 .SH NAME
 Tcl \- Tool Command Language
@@ -193,23 +193,33 @@ Backslash
 .TP 7
 \e\fIooo\fR 
 .
-The digits \fIooo\fR (one, two, or three of them) give an eight-bit octal 
-value for the Unicode character that will be inserted.  The upper bits of the
-Unicode character will be 0.
+The digits \fIooo\fR (one, two, or three of them) give an eight-bit octal 
+value for the Unicode character that will be inserted, in the range \fI000\fR
+- \fI377\fR.  The parser will stop just before this range overflows, or when
+the maximum of three digits is reached.  The upper bits of the Unicode
+character will be 0.
 .TP 7
 \e\fBx\fIhh\fR 
 .
-The hexadecimal digits \fIhh\fR give an eight-bit hexadecimal value for the
-Unicode character that will be inserted.  Any number of hexadecimal digits
-may be present; however, all but the last two are ignored (the result is
-always a one-byte quantity).  The upper bits of the Unicode character will
-be 0.
+The hexadecimal digits \fIhh\fR (one or two of them) give an eight-bit
+hexadecimal value for the Unicode character that will be inserted.  The upper
+bits of the Unicode character will be 0.
 .TP 7
 \e\fBu\fIhhhh\fR 
 .
 The hexadecimal digits \fIhhhh\fR (one, two, three, or four of them) give a
 sixteen-bit hexadecimal value for the Unicode character that will be
-inserted.
+inserted.  The upper bits of the Unicode character will be 0.
+.TP 7
+\e\fBU\fIhhhhhhhh\fR 
+.
+The hexadecimal digits \fIhhhhhhhh\fR (one up to eight of them) give a
+twenty-one-bit hexadecimal value for the Unicode character that will be
+inserted, in the range U+0000..U+10FFFF.  The parser will stop just
+before this range overflows, or when the maximum of eight digits
+is reached.  The upper bits of the Unicode character will be 0.
+.PP
+The range U+010000..U+10FFFD is reserved for the future.
 .PP
 Backslash substitution is not performed on words enclosed in braces,
 except for backslash-newline as described above.
diff --git a/doc/re_syntax.n b/doc/re_syntax.n
index 8701641..a53f58b 100644
--- a/doc/re_syntax.n
+++ b/doc/re_syntax.n
@@ -359,39 +359,42 @@ horizontal tab, as in C
 .TP
 \fB\eu\fIwxyz\fR
 .
-(where \fIwxyz\fR is exactly four hexadecimal digits) the Unicode
+(where \fIwxyz\fR is one up to four hexadecimal digits) the Unicode
 character \fBU+\fIwxyz\fR in the local byte ordering
 .TP
 \fB\eU\fIstuvwxyz\fR
 .
-(where \fIstuvwxyz\fR is exactly eight hexadecimal digits) reserved
-for a somewhat-hypothetical Unicode extension to 32 bits
+(where \fIstuvwxyz\fR is one up to eight hexadecimal digits) reserved
+for a Unicode extension up to 21 bits. The digits are parsed until the
+first non-hexadecimal character is encountered, the maximum of eight
+hexadecimal digits is reached, or an overflow would occur in the maximum
+value of \fBU+\fI10ffff\fR.
 .TP
 \fB\ev\fR
 .
 vertical tab, as in C are all available.
 .TP
-\fB\ex\fIhhh\fR
+\fB\ex\fIhh\fR
 .
-(where \fIhhh\fR is any sequence of hexadecimal digits) the character
-whose hexadecimal value is \fB0x\fIhhh\fR (a single character no
-matter how many hexadecimal digits are used).
+(where \fIhh\fR is one or two hexadecimal digits) the character
+whose hexadecimal value is \fB0x\fIhh\fR.
 .TP
 \fB\e0\fR
 .
 the character whose value is \fB0\fR
 .TP
+\fB\e\fIxyz\fR
+.
+(where \fIxyz\fR is exactly three octal digits, and is not a \fIback
+reference\fR (see below)) the character whose octal value is
+\fB0\fIxyz\fR. The first digit must be in the range 0-3, otherwise
+the two-digit form is assumed.
+.TP
 \fB\e\fIxy\fR
 .
 (where \fIxy\fR is exactly two octal digits, and is not a \fIback
 reference\fR (see below)) the character whose octal value is
 \fB0\fIxy\fR
-.TP
-\fB\e\fIxyz\fR
-.
-(where \fIxyz\fR is exactly three octal digits, and is not a back
-reference (see below)) the character whose octal value is
-\fB0\fIxyz\fR
 .RE
 .PP
 Hexadecimal digits are
diff --git a/generic/regc_lex.c b/generic/regc_lex.c
index f3a46da..132e757 100644
--- a/generic/regc_lex.c
+++ b/generic/regc_lex.c
@@ -742,6 +742,7 @@ lexescape(
     struct vars *v)
 {
     chr c;
+    int i;
     static const chr alert[] = {
 	CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
     };
@@ -818,18 +819,23 @@ lexescape(
 	RETV(PLAIN, CHR('\t'));
 	break;
     case CHR('u'):
-	c = lexdigits(v, 16, 4, 4);
+	c = (uchr) lexdigits(v, 16, 1, 4);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
 	RETV(PLAIN, c);
 	break;
     case CHR('U'):
-	c = lexdigits(v, 16, 8, 8);
+	i = lexdigits(v, 16, 1, 8);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
-	RETV(PLAIN, c);
+	if (i > 0xFFFF) {
+	    /* TODO: output a Surrogate pair
+	     */
+	    i = 0xFFFD;
+	}
+	RETV(PLAIN, (uchr) i);
 	break;
     case CHR('v'):
 	RETV(PLAIN, CHR('\v'));
@@ -844,7 +850,7 @@ lexescape(
 	break;
     case CHR('x'):
 	NOTE(REG_UUNPORT);
-	c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
+	c = (uchr) lexdigits(v, 16, 1, 2);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
@@ -866,7 +872,7 @@ lexescape(
     case CHR('9'):
 	save = v->now;
 	v->now--;		/* put first digit back */
-	c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
+	c = (uchr) lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
@@ -893,10 +899,15 @@ lexescape(
     case CHR('0'):
 	NOTE(REG_UUNPORT);
 	v->now--;		/* put first digit back */
-	c = lexdigits(v, 8, 1, 3);
+	c = (uchr) lexdigits(v, 8, 1, 3);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
+	if (c > 0xff) {
+	    /* out of range, so we handled one digit too many */
+	    v->now--;
+	    c >>= 3;
+	}
 	RETV(PLAIN, c);
 	break;
     default:
@@ -909,16 +920,16 @@ lexescape(
 
 /*
  - lexdigits - slurp up digits and return chr value
- ^ static chr lexdigits(struct vars *, int, int, int);
+ ^ static int lexdigits(struct vars *, int, int, int);
  */
-static chr			/* chr value; errors signalled via ERR */
+static int			/* chr value; errors signalled via ERR */
 lexdigits(
     struct vars *v,
     int base,
     int minlen,
     int maxlen)
 {
-    uchr n;			/* unsigned to avoid overflow misbehavior */
+    int n;
     int len;
     chr c;
     int d;
@@ -926,6 +937,10 @@ lexdigits(
 
     n = 0;
     for (len = 0; len < maxlen && !ATEOS(); len++) {
+	if (n > 0x10fff) {
+	    /* Stop when continuing would otherwise overflow */
+	    break;
+	}
 	c = *v->now++;
 	switch (c) {
 	case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
@@ -958,7 +973,7 @@ lexdigits(
 	ERR(REG_EESCAPE);
     }
 
-    return (chr)n;
+    return n;
 }
 
 /*
diff --git a/generic/regcomp.c b/generic/regcomp.c
index d7ae05e..65555aa 100644
--- a/generic/regcomp.c
+++ b/generic/regcomp.c
@@ -79,7 +79,7 @@ static void lexnest(struct vars *, const chr *, const chr *);
 static void lexword(struct vars *);
 static int next(struct vars *);
 static int lexescape(struct vars *);
-static chr lexdigits(struct vars *, int, int, int);
+static int lexdigits(struct vars *, int, int, int);
 static int brenext(struct vars *, pchr);
 static void skip(struct vars *);
 static chr newline(NOPARMS);
diff --git a/generic/regcustom.h b/generic/regcustom.h
index bc8c28c..1c970ea 100644
--- a/generic/regcustom.h
+++ b/generic/regcustom.h
@@ -97,7 +97,7 @@ typedef int celt;		/* Type to hold chr, or NOCELT */
 #define	NOCELT (-1)		/* Celt value which is not valid chr */
 #define	CHR(c) (UCHAR(c))	/* Turn char literal into chr literal */
 #define	DIGITVAL(c) ((c)-'0')	/* Turn chr digit into its value */
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
 #define	CHRBITS	32		/* Bits in a chr; must not use sizeof */
 #define	CHR_MIN	0x00000000	/* Smallest and largest chr; the value */
 #define	CHR_MAX	0xffffffff	/* CHR_MAX-CHR_MIN+1 should fit in uchr */
diff --git a/generic/tcl.h b/generic/tcl.h
index 54bfedc..7370516 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -2153,12 +2153,12 @@ typedef struct Tcl_EncodingType {
 
 /*
  * The maximum number of bytes that are necessary to represent a single
- * Unicode character in UTF-8. The valid values should be 3 or 6 (or perhaps 1
- * if we want to support a non-unicode enabled core). If 3, then Tcl_UniChar
- * must be 2-bytes in size (UCS-2) (the default). If 6, then Tcl_UniChar must
- * be 4-bytes in size (UCS-4). At this time UCS-2 mode is the default and
- * recommended mode. UCS-4 is experimental and not recommended. It works for
- * the core, but most extensions expect UCS-2.
+ * Unicode character in UTF-8. The valid values should be 3, 4 or 6
+ * (or perhaps 1 if we want to support a non-unicode enabled core). If 3 or
+ * 4, then Tcl_UniChar must be 2-bytes in size (UCS-2) (the default). If 6,
+ * then Tcl_UniChar must be 4-bytes in size (UCS-4). At this time UCS-2 mode
+ * is the default and recommended mode. UCS-4 is experimental and not
+ * recommended. It works for the core, but most extensions expect UCS-2.
  */
 
 #ifndef TCL_UTF_MAX
@@ -2170,7 +2170,7 @@ typedef struct Tcl_EncodingType {
  * reflected in regcustom.h.
  */
 
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
     /*
      * unsigned int isn't 100% accurate as it should be a strict 4-byte value
      * (perhaps wchar_t). 64-bit systems may have troubles. The size of this
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 2b0dab4..3c984bf 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -754,7 +754,7 @@ TclParseHex(
     while (numBytes--) {
 	unsigned char digit = UCHAR(*p);
 
-	if (!isxdigit(digit)) {
+	if (!isxdigit(digit) || (result > 0x10fff)) {
 	    break;
 	}
 
@@ -866,7 +866,7 @@ TclParseBackslash(
 	result = 0xb;
 	break;
     case 'x':
-	count += TclParseHex(p+1, numBytes-2, &result);
+	count += TclParseHex(p+1, (numBytes > 3) ? 2 : numBytes-2, &result);
 	if (count == 2) {
 	    /*
 	     * No hexadigits -> This is just "x".
@@ -889,6 +889,15 @@ TclParseBackslash(
 	    result = 'u';
 	}
 	break;
+    case 'U':
+	count += TclParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result);
+	if (count == 2) {
+	    /*
+	     * No hexadigits -> This is just "U".
+	     */
+	    result = 'U';
+	}
+	break;
     case '\n':
 	count--;
 	do {
@@ -917,7 +926,7 @@ TclParseBackslash(
 	    result = (result << 3) + (*p - '0');
 	    p++;
 	    if ((numBytes == 3) || !isdigit(UCHAR(*p))	/* INTL: digit */
-		    || (UCHAR(*p) >= '8')) {
+		    || (UCHAR(*p) >= '8') || (result >= 0x20)) {
 		break;
 	    }
 	    count = 4;
diff --git a/tests/reg.test b/tests/reg.test
index d92339f..ca6cdd1 100644
--- a/tests/reg.test
+++ b/tests/reg.test
@@ -626,16 +626,24 @@ expectMatch	13.13 P		"a\\nb"		"a\nb"	"a\nb"
 expectMatch	13.14 P		"a\\rb"		"a\rb"	"a\rb"
 expectMatch	13.15 P		"a\\tb"		"a\tb"	"a\tb"
 expectMatch	13.16 P		"a\\u0008x"	"a\bx"	"a\bx"
-expectError	13.17 -		{a\u008x}	EESCAPE
+expectMatch	13.17 P		{a\u008x}	"a\bx"	"a\bx"
 expectMatch	13.18 P		"a\\u00088x"	"a\b8x"	"a\b8x"
 expectMatch	13.19 P		"a\\U00000008x"	"a\bx"	"a\bx"
-expectError	13.20 -		{a\U0000008x}	EESCAPE
+expectMatch	13.20 P		{a\U0000008x}	"a\bx"	"a\bx"
 expectMatch	13.21 P		"a\\vb"		"a\vb"	"a\vb"
 expectMatch	13.22 MP	"a\\x08x"	"a\bx"	"a\bx"
 expectError	13.23 -		{a\xq}		EESCAPE
-expectMatch	13.24 MP	"a\\x0008x"	"a\bx"	"a\bx"
+expectMatch	13.24 MP	"a\\x08x"	"a\bx"	"a\bx"
 expectError	13.25 -		{a\z}		EESCAPE
 expectMatch	13.26 MP	"a\\010b"	"a\bb"	"a\bb"
+expectMatch	13.27 P		"a\\U00001234x"	"a\u1234x"	"a\u1234x"
+expectMatch	13.28 P		{a\U00001234x}	"a\u1234x"	"a\u1234x"
+expectMatch	13.29 P		"a\\U0001234x"	"a\u1234x"	"a\u1234x"
+expectMatch	13.30 P		{a\U0001234x}	"a\u1234x"	"a\u1234x"
+expectMatch	13.31 P		"a\\U000012345x"	"a\u12345x"	"a\u12345x"
+expectMatch	13.32 P		{a\U000012345x}	"a\u12345x"	"a\u12345x"
+expectMatch	13.33 P		"a\\U1000000x"	"a\ufffd0x"	"a\ufffd0x"
+expectMatch	13.34 P		{a\U1000000x}	"a\ufffd0x"	"a\ufffd0x"
 
 
 doing 14 "back references"
@@ -682,6 +690,7 @@ expectError	15.9  -	{a((((((((((b\10))))))))))c}	ESUBREG
 expectMatch	15.10 MP	"a\\12b"	"a\nb"	"a\nb"
 expectError	15.11 b		{a\12b}		ESUBREG
 expectMatch	15.12 eAS	{a\12b}		a12b	a12b
+expectMatch	15.13 MP	{a\701b}	a\u00381b	a\u00381b
 
 
 doing 16 "expanded syntax"
diff --git a/tests/utf.test b/tests/utf.test
index d319f6e..0f1428f 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -168,7 +168,7 @@ bsCheck \x	120
 bsCheck \xa	10
 bsCheck \xA	10
 bsCheck \x41	65
-bsCheck \x541	65
+bsCheck \x541	84
 bsCheck \u	117
 bsCheck \uk	117
 bsCheck \u41	65
@@ -177,6 +177,18 @@ bsCheck \uA	10
 bsCheck \340	224
 bsCheck \ua1	161
 bsCheck \u4e21	20001
+bsCheck \741	60
+bsCheck \U	85
+bsCheck \Uk	85
+bsCheck \U41	65
+bsCheck \Ua	10
+bsCheck \UA	10
+bsCheck \Ua1	161
+bsCheck \U4e21	20001
+bsCheck \U004e21	20001
+bsCheck \U00004e21	20001
+bsCheck \U00110000	65533
+bsCheck \Uffffffff	65533
 
 test utf-11.1 {Tcl_UtfToUpper} {
     string toupper {}
-- 
cgit v0.12