summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2011-08-12 08:02:23 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2011-08-12 08:02:23 (GMT)
commit860e6b5e43e0ac7e673218dd929d425c5d206014 (patch)
tree37151f179ef8cad02889b0ddf4a07163698782df
parentba9424ed9813043dd8948d59a0fd5aa83b0cd0ca (diff)
downloadtcl-860e6b5e43e0ac7e673218dd929d425c5d206014.zip
tcl-860e6b5e43e0ac7e673218dd929d425c5d206014.tar.gz
tcl-860e6b5e43e0ac7e673218dd929d425c5d206014.tar.bz2
TIP 388 implementation
-rw-r--r--doc/Tcl.n30
-rw-r--r--doc/re_syntax.n29
-rw-r--r--generic/regc_lex.c35
-rw-r--r--generic/regcomp.c2
-rw-r--r--generic/regcustom.h2
-rw-r--r--generic/tcl.h14
-rw-r--r--generic/tclParse.c15
-rw-r--r--tests/reg.test15
-rw-r--r--tests/utf.test14
9 files changed, 107 insertions, 49 deletions
diff --git a/doc/Tcl.n b/doc/Tcl.n
index f56c82c..c14c4dc 100644
--- a/doc/Tcl.n
+++ b/doc/Tcl.n
@@ -6,7 +6,7 @@
'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
'\"
.so man.macros
-.TH Tcl n "8.5" Tcl "Tcl Built-In Commands"
+.TH Tcl n "8.6" Tcl "Tcl Built-In Commands"
.BS
.SH NAME
Tcl \- Tool Command Language
@@ -193,23 +193,33 @@ Backslash
.TP 7
\e\fIooo\fR
.
-The digits \fIooo\fR (one, two, or three of them) give an eight-bit octal
-value for the Unicode character that will be inserted. The upper bits of the
-Unicode character will be 0.
+The digits \fIooo\fR (one, two, or three of them) give a eight-bit octal
+value for the Unicode character that will be inserted, in the range \fI000\fR
+- \fI377\fR. The parser will stop just before this range overflows, or when
+the maximum of three digits is reached. The upper bits of the Unicode
+character will be 0.
.TP 7
\e\fBx\fIhh\fR
.
-The hexadecimal digits \fIhh\fR give an eight-bit hexadecimal value for the
-Unicode character that will be inserted. Any number of hexadecimal digits
-may be present; however, all but the last two are ignored (the result is
-always a one-byte quantity). The upper bits of the Unicode character will
-be 0.
+The hexadecimal digits \fIhh\fR (one or two of them) give an eight-bit
+hexadecimal value for the Unicode character that will be inserted. The upper
+bits of the Unicode character will be 0.
.TP 7
\e\fBu\fIhhhh\fR
.
The hexadecimal digits \fIhhhh\fR (one, two, three, or four of them) give a
sixteen-bit hexadecimal value for the Unicode character that will be
-inserted.
+inserted. The upper bits of the Unicode character will be 0.
+.TP 7
+\e\fBU\fIhhhhhhhh\fR
+.
+The hexadecimal digits \fIhhhhhhhh\fR (one up to eight of them) give a
+twentiy-one-bit hexadecimal value for the Unicode character that will be
+inserted, in the range U+0000..U+10FFFF. The parser will stop just
+before this range overflows, or when the maximum of eight digits
+is reached. The upper bits of the Unicode character will be 0.
+.PP
+The range U+010000..U+10FFFD is reserved for the future.
.PP
Backslash substitution is not performed on words enclosed in braces,
except for backslash-newline as described above.
diff --git a/doc/re_syntax.n b/doc/re_syntax.n
index 8701641..a53f58b 100644
--- a/doc/re_syntax.n
+++ b/doc/re_syntax.n
@@ -359,39 +359,42 @@ horizontal tab, as in C
.TP
\fB\eu\fIwxyz\fR
.
-(where \fIwxyz\fR is exactly four hexadecimal digits) the Unicode
+(where \fIwxyz\fR is one up to four hexadecimal digits) the Unicode
character \fBU+\fIwxyz\fR in the local byte ordering
.TP
\fB\eU\fIstuvwxyz\fR
.
-(where \fIstuvwxyz\fR is exactly eight hexadecimal digits) reserved
-for a somewhat-hypothetical Unicode extension to 32 bits
+(where \fIstuvwxyz\fR is one up to eight hexadecimal digits) reserved
+for a Unicode extension up to 21 bits. The digits are parsed until the
+first non-hexadecimal character is encountered, the maximun of eight
+hexadecimal digits are reached, or an overflow would occur in the maximum
+value of \fBU+\fI10ffff\fR.
.TP
\fB\ev\fR
.
vertical tab, as in C are all available.
.TP
-\fB\ex\fIhhh\fR
+\fB\ex\fIhh\fR
.
-(where \fIhhh\fR is any sequence of hexadecimal digits) the character
-whose hexadecimal value is \fB0x\fIhhh\fR (a single character no
-matter how many hexadecimal digits are used).
+(where \fIhh\fR is one or two hexadecimal digits) the character
+whose hexadecimal value is \fB0x\fIhh\fR.
.TP
\fB\e0\fR
.
the character whose value is \fB0\fR
.TP
+\fB\e\fIxyz\fR
+.
+(where \fIxyz\fR is exactly three octal digits, and is not a \fIback
+reference\fR (see below)) the character whose octal value is
+\fB0\fIxyz\fR. The first digit must be in the range 0-3, otherwise
+the two-digit form is assumed.
+.TP
\fB\e\fIxy\fR
.
(where \fIxy\fR is exactly two octal digits, and is not a \fIback
reference\fR (see below)) the character whose octal value is
\fB0\fIxy\fR
-.TP
-\fB\e\fIxyz\fR
-.
-(where \fIxyz\fR is exactly three octal digits, and is not a back
-reference (see below)) the character whose octal value is
-\fB0\fIxyz\fR
.RE
.PP
Hexadecimal digits are
diff --git a/generic/regc_lex.c b/generic/regc_lex.c
index f3a46da..132e757 100644
--- a/generic/regc_lex.c
+++ b/generic/regc_lex.c
@@ -742,6 +742,7 @@ lexescape(
struct vars *v)
{
chr c;
+ int i;
static const chr alert[] = {
CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
};
@@ -818,18 +819,23 @@ lexescape(
RETV(PLAIN, CHR('\t'));
break;
case CHR('u'):
- c = lexdigits(v, 16, 4, 4);
+ c = (uchr) lexdigits(v, 16, 1, 4);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
RETV(PLAIN, c);
break;
case CHR('U'):
- c = lexdigits(v, 16, 8, 8);
+ i = lexdigits(v, 16, 1, 8);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
- RETV(PLAIN, c);
+ if (i > 0xFFFF) {
+ /* TODO: output a Surrogate pair
+ */
+ i = 0xFFFD;
+ }
+ RETV(PLAIN, (uchr) i);
break;
case CHR('v'):
RETV(PLAIN, CHR('\v'));
@@ -844,7 +850,7 @@ lexescape(
break;
case CHR('x'):
NOTE(REG_UUNPORT);
- c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
+ c = (uchr) lexdigits(v, 16, 1, 2);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
@@ -866,7 +872,7 @@ lexescape(
case CHR('9'):
save = v->now;
v->now--; /* put first digit back */
- c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
+ c = (uchr) lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
if (ISERR()) {
FAILW(REG_EESCAPE);
}
@@ -893,10 +899,15 @@ lexescape(
case CHR('0'):
NOTE(REG_UUNPORT);
v->now--; /* put first digit back */
- c = lexdigits(v, 8, 1, 3);
+ c = (uchr) lexdigits(v, 8, 1, 3);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
+ if (c > 0xff) {
+ /* out of range, so we handled one digit too much */
+ v->now--;
+ c >>= 3;
+ }
RETV(PLAIN, c);
break;
default:
@@ -909,16 +920,16 @@ lexescape(
/*
- lexdigits - slurp up digits and return chr value
- ^ static chr lexdigits(struct vars *, int, int, int);
+ ^ static int lexdigits(struct vars *, int, int, int);
*/
-static chr /* chr value; errors signalled via ERR */
+static int /* chr value; errors signalled via ERR */
lexdigits(
struct vars *v,
int base,
int minlen,
int maxlen)
{
- uchr n; /* unsigned to avoid overflow misbehavior */
+ int n;
int len;
chr c;
int d;
@@ -926,6 +937,10 @@ lexdigits(
n = 0;
for (len = 0; len < maxlen && !ATEOS(); len++) {
+ if (n > 0x10fff) {
+ /* Stop when continuing would otherwise overflow */
+ break;
+ }
c = *v->now++;
switch (c) {
case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
@@ -958,7 +973,7 @@ lexdigits(
ERR(REG_EESCAPE);
}
- return (chr)n;
+ return n;
}
/*
diff --git a/generic/regcomp.c b/generic/regcomp.c
index d7ae05e..65555aa 100644
--- a/generic/regcomp.c
+++ b/generic/regcomp.c
@@ -79,7 +79,7 @@ static void lexnest(struct vars *, const chr *, const chr *);
static void lexword(struct vars *);
static int next(struct vars *);
static int lexescape(struct vars *);
-static chr lexdigits(struct vars *, int, int, int);
+static int lexdigits(struct vars *, int, int, int);
static int brenext(struct vars *, pchr);
static void skip(struct vars *);
static chr newline(NOPARMS);
diff --git a/generic/regcustom.h b/generic/regcustom.h
index bc8c28c..1c970ea 100644
--- a/generic/regcustom.h
+++ b/generic/regcustom.h
@@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */
#define NOCELT (-1) /* Celt value which is not valid chr */
#define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */
#define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
#define CHRBITS 32 /* Bits in a chr; must not use sizeof */
#define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */
#define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */
diff --git a/generic/tcl.h b/generic/tcl.h
index 54bfedc..7370516 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -2153,12 +2153,12 @@ typedef struct Tcl_EncodingType {
/*
* The maximum number of bytes that are necessary to represent a single
- * Unicode character in UTF-8. The valid values should be 3 or 6 (or perhaps 1
- * if we want to support a non-unicode enabled core). If 3, then Tcl_UniChar
- * must be 2-bytes in size (UCS-2) (the default). If 6, then Tcl_UniChar must
- * be 4-bytes in size (UCS-4). At this time UCS-2 mode is the default and
- * recommended mode. UCS-4 is experimental and not recommended. It works for
- * the core, but most extensions expect UCS-2.
+ * Unicode character in UTF-8. The valid values should be 3, 4 or 6
+ * (or perhaps 1 if we want to support a non-unicode enabled core). If 3 or
+ * 4, then Tcl_UniChar must be 2-bytes in size (UCS-2) (the default). If 6,
+ * then Tcl_UniChar must be 4-bytes in size (UCS-4). At this time UCS-2 mode
+ * is the default and recommended mode. UCS-4 is experimental and not
+ * recommended. It works for the core, but most extensions expect UCS-2.
*/
#ifndef TCL_UTF_MAX
@@ -2170,7 +2170,7 @@ typedef struct Tcl_EncodingType {
* reflected in regcustom.h.
*/
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
/*
* unsigned int isn't 100% accurate as it should be a strict 4-byte value
* (perhaps wchar_t). 64-bit systems may have troubles. The size of this
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 2b0dab4..3c984bf 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -754,7 +754,7 @@ TclParseHex(
while (numBytes--) {
unsigned char digit = UCHAR(*p);
- if (!isxdigit(digit)) {
+ if (!isxdigit(digit) || (result > 0x10fff)) {
break;
}
@@ -866,7 +866,7 @@ TclParseBackslash(
result = 0xb;
break;
case 'x':
- count += TclParseHex(p+1, numBytes-2, &result);
+ count += TclParseHex(p+1, (numBytes > 3) ? 2 : numBytes-2, &result);
if (count == 2) {
/*
* No hexadigits -> This is just "x".
@@ -889,6 +889,15 @@ TclParseBackslash(
result = 'u';
}
break;
+ case 'U':
+ count += TclParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result);
+ if (count == 2) {
+ /*
+ * No hexadigits -> This is just "U".
+ */
+ result = 'U';
+ }
+ break;
case '\n':
count--;
do {
@@ -917,7 +926,7 @@ TclParseBackslash(
result = (result << 3) + (*p - '0');
p++;
if ((numBytes == 3) || !isdigit(UCHAR(*p)) /* INTL: digit */
- || (UCHAR(*p) >= '8')) {
+ || (UCHAR(*p) >= '8') || (result >= 0x20)) {
break;
}
count = 4;
diff --git a/tests/reg.test b/tests/reg.test
index d92339f..ca6cdd1 100644
--- a/tests/reg.test
+++ b/tests/reg.test
@@ -626,16 +626,24 @@ expectMatch 13.13 P "a\\nb" "a\nb" "a\nb"
expectMatch 13.14 P "a\\rb" "a\rb" "a\rb"
expectMatch 13.15 P "a\\tb" "a\tb" "a\tb"
expectMatch 13.16 P "a\\u0008x" "a\bx" "a\bx"
-expectError 13.17 - {a\u008x} EESCAPE
+expectMatch 13.17 P {a\u008x} "a\bx" "a\bx"
expectMatch 13.18 P "a\\u00088x" "a\b8x" "a\b8x"
expectMatch 13.19 P "a\\U00000008x" "a\bx" "a\bx"
-expectError 13.20 - {a\U0000008x} EESCAPE
+expectMatch 13.20 P {a\U0000008x} "a\bx" "a\bx"
expectMatch 13.21 P "a\\vb" "a\vb" "a\vb"
expectMatch 13.22 MP "a\\x08x" "a\bx" "a\bx"
expectError 13.23 - {a\xq} EESCAPE
-expectMatch 13.24 MP "a\\x0008x" "a\bx" "a\bx"
+expectMatch 13.24 MP "a\\x08x" "a\bx" "a\bx"
expectError 13.25 - {a\z} EESCAPE
expectMatch 13.26 MP "a\\010b" "a\bb" "a\bb"
+expectMatch 13.27 P "a\\U00001234x" "a\u1234x" "a\u1234x"
+expectMatch 13.28 P {a\U00001234x} "a\u1234x" "a\u1234x"
+expectMatch 13.29 P "a\\U0001234x" "a\u1234x" "a\u1234x"
+expectMatch 13.30 P {a\U0001234x} "a\u1234x" "a\u1234x"
+expectMatch 13.31 P "a\\U000012345x" "a\u12345x" "a\u12345x"
+expectMatch 13.32 P {a\U000012345x} "a\u12345x" "a\u12345x"
+expectMatch 13.33 P "a\\U1000000x" "a\ufffd0x" "a\ufffd0x"
+expectMatch 13.34 P {a\U1000000x} "a\ufffd0x" "a\ufffd0x"
doing 14 "back references"
@@ -682,6 +690,7 @@ expectError 15.9 - {a((((((((((b\10))))))))))c} ESUBREG
expectMatch 15.10 MP "a\\12b" "a\nb" "a\nb"
expectError 15.11 b {a\12b} ESUBREG
expectMatch 15.12 eAS {a\12b} a12b a12b
+expectMatch 15.13 MP {a\701b} a\u00381b a\u00381b
doing 16 "expanded syntax"
diff --git a/tests/utf.test b/tests/utf.test
index d319f6e..0f1428f 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -168,7 +168,7 @@ bsCheck \x 120
bsCheck \xa 10
bsCheck \xA 10
bsCheck \x41 65
-bsCheck \x541 65
+bsCheck \x541 84
bsCheck \u 117
bsCheck \uk 117
bsCheck \u41 65
@@ -177,6 +177,18 @@ bsCheck \uA 10
bsCheck \340 224
bsCheck \ua1 161
bsCheck \u4e21 20001
+bsCheck \741 60
+bsCheck \U 85
+bsCheck \Uk 85
+bsCheck \U41 65
+bsCheck \Ua 10
+bsCheck \UA 10
+bsCheck \Ua1 161
+bsCheck \U4e21 20001
+bsCheck \U004e21 20001
+bsCheck \U00004e21 20001
+bsCheck \U00110000 65533
+bsCheck \Uffffffff 65533
test utf-11.1 {Tcl_UtfToUpper} {
string toupper {}