From 860e6b5e43e0ac7e673218dd929d425c5d206014 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 12 Aug 2011 08:02:23 +0000 Subject: TIP 388 implementation --- doc/Tcl.n | 30 ++++++++++++++++++++---------- doc/re_syntax.n | 29 ++++++++++++++++------------- generic/regc_lex.c | 35 +++++++++++++++++++++++++---------- generic/regcomp.c | 2 +- generic/regcustom.h | 2 +- generic/tcl.h | 14 +++++++------- generic/tclParse.c | 15 ++++++++++++--- tests/reg.test | 15 ++++++++++++--- tests/utf.test | 14 +++++++++++++- 9 files changed, 107 insertions(+), 49 deletions(-) diff --git a/doc/Tcl.n b/doc/Tcl.n index f56c82c..c14c4dc 100644 --- a/doc/Tcl.n +++ b/doc/Tcl.n @@ -6,7 +6,7 @@ '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" .so man.macros -.TH Tcl n "8.5" Tcl "Tcl Built-In Commands" +.TH Tcl n "8.6" Tcl "Tcl Built-In Commands" .BS .SH NAME Tcl \- Tool Command Language @@ -193,23 +193,33 @@ Backslash .TP 7 \e\fIooo\fR . -The digits \fIooo\fR (one, two, or three of them) give an eight-bit octal -value for the Unicode character that will be inserted. The upper bits of the -Unicode character will be 0. +The digits \fIooo\fR (one, two, or three of them) give a eight-bit octal +value for the Unicode character that will be inserted, in the range \fI000\fR +- \fI377\fR. The parser will stop just before this range overflows, or when +the maximum of three digits is reached. The upper bits of the Unicode +character will be 0. .TP 7 \e\fBx\fIhh\fR . -The hexadecimal digits \fIhh\fR give an eight-bit hexadecimal value for the -Unicode character that will be inserted. Any number of hexadecimal digits -may be present; however, all but the last two are ignored (the result is -always a one-byte quantity). The upper bits of the Unicode character will -be 0. +The hexadecimal digits \fIhh\fR (one or two of them) give an eight-bit +hexadecimal value for the Unicode character that will be inserted. The upper +bits of the Unicode character will be 0. .TP 7 \e\fBu\fIhhhh\fR . The hexadecimal digits \fIhhhh\fR (one, two, three, or four of them) give a sixteen-bit hexadecimal value for the Unicode character that will be -inserted. +inserted. The upper bits of the Unicode character will be 0. +.TP 7 +\e\fBU\fIhhhhhhhh\fR +. +The hexadecimal digits \fIhhhhhhhh\fR (one up to eight of them) give a +twentiy-one-bit hexadecimal value for the Unicode character that will be +inserted, in the range U+0000..U+10FFFF. The parser will stop just +before this range overflows, or when the maximum of eight digits +is reached. The upper bits of the Unicode character will be 0. +.PP +The range U+010000..U+10FFFD is reserved for the future. .PP Backslash substitution is not performed on words enclosed in braces, except for backslash-newline as described above. diff --git a/doc/re_syntax.n b/doc/re_syntax.n index 8701641..a53f58b 100644 --- a/doc/re_syntax.n +++ b/doc/re_syntax.n @@ -359,39 +359,42 @@ horizontal tab, as in C .TP \fB\eu\fIwxyz\fR . -(where \fIwxyz\fR is exactly four hexadecimal digits) the Unicode +(where \fIwxyz\fR is one up to four hexadecimal digits) the Unicode character \fBU+\fIwxyz\fR in the local byte ordering .TP \fB\eU\fIstuvwxyz\fR . -(where \fIstuvwxyz\fR is exactly eight hexadecimal digits) reserved -for a somewhat-hypothetical Unicode extension to 32 bits +(where \fIstuvwxyz\fR is one up to eight hexadecimal digits) reserved +for a Unicode extension up to 21 bits. The digits are parsed until the +first non-hexadecimal character is encountered, the maximun of eight +hexadecimal digits are reached, or an overflow would occur in the maximum +value of \fBU+\fI10ffff\fR. .TP \fB\ev\fR . vertical tab, as in C are all available. .TP -\fB\ex\fIhhh\fR +\fB\ex\fIhh\fR . -(where \fIhhh\fR is any sequence of hexadecimal digits) the character -whose hexadecimal value is \fB0x\fIhhh\fR (a single character no -matter how many hexadecimal digits are used). +(where \fIhh\fR is one or two hexadecimal digits) the character +whose hexadecimal value is \fB0x\fIhh\fR. .TP \fB\e0\fR . the character whose value is \fB0\fR .TP +\fB\e\fIxyz\fR +. +(where \fIxyz\fR is exactly three octal digits, and is not a \fIback +reference\fR (see below)) the character whose octal value is +\fB0\fIxyz\fR. The first digit must be in the range 0-3, otherwise +the two-digit form is assumed. +.TP \fB\e\fIxy\fR . (where \fIxy\fR is exactly two octal digits, and is not a \fIback reference\fR (see below)) the character whose octal value is \fB0\fIxy\fR -.TP -\fB\e\fIxyz\fR -. -(where \fIxyz\fR is exactly three octal digits, and is not a back -reference (see below)) the character whose octal value is -\fB0\fIxyz\fR .RE .PP Hexadecimal digits are diff --git a/generic/regc_lex.c b/generic/regc_lex.c index f3a46da..132e757 100644 --- a/generic/regc_lex.c +++ b/generic/regc_lex.c @@ -742,6 +742,7 @@ lexescape( struct vars *v) { chr c; + int i; static const chr alert[] = { CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') }; @@ -818,18 +819,23 @@ lexescape( RETV(PLAIN, CHR('\t')); break; case CHR('u'): - c = lexdigits(v, 16, 4, 4); + c = (uchr) lexdigits(v, 16, 1, 4); if (ISERR()) { FAILW(REG_EESCAPE); } RETV(PLAIN, c); break; case CHR('U'): - c = lexdigits(v, 16, 8, 8); + i = lexdigits(v, 16, 1, 8); if (ISERR()) { FAILW(REG_EESCAPE); } - RETV(PLAIN, c); + if (i > 0xFFFF) { + /* TODO: output a Surrogate pair + */ + i = 0xFFFD; + } + RETV(PLAIN, (uchr) i); break; case CHR('v'): RETV(PLAIN, CHR('\v')); @@ -844,7 +850,7 @@ lexescape( break; case CHR('x'): NOTE(REG_UUNPORT); - c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ + c = (uchr) lexdigits(v, 16, 1, 2); if (ISERR()) { FAILW(REG_EESCAPE); } @@ -866,7 +872,7 @@ lexescape( case CHR('9'): save = v->now; v->now--; /* put first digit back */ - c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ + c = (uchr) lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ if (ISERR()) { FAILW(REG_EESCAPE); } @@ -893,10 +899,15 @@ lexescape( case CHR('0'): NOTE(REG_UUNPORT); v->now--; /* put first digit back */ - c = lexdigits(v, 8, 1, 3); + c = (uchr) lexdigits(v, 8, 1, 3); if (ISERR()) { FAILW(REG_EESCAPE); } + if (c > 0xff) { + /* out of range, so we handled one digit too much */ + v->now--; + c >>= 3; + } RETV(PLAIN, c); break; default: @@ -909,16 +920,16 @@ lexescape( /* - lexdigits - slurp up digits and return chr value - ^ static chr lexdigits(struct vars *, int, int, int); + ^ static int lexdigits(struct vars *, int, int, int); */ -static chr /* chr value; errors signalled via ERR */ +static int /* chr value; errors signalled via ERR */ lexdigits( struct vars *v, int base, int minlen, int maxlen) { - uchr n; /* unsigned to avoid overflow misbehavior */ + int n; int len; chr c; int d; @@ -926,6 +937,10 @@ lexdigits( n = 0; for (len = 0; len < maxlen && !ATEOS(); len++) { + if (n > 0x10fff) { + /* Stop when continuing would otherwise overflow */ + break; + } c = *v->now++; switch (c) { case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): @@ -958,7 +973,7 @@ lexdigits( ERR(REG_EESCAPE); } - return (chr)n; + return n; } /* diff --git a/generic/regcomp.c b/generic/regcomp.c index d7ae05e..65555aa 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -79,7 +79,7 @@ static void lexnest(struct vars *, const chr *, const chr *); static void lexword(struct vars *); static int next(struct vars *); static int lexescape(struct vars *); -static chr lexdigits(struct vars *, int, int, int); +static int lexdigits(struct vars *, int, int, int); static int brenext(struct vars *, pchr); static void skip(struct vars *); static chr newline(NOPARMS); diff --git a/generic/regcustom.h b/generic/regcustom.h index bc8c28c..1c970ea 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ #define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ diff --git a/generic/tcl.h b/generic/tcl.h index 54bfedc..7370516 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2153,12 +2153,12 @@ typedef struct Tcl_EncodingType { /* * The maximum number of bytes that are necessary to represent a single - * Unicode character in UTF-8. The valid values should be 3 or 6 (or perhaps 1 - * if we want to support a non-unicode enabled core). If 3, then Tcl_UniChar - * must be 2-bytes in size (UCS-2) (the default). If 6, then Tcl_UniChar must - * be 4-bytes in size (UCS-4). At this time UCS-2 mode is the default and - * recommended mode. UCS-4 is experimental and not recommended. It works for - * the core, but most extensions expect UCS-2. + * Unicode character in UTF-8. The valid values should be 3, 4 or 6 + * (or perhaps 1 if we want to support a non-unicode enabled core). If 3 or + * 4, then Tcl_UniChar must be 2-bytes in size (UCS-2) (the default). If 6, + * then Tcl_UniChar must be 4-bytes in size (UCS-4). At this time UCS-2 mode + * is the default and recommended mode. UCS-4 is experimental and not + * recommended. It works for the core, but most extensions expect UCS-2. */ #ifndef TCL_UTF_MAX @@ -2170,7 +2170,7 @@ typedef struct Tcl_EncodingType { * reflected in regcustom.h. */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 /* * unsigned int isn't 100% accurate as it should be a strict 4-byte value * (perhaps wchar_t). 64-bit systems may have troubles. The size of this diff --git a/generic/tclParse.c b/generic/tclParse.c index 2b0dab4..3c984bf 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -754,7 +754,7 @@ TclParseHex( while (numBytes--) { unsigned char digit = UCHAR(*p); - if (!isxdigit(digit)) { + if (!isxdigit(digit) || (result > 0x10fff)) { break; } @@ -866,7 +866,7 @@ TclParseBackslash( result = 0xb; break; case 'x': - count += TclParseHex(p+1, numBytes-2, &result); + count += TclParseHex(p+1, (numBytes > 3) ? 2 : numBytes-2, &result); if (count == 2) { /* * No hexadigits -> This is just "x". @@ -889,6 +889,15 @@ TclParseBackslash( result = 'u'; } break; + case 'U': + count += TclParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result); + if (count == 2) { + /* + * No hexadigits -> This is just "U". + */ + result = 'U'; + } + break; case '\n': count--; do { @@ -917,7 +926,7 @@ TclParseBackslash( result = (result << 3) + (*p - '0'); p++; if ((numBytes == 3) || !isdigit(UCHAR(*p)) /* INTL: digit */ - || (UCHAR(*p) >= '8')) { + || (UCHAR(*p) >= '8') || (result >= 0x20)) { break; } count = 4; diff --git a/tests/reg.test b/tests/reg.test index d92339f..ca6cdd1 100644 --- a/tests/reg.test +++ b/tests/reg.test @@ -626,16 +626,24 @@ expectMatch 13.13 P "a\\nb" "a\nb" "a\nb" expectMatch 13.14 P "a\\rb" "a\rb" "a\rb" expectMatch 13.15 P "a\\tb" "a\tb" "a\tb" expectMatch 13.16 P "a\\u0008x" "a\bx" "a\bx" -expectError 13.17 - {a\u008x} EESCAPE +expectMatch 13.17 P {a\u008x} "a\bx" "a\bx" expectMatch 13.18 P "a\\u00088x" "a\b8x" "a\b8x" expectMatch 13.19 P "a\\U00000008x" "a\bx" "a\bx" -expectError 13.20 - {a\U0000008x} EESCAPE +expectMatch 13.20 P {a\U0000008x} "a\bx" "a\bx" expectMatch 13.21 P "a\\vb" "a\vb" "a\vb" expectMatch 13.22 MP "a\\x08x" "a\bx" "a\bx" expectError 13.23 - {a\xq} EESCAPE -expectMatch 13.24 MP "a\\x0008x" "a\bx" "a\bx" +expectMatch 13.24 MP "a\\x08x" "a\bx" "a\bx" expectError 13.25 - {a\z} EESCAPE expectMatch 13.26 MP "a\\010b" "a\bb" "a\bb" +expectMatch 13.27 P "a\\U00001234x" "a\u1234x" "a\u1234x" +expectMatch 13.28 P {a\U00001234x} "a\u1234x" "a\u1234x" +expectMatch 13.29 P "a\\U0001234x" "a\u1234x" "a\u1234x" +expectMatch 13.30 P {a\U0001234x} "a\u1234x" "a\u1234x" +expectMatch 13.31 P "a\\U000012345x" "a\u12345x" "a\u12345x" +expectMatch 13.32 P {a\U000012345x} "a\u12345x" "a\u12345x" +expectMatch 13.33 P "a\\U1000000x" "a\ufffd0x" "a\ufffd0x" +expectMatch 13.34 P {a\U1000000x} "a\ufffd0x" "a\ufffd0x" doing 14 "back references" @@ -682,6 +690,7 @@ expectError 15.9 - {a((((((((((b\10))))))))))c} ESUBREG expectMatch 15.10 MP "a\\12b" "a\nb" "a\nb" expectError 15.11 b {a\12b} ESUBREG expectMatch 15.12 eAS {a\12b} a12b a12b +expectMatch 15.13 MP {a\701b} a\u00381b a\u00381b doing 16 "expanded syntax" diff --git a/tests/utf.test b/tests/utf.test index d319f6e..0f1428f 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -168,7 +168,7 @@ bsCheck \x 120 bsCheck \xa 10 bsCheck \xA 10 bsCheck \x41 65 -bsCheck \x541 65 +bsCheck \x541 84 bsCheck \u 117 bsCheck \uk 117 bsCheck \u41 65 @@ -177,6 +177,18 @@ bsCheck \uA 10 bsCheck \340 224 bsCheck \ua1 161 bsCheck \u4e21 20001 +bsCheck \741 60 +bsCheck \U 85 +bsCheck \Uk 85 +bsCheck \U41 65 +bsCheck \Ua 10 +bsCheck \UA 10 +bsCheck \Ua1 161 +bsCheck \U4e21 20001 +bsCheck \U004e21 20001 +bsCheck \U00004e21 20001 +bsCheck \U00110000 65533 +bsCheck \Uffffffff 65533 test utf-11.1 {Tcl_UtfToUpper} { string toupper {} -- cgit v0.12