diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2011-08-12 08:02:23 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2011-08-12 08:02:23 (GMT) |
commit | 860e6b5e43e0ac7e673218dd929d425c5d206014 (patch) | |
tree | 37151f179ef8cad02889b0ddf4a07163698782df /generic | |
parent | ba9424ed9813043dd8948d59a0fd5aa83b0cd0ca (diff) | |
download | tcl-860e6b5e43e0ac7e673218dd929d425c5d206014.zip tcl-860e6b5e43e0ac7e673218dd929d425c5d206014.tar.gz tcl-860e6b5e43e0ac7e673218dd929d425c5d206014.tar.bz2 |
TIP 388 implementation
Diffstat (limited to 'generic')
-rw-r--r-- | generic/regc_lex.c | 35 | ||||
-rw-r--r-- | generic/regcomp.c | 2 | ||||
-rw-r--r-- | generic/regcustom.h | 2 | ||||
-rw-r--r-- | generic/tcl.h | 14 | ||||
-rw-r--r-- | generic/tclParse.c | 15 |
5 files changed, 46 insertions, 22 deletions
diff --git a/generic/regc_lex.c b/generic/regc_lex.c index f3a46da..132e757 100644 --- a/generic/regc_lex.c +++ b/generic/regc_lex.c @@ -742,6 +742,7 @@ lexescape( struct vars *v) { chr c; + int i; static const chr alert[] = { CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') }; @@ -818,18 +819,23 @@ lexescape( RETV(PLAIN, CHR('\t')); break; case CHR('u'): - c = lexdigits(v, 16, 4, 4); + c = (uchr) lexdigits(v, 16, 1, 4); if (ISERR()) { FAILW(REG_EESCAPE); } RETV(PLAIN, c); break; case CHR('U'): - c = lexdigits(v, 16, 8, 8); + i = lexdigits(v, 16, 1, 8); if (ISERR()) { FAILW(REG_EESCAPE); } - RETV(PLAIN, c); + if (i > 0xFFFF) { + /* TODO: output a Surrogate pair + */ + i = 0xFFFD; + } + RETV(PLAIN, (uchr) i); break; case CHR('v'): RETV(PLAIN, CHR('\v')); @@ -844,7 +850,7 @@ lexescape( break; case CHR('x'): NOTE(REG_UUNPORT); - c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ + c = (uchr) lexdigits(v, 16, 1, 2); if (ISERR()) { FAILW(REG_EESCAPE); } @@ -866,7 +872,7 @@ lexescape( case CHR('9'): save = v->now; v->now--; /* put first digit back */ - c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ + c = (uchr) lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ if (ISERR()) { FAILW(REG_EESCAPE); } @@ -893,10 +899,15 @@ lexescape( case CHR('0'): NOTE(REG_UUNPORT); v->now--; /* put first digit back */ - c = lexdigits(v, 8, 1, 3); + c = (uchr) lexdigits(v, 8, 1, 3); if (ISERR()) { FAILW(REG_EESCAPE); } + if (c > 0xff) { + /* out of range, so we handled one digit too much */ + v->now--; + c >>= 3; + } RETV(PLAIN, c); break; default: @@ -909,16 +920,16 @@ lexescape( /* - lexdigits - slurp up digits and return chr value - ^ static chr lexdigits(struct vars *, int, int, int); + ^ static int lexdigits(struct vars *, int, int, int); */ -static chr /* chr value; errors signalled via ERR */ +static int /* chr value; errors signalled via ERR */ lexdigits( struct vars *v, int base, int minlen, int maxlen) { - uchr n; /* unsigned to avoid overflow misbehavior */ + int n; int len; chr c; int d; @@ -926,6 +937,10 @@ lexdigits( n = 0; for (len = 0; len < maxlen && !ATEOS(); len++) { + if (n > 0x10fff) { + /* Stop when continuing would otherwise overflow */ + break; + } c = *v->now++; switch (c) { case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): @@ -958,7 +973,7 @@ lexdigits( ERR(REG_EESCAPE); } - return (chr)n; + return n; } /* diff --git a/generic/regcomp.c b/generic/regcomp.c index d7ae05e..65555aa 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -79,7 +79,7 @@ static void lexnest(struct vars *, const chr *, const chr *); static void lexword(struct vars *); static int next(struct vars *); static int lexescape(struct vars *); -static chr lexdigits(struct vars *, int, int, int); +static int lexdigits(struct vars *, int, int, int); static int brenext(struct vars *, pchr); static void skip(struct vars *); static chr newline(NOPARMS); diff --git a/generic/regcustom.h b/generic/regcustom.h index bc8c28c..1c970ea 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ #define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ diff --git a/generic/tcl.h b/generic/tcl.h index 54bfedc..7370516 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2153,12 +2153,12 @@ typedef struct Tcl_EncodingType { /* * The maximum number of bytes that are necessary to represent a single - * Unicode character in UTF-8. The valid values should be 3 or 6 (or perhaps 1 - * if we want to support a non-unicode enabled core). If 3, then Tcl_UniChar - * must be 2-bytes in size (UCS-2) (the default). If 6, then Tcl_UniChar must - * be 4-bytes in size (UCS-4). At this time UCS-2 mode is the default and - * recommended mode. UCS-4 is experimental and not recommended. It works for - * the core, but most extensions expect UCS-2. + * Unicode character in UTF-8. The valid values should be 3, 4 or 6 + * (or perhaps 1 if we want to support a non-unicode enabled core). If 3 or + * 4, then Tcl_UniChar must be 2-bytes in size (UCS-2) (the default). If 6, + * then Tcl_UniChar must be 4-bytes in size (UCS-4). At this time UCS-2 mode + * is the default and recommended mode. UCS-4 is experimental and not + * recommended. It works for the core, but most extensions expect UCS-2. */ #ifndef TCL_UTF_MAX @@ -2170,7 +2170,7 @@ typedef struct Tcl_EncodingType { * reflected in regcustom.h. */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 /* * unsigned int isn't 100% accurate as it should be a strict 4-byte value * (perhaps wchar_t). 64-bit systems may have troubles. The size of this diff --git a/generic/tclParse.c b/generic/tclParse.c index 2b0dab4..3c984bf 100644 --- a/generic/tclParse.c +++ b/generic/tclParse.c @@ -754,7 +754,7 @@ TclParseHex( while (numBytes--) { unsigned char digit = UCHAR(*p); - if (!isxdigit(digit)) { + if (!isxdigit(digit) || (result > 0x10fff)) { break; } @@ -866,7 +866,7 @@ TclParseBackslash( result = 0xb; break; case 'x': - count += TclParseHex(p+1, numBytes-2, &result); + count += TclParseHex(p+1, (numBytes > 3) ? 2 : numBytes-2, &result); if (count == 2) { /* * No hexadigits -> This is just "x". @@ -889,6 +889,15 @@ TclParseBackslash( result = 'u'; } break; + case 'U': + count += TclParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result); + if (count == 2) { + /* + * No hexadigits -> This is just "U". + */ + result = 'U'; + } + break; case '\n': count--; do { @@ -917,7 +926,7 @@ TclParseBackslash( result = (result << 3) + (*p - '0'); p++; if ((numBytes == 3) || !isdigit(UCHAR(*p)) /* INTL: digit */ - || (UCHAR(*p) >= '8')) { + || (UCHAR(*p) >= '8') || (result >= 0x20)) { break; } count = 4; |