summaryrefslogtreecommitdiffstats
path: root/generic
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2011-08-12 08:02:23 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2011-08-12 08:02:23 (GMT)
commit860e6b5e43e0ac7e673218dd929d425c5d206014 (patch)
tree37151f179ef8cad02889b0ddf4a07163698782df /generic
parentba9424ed9813043dd8948d59a0fd5aa83b0cd0ca (diff)
downloadtcl-860e6b5e43e0ac7e673218dd929d425c5d206014.zip
tcl-860e6b5e43e0ac7e673218dd929d425c5d206014.tar.gz
tcl-860e6b5e43e0ac7e673218dd929d425c5d206014.tar.bz2
TIP 388 implementation
Diffstat (limited to 'generic')
-rw-r--r--generic/regc_lex.c35
-rw-r--r--generic/regcomp.c2
-rw-r--r--generic/regcustom.h2
-rw-r--r--generic/tcl.h14
-rw-r--r--generic/tclParse.c15
5 files changed, 46 insertions, 22 deletions
diff --git a/generic/regc_lex.c b/generic/regc_lex.c
index f3a46da..132e757 100644
--- a/generic/regc_lex.c
+++ b/generic/regc_lex.c
@@ -742,6 +742,7 @@ lexescape(
struct vars *v)
{
chr c;
+ int i;
static const chr alert[] = {
CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
};
@@ -818,18 +819,23 @@ lexescape(
RETV(PLAIN, CHR('\t'));
break;
case CHR('u'):
- c = lexdigits(v, 16, 4, 4);
+ c = (uchr) lexdigits(v, 16, 1, 4);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
RETV(PLAIN, c);
break;
case CHR('U'):
- c = lexdigits(v, 16, 8, 8);
+ i = lexdigits(v, 16, 1, 8);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
- RETV(PLAIN, c);
+ if (i > 0xFFFF) {
+ /* TODO: output a Surrogate pair
+ */
+ i = 0xFFFD;
+ }
+ RETV(PLAIN, (uchr) i);
break;
case CHR('v'):
RETV(PLAIN, CHR('\v'));
@@ -844,7 +850,7 @@ lexescape(
break;
case CHR('x'):
NOTE(REG_UUNPORT);
- c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
+ c = (uchr) lexdigits(v, 16, 1, 2);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
@@ -866,7 +872,7 @@ lexescape(
case CHR('9'):
save = v->now;
v->now--; /* put first digit back */
- c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
+ c = (uchr) lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
if (ISERR()) {
FAILW(REG_EESCAPE);
}
@@ -893,10 +899,15 @@ lexescape(
case CHR('0'):
NOTE(REG_UUNPORT);
v->now--; /* put first digit back */
- c = lexdigits(v, 8, 1, 3);
+ c = (uchr) lexdigits(v, 8, 1, 3);
if (ISERR()) {
FAILW(REG_EESCAPE);
}
+ if (c > 0xff) {
+ /* out of range, so we handled one digit too much */
+ v->now--;
+ c >>= 3;
+ }
RETV(PLAIN, c);
break;
default:
@@ -909,16 +920,16 @@ lexescape(
/*
- lexdigits - slurp up digits and return chr value
- ^ static chr lexdigits(struct vars *, int, int, int);
+ ^ static int lexdigits(struct vars *, int, int, int);
*/
-static chr /* chr value; errors signalled via ERR */
+static int /* chr value; errors signalled via ERR */
lexdigits(
struct vars *v,
int base,
int minlen,
int maxlen)
{
- uchr n; /* unsigned to avoid overflow misbehavior */
+ int n;
int len;
chr c;
int d;
@@ -926,6 +937,10 @@ lexdigits(
n = 0;
for (len = 0; len < maxlen && !ATEOS(); len++) {
+ if (n > 0x10fff) {
+ /* Stop when continuing would otherwise overflow */
+ break;
+ }
c = *v->now++;
switch (c) {
case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
@@ -958,7 +973,7 @@ lexdigits(
ERR(REG_EESCAPE);
}
- return (chr)n;
+ return n;
}
/*
diff --git a/generic/regcomp.c b/generic/regcomp.c
index d7ae05e..65555aa 100644
--- a/generic/regcomp.c
+++ b/generic/regcomp.c
@@ -79,7 +79,7 @@ static void lexnest(struct vars *, const chr *, const chr *);
static void lexword(struct vars *);
static int next(struct vars *);
static int lexescape(struct vars *);
-static chr lexdigits(struct vars *, int, int, int);
+static int lexdigits(struct vars *, int, int, int);
static int brenext(struct vars *, pchr);
static void skip(struct vars *);
static chr newline(NOPARMS);
diff --git a/generic/regcustom.h b/generic/regcustom.h
index bc8c28c..1c970ea 100644
--- a/generic/regcustom.h
+++ b/generic/regcustom.h
@@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */
#define NOCELT (-1) /* Celt value which is not valid chr */
#define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */
#define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
#define CHRBITS 32 /* Bits in a chr; must not use sizeof */
#define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */
#define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */
diff --git a/generic/tcl.h b/generic/tcl.h
index 54bfedc..7370516 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -2153,12 +2153,12 @@ typedef struct Tcl_EncodingType {
/*
* The maximum number of bytes that are necessary to represent a single
- * Unicode character in UTF-8. The valid values should be 3 or 6 (or perhaps 1
- * if we want to support a non-unicode enabled core). If 3, then Tcl_UniChar
- * must be 2-bytes in size (UCS-2) (the default). If 6, then Tcl_UniChar must
- * be 4-bytes in size (UCS-4). At this time UCS-2 mode is the default and
- * recommended mode. UCS-4 is experimental and not recommended. It works for
- * the core, but most extensions expect UCS-2.
+ * Unicode character in UTF-8. The valid values should be 3, 4 or 6
+ * (or perhaps 1 if we want to support a non-unicode enabled core). If 3 or
+ * 4, then Tcl_UniChar must be 2-bytes in size (UCS-2) (the default). If 6,
+ * then Tcl_UniChar must be 4-bytes in size (UCS-4). At this time UCS-2 mode
+ * is the default and recommended mode. UCS-4 is experimental and not
+ * recommended. It works for the core, but most extensions expect UCS-2.
*/
#ifndef TCL_UTF_MAX
@@ -2170,7 +2170,7 @@ typedef struct Tcl_EncodingType {
* reflected in regcustom.h.
*/
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
/*
* unsigned int isn't 100% accurate as it should be a strict 4-byte value
* (perhaps wchar_t). 64-bit systems may have troubles. The size of this
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 2b0dab4..3c984bf 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -754,7 +754,7 @@ TclParseHex(
while (numBytes--) {
unsigned char digit = UCHAR(*p);
- if (!isxdigit(digit)) {
+ if (!isxdigit(digit) || (result > 0x10fff)) {
break;
}
@@ -866,7 +866,7 @@ TclParseBackslash(
result = 0xb;
break;
case 'x':
- count += TclParseHex(p+1, numBytes-2, &result);
+ count += TclParseHex(p+1, (numBytes > 3) ? 2 : numBytes-2, &result);
if (count == 2) {
/*
* No hexadigits -> This is just "x".
@@ -889,6 +889,15 @@ TclParseBackslash(
result = 'u';
}
break;
+ case 'U':
+ count += TclParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result);
+ if (count == 2) {
+ /*
+ * No hexadigits -> This is just "U".
+ */
+ result = 'U';
+ }
+ break;
case '\n':
count--;
do {
@@ -917,7 +926,7 @@ TclParseBackslash(
result = (result << 3) + (*p - '0');
p++;
if ((numBytes == 3) || !isdigit(UCHAR(*p)) /* INTL: digit */
- || (UCHAR(*p) >= '8')) {
+ || (UCHAR(*p) >= '8') || (result >= 0x20)) {
break;
}
count = 4;