[Bug 3464428] string is graph \u0120 is wrong

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2012-01-09 19:59:47 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2012-01-09 19:59:47 (GMT)
commit: 05a1c1c8d017993fac4875d1baed7e62ece0bd93 (patch)
tree: e9c70fecb67f1316d832759eafcec6f8df633692 /generic
parent: e1074be4bf98ca2d9b91651693f0da2ee6c1042d (diff)
download: tcl-05a1c1c8d017993fac4875d1baed7e62ece0bd93.zip
tcl-05a1c1c8d017993fac4875d1baed7e62ece0bd93.tar.gz
tcl-05a1c1c8d017993fac4875d1baed7e62ece0bd93.tar.bz2
4 files changed, 164 insertions, 274 deletions
diff --git a/generic/regc_cvec.c b/generic/regc_cvec.c
index d2d56fc..0b976b8 100644
--- a/generic/regc_cvec.c
+++ b/generic/regc_cvec.c
@@ -3,20 +3,20 @@
  * This file is #included by regcomp.c.
  *
  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
- * 
+ *
  * Development of this software was funded, in part, by Cray Research Inc.,
  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
  * Corporation, none of whom are responsible for the results.  The author
- * thanks all of them. 
- * 
+ * thanks all of them.
+ *
  * Redistribution and use in source and binary forms -- with or without
  * modification -- are permitted for any purpose, provided that
  * redistributions in source form retain this entire copyright notice and
  * indicate the origin and nature of any modifications.
- * 
+ *
  * I'd appreciate being given credit for this package in the documentation
  * of software which uses it, but that is not a requirement.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
@@ -113,38 +113,6 @@ addrange(cv, from, to)
 }
 
 /*
- - addmcce - add an MCCE to a cvec
- ^ static VOID addmcce(struct cvec *, chr *, chr *);
- */
-static VOID
-addmcce(cv, startp, endp)
-    struct cvec *cv;			/* character vector */
-    chr *startp;			/* beginning of text */
-    chr *endp;				/* just past end of text */
-{
-    int len;
-    int i;
-    chr *s;
-    chr *d;
-
-    if (startp == NULL && endp == NULL) {
-	return;
-    }
-    len = endp - startp;
-    assert(len > 0);
-    assert(cv->nchrs + len < cv->chrspace - cv->nmccechrs);
-    assert(cv->nmcces < cv->mccespace);
-    d = &cv->chrs[cv->chrspace - cv->nmccechrs - len - 1];
-    cv->mcces[cv->nmcces++] = d;
-    for (s = startp, i = len; i > 0; s++, i--) {
-	*d++ = *s;
-    }
-    *d++ = 0;				/* endmarker */
-    assert(d == &cv->chrs[cv->chrspace - cv->nmccechrs]);
-    cv->nmccechrs += len + 1;
-}
-
-/*
  - haschr - does a cvec contain this chr?
  ^ static int haschr(struct cvec *, pchr);
  */
@@ -171,24 +139,23 @@ haschr(cv, c)
 
 /*
  - getcvec - get a cvec, remembering it as v->cv
- ^ static struct cvec *getcvec(struct vars *, int, int, int);
+ ^ static struct cvec *getcvec(struct vars *, int, int);
  */
 static struct cvec *
-getcvec(v, nchrs, nranges, nmcces)
+getcvec(v, nchrs, nranges)
     struct vars *v;			/* context */
     int nchrs;				/* to hold this many chrs... */
     int nranges;			/* ... and this many ranges... */
-    int nmcces;				/* ... and this many MCCEs */
 {
     if (v->cv != NULL && nchrs <= v->cv->chrspace &&
-	    nranges <= v->cv->rangespace && nmcces <= v->cv->mccespace) {
+	    nranges <= v->cv->rangespace) {
 	return clearcvec(v->cv);
     }
 
     if (v->cv != NULL) {
 	freecvec(v->cv);
     }
-    v->cv = newcvec(nchrs, nranges, nmcces);
+    v->cv = newcvec(nchrs, nranges, 0);
     if (v->cv == NULL) {
 	ERR(REG_ESPACE);
     }
diff --git a/generic/regc_locale.c b/generic/regc_locale.c
index 77cfd8e..ef8ecdc 100644
--- a/generic/regc_locale.c
+++ b/generic/regc_locale.c
@@ -12,7 +12,7 @@
 
 /* ASCII character-name table */
 
-static struct cname {
+static CONST struct cname {
     CONST char *name;
     CONST char code;
 } cnames[] = {
@@ -224,6 +224,24 @@ static CONST chr alphaCharTable[] = {
 #define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr))
 
 /*
+ * Unicode: control characters (must include the C0 range 0x0000-0x001f).
+ */
+
+static CONST crange controlRangeTable[] = {
+    {0x0000, 0x001f}, {0x007f, 0x009f}, {0x0600, 0x0603}, {0x200b, 0x200f},
+    {0x202a, 0x202e}, {0x2060, 0x2064}, {0x206a, 0x206f}, {0xe000, 0xf8ff},
+    {0xfff9, 0xfffb}
+};
+
+#define NUM_CONTROL_RANGE (sizeof(controlRangeTable)/sizeof(crange))
+
+static CONST chr controlCharTable[] = {
+    0x00ad, 0x06dd, 0x070f, 0x17b4, 0x17b5, 0xfeff
+};
+
+#define NUM_CONTROL_CHAR (sizeof(controlCharTable)/sizeof(chr))
+
+/*
  * Unicode: decimal digit characters.
  */
 
@@ -478,7 +495,7 @@ static CONST chr upperCharTable[] = {
  */
 
 static CONST crange graphRangeTable[] = {
-    {0x0021, 0x007e}, {0x00a0, 0x00ac}, {0x00ae, 0x0377}, {0x037a, 0x037e},
+    {0x0021, 0x007e}, {0x00a1, 0x00ac}, {0x00ae, 0x0377}, {0x037a, 0x037e},
     {0x0384, 0x038a}, {0x038e, 0x03a1}, {0x03a3, 0x0527}, {0x0531, 0x0556},
     {0x0559, 0x055f}, {0x0561, 0x0587}, {0x0591, 0x05c7}, {0x05d0, 0x05ea},
     {0x05f0, 0x05f4}, {0x0606, 0x061b}, {0x061e, 0x06dc}, {0x06de, 0x070d},
@@ -513,21 +530,21 @@ static CONST crange graphRangeTable[] = {
     {0x1250, 0x1256}, {0x125a, 0x125d}, {0x1260, 0x1288}, {0x128a, 0x128d},
     {0x1290, 0x12b0}, {0x12b2, 0x12b5}, {0x12b8, 0x12be}, {0x12c2, 0x12c5},
     {0x12c8, 0x12d6}, {0x12d8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135a},
-    {0x135d, 0x137c}, {0x1380, 0x1399}, {0x13a0, 0x13f4}, {0x1400, 0x169c},
-    {0x16a0, 0x16f0}, {0x1700, 0x170c}, {0x170e, 0x1714}, {0x1720, 0x1736},
-    {0x1740, 0x1753}, {0x1760, 0x176c}, {0x176e, 0x1770}, {0x1780, 0x17b3},
-    {0x17b6, 0x17dd}, {0x17e0, 0x17e9}, {0x17f0, 0x17f9}, {0x1800, 0x180e},
-    {0x1810, 0x1819}, {0x1820, 0x1877}, {0x1880, 0x18aa}, {0x18b0, 0x18f5},
-    {0x1900, 0x191c}, {0x1920, 0x192b}, {0x1930, 0x193b}, {0x1944, 0x196d},
-    {0x1970, 0x1974}, {0x1980, 0x19ab}, {0x19b0, 0x19c9}, {0x19d0, 0x19da},
-    {0x19de, 0x1a1b}, {0x1a1e, 0x1a5e}, {0x1a60, 0x1a7c}, {0x1a7f, 0x1a89},
-    {0x1a90, 0x1a99}, {0x1aa0, 0x1aad}, {0x1b00, 0x1b4b}, {0x1b50, 0x1b7c},
-    {0x1b80, 0x1baa}, {0x1bae, 0x1bb9}, {0x1bc0, 0x1bf3}, {0x1bfc, 0x1c37},
-    {0x1c3b, 0x1c49}, {0x1c4d, 0x1c7f}, {0x1cd0, 0x1cf2}, {0x1d00, 0x1de6},
-    {0x1dfc, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45}, {0x1f48, 0x1f4d},
-    {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4}, {0x1fb6, 0x1fc4},
-    {0x1fc6, 0x1fd3}, {0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef}, {0x1ff2, 0x1ff4},
-    {0x1ff6, 0x1ffe}, {0x2000, 0x200a}, {0x2010, 0x2029}, {0x202f, 0x205f},
+    {0x135d, 0x137c}, {0x1380, 0x1399}, {0x13a0, 0x13f4}, {0x1400, 0x167f},
+    {0x1681, 0x169c}, {0x16a0, 0x16f0}, {0x1700, 0x170c}, {0x170e, 0x1714},
+    {0x1720, 0x1736}, {0x1740, 0x1753}, {0x1760, 0x176c}, {0x176e, 0x1770},
+    {0x1780, 0x17b3}, {0x17b6, 0x17dd}, {0x17e0, 0x17e9}, {0x17f0, 0x17f9},
+    {0x1800, 0x180d}, {0x1810, 0x1819}, {0x1820, 0x1877}, {0x1880, 0x18aa},
+    {0x18b0, 0x18f5}, {0x1900, 0x191c}, {0x1920, 0x192b}, {0x1930, 0x193b},
+    {0x1944, 0x196d}, {0x1970, 0x1974}, {0x1980, 0x19ab}, {0x19b0, 0x19c9},
+    {0x19d0, 0x19da}, {0x19de, 0x1a1b}, {0x1a1e, 0x1a5e}, {0x1a60, 0x1a7c},
+    {0x1a7f, 0x1a89}, {0x1a90, 0x1a99}, {0x1aa0, 0x1aad}, {0x1b00, 0x1b4b},
+    {0x1b50, 0x1b7c}, {0x1b80, 0x1baa}, {0x1bae, 0x1bb9}, {0x1bc0, 0x1bf3},
+    {0x1bfc, 0x1c37}, {0x1c3b, 0x1c49}, {0x1c4d, 0x1c7f}, {0x1cd0, 0x1cf2},
+    {0x1d00, 0x1de6}, {0x1dfc, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45},
+    {0x1f48, 0x1f4d}, {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4},
+    {0x1fb6, 0x1fc4}, {0x1fc6, 0x1fd3}, {0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef},
+    {0x1ff2, 0x1ff4}, {0x1ff6, 0x1ffe}, {0x2010, 0x2027}, {0x2030, 0x205e},
     {0x2074, 0x208e}, {0x2090, 0x209c}, {0x20a0, 0x20b9}, {0x20d0, 0x20f0},
     {0x2100, 0x2189}, {0x2190, 0x23f3}, {0x2400, 0x2426}, {0x2440, 0x244a},
     {0x2460, 0x26ff}, {0x2701, 0x27ca}, {0x27ce, 0x2b4c}, {0x2b50, 0x2b59},
@@ -535,7 +552,7 @@ static CONST crange graphRangeTable[] = {
     {0x2d30, 0x2d65}, {0x2d7f, 0x2d96}, {0x2da0, 0x2da6}, {0x2da8, 0x2dae},
     {0x2db0, 0x2db6}, {0x2db8, 0x2dbe}, {0x2dc0, 0x2dc6}, {0x2dc8, 0x2dce},
     {0x2dd0, 0x2dd6}, {0x2dd8, 0x2dde}, {0x2de0, 0x2e31}, {0x2e80, 0x2e99},
-    {0x2e9b, 0x2ef3}, {0x2f00, 0x2fd5}, {0x2ff0, 0x2ffb}, {0x3000, 0x303f},
+    {0x2e9b, 0x2ef3}, {0x2f00, 0x2fd5}, {0x2ff0, 0x2ffb}, {0x3001, 0x303f},
     {0x3041, 0x3096}, {0x3099, 0x30ff}, {0x3105, 0x312d}, {0x3131, 0x318e},
     {0x3190, 0x31ba}, {0x31c0, 0x31e3}, {0x31f0, 0x321e}, {0x3220, 0x32fe},
     {0x3300, 0x4db5}, {0x4dc0, 0x9fcb}, {0xa000, 0xa48c}, {0xa490, 0xa4c6},
@@ -582,43 +599,6 @@ static CONST chr graphCharTable[] = {
 #define	CH	NOCELT
 
 /*
- - nmcces - how many distinct MCCEs are there?
- ^ static int nmcces(struct vars *);
- */
-static int
-nmcces(v)
-    struct vars *v;			/* context */
-{
-    /*
-     * No multi-character collating elements defined at the moment.
-     */
-    return 0;
-}
-
-/*
- - nleaders - how many chrs can be first chrs of MCCEs?
- ^ static int nleaders(struct vars *);
- */
-static int
-nleaders(v)
-    struct vars *v;			/* context */
-{
-    return 0;
-}
-
-/*
- - allmcces - return a cvec with all the MCCEs of the locale
- ^ static struct cvec *allmcces(struct vars *, struct cvec *);
- */
-static struct cvec *
-allmcces(v, cv)
-    struct vars *v;			/* context */
-    struct cvec *cv;			/* this is supposed to have enough room */
-{
-    return clearcvec(cv);
-}
-
-/*
  - element - map collating-element name to celt
  ^ static celt element(struct vars *, CONST chr *, CONST chr *);
  */
@@ -690,7 +670,7 @@ range(v, a, b, cases)
     }
 
     if (!cases) {		/* easy version */
-	cv = getcvec(v, 0, 1, 0);
+	cv = getcvec(v, 0, 1);
 	NOERRN();
 	addrange(cv, a, b);
 	return cv;
@@ -704,7 +684,7 @@ range(v, a, b, cases)
 
     nchrs = (b - a + 1)*2 + 4;
 
-    cv = getcvec(v, nchrs, 0, 0);
+    cv = getcvec(v, nchrs, 0);
     NOERRN();
 
     for (c=a; c<=b; c++) {
@@ -759,7 +739,7 @@ eclass(v, c, cases)
      */
 
     if ((v->cflags&REG_FAKE) && c == 'x') {
-	cv = getcvec(v, 4, 0, 0);
+	cv = getcvec(v, 4, 0);
 	addchr(cv, (chr)'x');
 	addchr(cv, (chr)'y');
 	if (cases) {
@@ -776,7 +756,7 @@ eclass(v, c, cases)
     if (cases) {
 	return allcases(v, c);
     }
-    cv = getcvec(v, 1, 0, 0);
+    cv = getcvec(v, 1, 0);
     assert(cv != NULL);
     addchr(cv, (chr)c);
     return cv;
@@ -825,15 +805,6 @@ cclass(v, startp, endp, cases)
     np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);
 
     /*
-     * Remap lower and upper to alpha if the match is case insensitive.
-     */
-
-    if (cases && len == 5 && (strncmp("lower", np, 5) == 0
-	    || strncmp("upper", np, 5) == 0)) {
-	np = "alpha";
-    }
-
-    /*
      * Map the name to the corresponding enumerated value.
      */
 
@@ -851,13 +822,20 @@ cclass(v, startp, endp, cases)
     }
 
     /*
+     * Remap lower and upper to alpha if the match is case insensitive.
+     */
+
+    if (cases && ((index == CC_LOWER) || (index == CC_UPPER))) {
+	index = CC_ALPHA;	/* alpha, not alnum: digits have no case */
+    }
+
+    /*
      * Now compute the character class contents.
      */
 
     switch((enum classes) index) {
-    case CC_PRINT:
     case CC_ALNUM:
-	cv = getcvec(v, NUM_ALPHA_CHAR, NUM_DIGIT_RANGE + NUM_ALPHA_RANGE, 0);
+	cv = getcvec(v, NUM_ALPHA_CHAR, NUM_DIGIT_RANGE + NUM_ALPHA_RANGE);
 	if (cv) {
 	    for (i=0 ; (size_t)i<NUM_ALPHA_CHAR ; i++) {
 		addchr(cv, alphaCharTable[i]);
@@ -873,7 +851,7 @@ cclass(v, startp, endp, cases)
 	}
 	break;
     case CC_ALPHA:
-	cv = getcvec(v, NUM_ALPHA_CHAR, NUM_ALPHA_RANGE, 0);
+	cv = getcvec(v, NUM_ALPHA_CHAR, NUM_ALPHA_RANGE);
 	if (cv) {
 	    for (i=0 ; (size_t)i<NUM_ALPHA_RANGE ; i++) {
 		addrange(cv, alphaRangeTable[i].start,
@@ -885,23 +863,30 @@ cclass(v, startp, endp, cases)
 	}
 	break;
     case CC_ASCII:
-	cv = getcvec(v, 0, 1, 0);
+	cv = getcvec(v, 0, 1);
 	if (cv) {
 	    addrange(cv, 0, 0x7f);
 	}
 	break;
     case CC_BLANK:
-	cv = getcvec(v, 2, 0, 0);
+	cv = getcvec(v, 2, 0);
 	addchr(cv, '\t');
 	addchr(cv, ' ');
 	break;
     case CC_CNTRL:
-	cv = getcvec(v, 0, 2, 0);
-	addrange(cv, 0x0, 0x1f);
-	addrange(cv, 0x7f, 0x9f);
+	cv = getcvec(v, NUM_CONTROL_CHAR, NUM_CONTROL_RANGE);
+	if (cv) {
+	    for (i=0 ; (size_t)i<NUM_CONTROL_RANGE ; i++) {
+		addrange(cv, controlRangeTable[i].start,
+			controlRangeTable[i].end);
+	    }
+	    for (i=0 ; (size_t)i<NUM_CONTROL_CHAR ; i++) {
+		addchr(cv, controlCharTable[i]);
+	    }
+	}
 	break;
     case CC_DIGIT:
-	cv = getcvec(v, 0, NUM_DIGIT_RANGE, 0);
+	cv = getcvec(v, 0, NUM_DIGIT_RANGE);
 	if (cv) {
 	    for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) {
 		addrange(cv, digitRangeTable[i].start,
@@ -910,7 +895,7 @@ cclass(v, startp, endp, cases)
 	}
 	break;
     case CC_PUNCT:
-	cv = getcvec(v, NUM_PUNCT_CHAR, NUM_PUNCT_RANGE, 0);
+	cv = getcvec(v, NUM_PUNCT_CHAR, NUM_PUNCT_RANGE);
 	if (cv) {
 	    for (i=0 ; (size_t)i<NUM_PUNCT_RANGE ; i++) {
 		addrange(cv, punctRangeTable[i].start,
@@ -930,7 +915,8 @@ cclass(v, startp, endp, cases)
 	 * Whatever is actually the case, the number of ranges is fixed (until
 	 * someone comes up with a better arrangement!)
 	 */
-	cv = getcvec(v, 0, 3, 0);
+
+	cv = getcvec(v, 0, 3);
 	if (cv) {
 	    addrange(cv, '0', '9');
 	    addrange(cv, 'a', 'f');
@@ -938,7 +924,7 @@ cclass(v, startp, endp, cases)
 	}
 	break;
     case CC_SPACE:
-	cv = getcvec(v, NUM_SPACE_CHAR, NUM_SPACE_RANGE, 0);
+	cv = getcvec(v, NUM_SPACE_CHAR, NUM_SPACE_RANGE);
 	if (cv) {
 	    for (i=0 ; (size_t)i<NUM_SPACE_RANGE ; i++) {
 		addrange(cv, spaceRangeTable[i].start,
@@ -950,7 +936,7 @@ cclass(v, startp, endp, cases)
 	}
 	break;
     case CC_LOWER:
-	cv  = getcvec(v, NUM_LOWER_CHAR, NUM_LOWER_RANGE, 0);
+	cv  = getcvec(v, NUM_LOWER_CHAR, NUM_LOWER_RANGE);
 	if (cv) {
 	    for (i=0 ; (size_t)i<NUM_LOWER_RANGE ; i++) {
 		addrange(cv, lowerRangeTable[i].start,
@@ -962,7 +948,7 @@ cclass(v, startp, endp, cases)
 	}
 	break;
     case CC_UPPER:
-	cv  = getcvec(v, NUM_UPPER_CHAR, NUM_UPPER_RANGE, 0);
+	cv  = getcvec(v, NUM_UPPER_CHAR, NUM_UPPER_RANGE);
 	if (cv) {
 	    for (i=0 ; (size_t)i<NUM_UPPER_RANGE ; i++) {
 		addrange(cv, upperRangeTable[i].start,
@@ -973,8 +959,27 @@ cclass(v, startp, endp, cases)
 	    }
 	}
 	break;
+    case CC_PRINT:
+    	cv  = getcvec(v, NUM_SPACE_CHAR + NUM_GRAPH_CHAR, NUM_SPACE_RANGE + NUM_GRAPH_RANGE  - 1);
+    	if (cv) {
+    	    for (i=1 ; (size_t)i<NUM_SPACE_RANGE ; i++) {
+    		addrange(cv, spaceRangeTable[i].start,
+    				spaceRangeTable[i].end);
+    	    }
+    	    for (i=0 ; (size_t)i<NUM_SPACE_CHAR ; i++) {
+    		addchr(cv, spaceCharTable[i]);
+    	    }
+    	    for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) {
+    		addrange(cv, graphRangeTable[i].start,
+    				graphRangeTable[i].end);
+    	    }
+    	    for (i=0 ; (size_t)i<NUM_GRAPH_CHAR ; i++) {
+    		addchr(cv, graphCharTable[i]);
+    	    }
+    	}
+    	break;
     case CC_GRAPH:
-	cv  = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE, 0);
+	cv  = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE);
 	if (cv) {
 	    for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) {
 		addrange(cv, graphRangeTable[i].start,
@@ -1012,10 +1017,10 @@ allcases(v, pc)
     tc = Tcl_UniCharToTitle((chr)c);
 
     if (tc != uc) {
-	cv = getcvec(v, 3, 0, 0);
+	cv = getcvec(v, 3, 0);
 	addchr(cv, tc);
     } else {
-	cv = getcvec(v, 2, 0, 0);
+	cv = getcvec(v, 2, 0);
     }
     addchr(cv, lc);
     if (lc != uc) {
diff --git a/generic/regcomp.c b/generic/regcomp.c
index 66cdf33..c239d8b 100644
--- a/generic/regcomp.c
+++ b/generic/regcomp.c
@@ -3,20 +3,20 @@
  * This file #includes several others (see the bottom).
  *
  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
- * 
+ *
  * Development of this software was funded, in part, by Cray Research Inc.,
  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
  * Corporation, none of whom are responsible for the results.  The author
- * thanks all of them. 
- * 
+ * thanks all of them.
+ *
  * Redistribution and use in source and binary forms -- with or without
  * modification -- are permitted for any purpose, provided that
  * redistributions in source form retain this entire copyright notice and
  * indicate the origin and nature of any modifications.
- * 
+ *
  * I'd appreciate being given credit for this package in the documentation
  * of software which uses it, but that is not a requirement.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
@@ -53,7 +53,6 @@ static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *))
 static VOID cbracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
 static VOID brackpart _ANSI_ARGS_((struct vars *, struct state *, struct state *));
 static chr *scanplain _ANSI_ARGS_((struct vars *));
-static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *));
 static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
 static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *));
 static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr));
@@ -171,14 +170,10 @@ static struct cvec *newcvec _ANSI_ARGS_((int, int, int));
 static struct cvec *clearcvec _ANSI_ARGS_((struct cvec *));
 static VOID addchr _ANSI_ARGS_((struct cvec *, pchr));
 static VOID addrange _ANSI_ARGS_((struct cvec *, pchr, pchr));
-static VOID addmcce _ANSI_ARGS_((struct cvec *, chr *, chr *));
 static int haschr _ANSI_ARGS_((struct cvec *, pchr));
-static struct cvec *getcvec _ANSI_ARGS_((struct vars *, int, int, int));
+static struct cvec *getcvec _ANSI_ARGS_((struct vars *, int, int));
 static VOID freecvec _ANSI_ARGS_((struct cvec *));
 /* === regc_locale.c === */
-static int nmcces _ANSI_ARGS_((struct vars *));
-static int nleaders _ANSI_ARGS_((struct vars *));
-static struct cvec *allmcces _ANSI_ARGS_((struct vars *, struct cvec *));
 static celt element _ANSI_ARGS_((struct vars *, CONST chr *, CONST chr *));
 static struct cvec *range _ANSI_ARGS_((struct vars *, celt, celt, int));
 static int before _ANSI_ARGS_((celt, celt));
@@ -351,14 +346,6 @@ int flags;
 	v->cv = newcvec(100, 20, 10);
 	if (v->cv == NULL)
 		return freev(v, REG_ESPACE);
-	i = nmcces(v);
-	if (i > 0) {
-		v->mcces = newcvec(nleaders(v), 0, i);
-		CNOERR();
-		v->mcces = allmcces(v, v->mcces);
-		leaders(v, v->mcces);
-		addmcce(v->mcces, (chr *)NULL, (chr *)NULL);	/* dummy */
-	}
 	CNOERR();
 
 	/* parsing */
@@ -1356,7 +1343,7 @@ struct state *rp;
 	assert(right->nins == 0);
 	freestate(v->nfa, right);
 }
-			
+
 /*
  - brackpart - handle one item (or range) within a bracket expression
  ^ static VOID brackpart(struct vars *, struct state *, struct state *);
@@ -1493,50 +1480,6 @@ struct vars *v;
 }
 
 /*
- - leaders - process a cvec of collating elements to also include leaders
- * Also gives all characters involved their own colors, which is almost
- * certainly necessary, and sets up little disconnected subNFA.
- ^ static VOID leaders(struct vars *, struct cvec *);
- */
-static VOID
-leaders(v, cv)
-struct vars *v;
-struct cvec *cv;
-{
-	int mcce;
-	chr *p;
-	chr leader;
-	struct state *s;
-	struct arc *a;
-
-	v->mccepbegin = newstate(v->nfa);
-	v->mccepend = newstate(v->nfa);
-	NOERR();
-
-	for (mcce = 0; mcce < cv->nmcces; mcce++) {
-		p = cv->mcces[mcce];
-		leader = *p;
-		if (!haschr(cv, leader)) {
-			addchr(cv, leader);
-			s = newstate(v->nfa);
-			newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
-							v->mccepbegin, s);
-			okcolors(v->nfa, v->cm);
-		} else {
-			a = findarc(v->mccepbegin, PLAIN,
-						GETCOLOR(v->cm, leader));
-			assert(a != NULL);
-			s = a->to;
-			assert(s != v->mccepend);
-		}
-		p++;
-		assert(*p != 0 && *(p+1) == 0);	/* only 2-char MCCEs for now */
-		newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
-		okcolors(v->nfa, v->cm);
-	}
-}
-
-/*
  - onechr - fill in arcs for a plain character, and possible case complements
  * This is mostly a shortcut for efficient handling of the common case.
  ^ static VOID onechr(struct vars *, pchr, struct state *, struct state *);
@@ -1581,19 +1524,7 @@ struct state *rp;
 	struct state *s;
 	struct state *ps;	/* state in prototype */
 
-	/* need a place to store leaders, if any */
-	if (nmcces(v) > 0) {
-		assert(v->mcces != NULL);
-		if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs) {
-			if (v->cv2 != NULL)
-				free(v->cv2);
-			v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
-			NOERR();
-			leads = v->cv2;
-		} else
-			leads = clearcvec(v->cv2);
-	} else
-		leads = NULL;
+	leads = NULL;
 
 	/* first, get the ordinary characters out of the way */
 	for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
@@ -2067,7 +1998,7 @@ FILE *f;
 								GUTSMAGIC);
 
 	fprintf(f, "\n\n\n========= DUMP ==========\n");
-	fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n", 
+	fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
 		re->re_nsub, re->re_info, re->re_csize, g->ntree);
 
 	dumpcolors(&g->cmap, f);
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 505dc91..6b5e2e8 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -21,36 +21,35 @@
  * The following macros are used for fast character category tests.  The
  * x_BITS values are shifted right by the category value to determine whether
  * the given category is included in the set.
- */ 
+ */
 
 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
     | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
 
+#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))
+
 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
 
 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
     | (1 << PARAGRAPH_SEPARATOR))
 
-#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
-
-#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
-	    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
-	    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
-	    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
-	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
-	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
-	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
-	    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
-	    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))
 
 #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
 	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
 	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
 	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
 
+#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \
+	    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
+	    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
+	    (1 << OTHER_NUMBER) | \
+	    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
+	    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+
 /*
- * Unicode characters less than this value are represented by themselves 
- * in UTF-8 strings. 
+ * Unicode characters less than this value are represented by themselves
+ * in UTF-8 strings.
  */
 
 #define UNICODE_SELF	0x80
@@ -108,7 +107,7 @@ static int UtfCount _ANSI_ARGS_((int ch));
  *
  *---------------------------------------------------------------------------
  */
- 
+
 INLINE static int
 UtfCount(ch)
     int ch;			/* The Tcl_UniChar whose size is returned. */
@@ -146,14 +145,14 @@ UtfCount(ch)
  *
  * Results:
  *	The return values is the number of bytes in the buffer that
- *	were consumed.  
+ *	were consumed.
  *
  * Side effects:
  *	None.
  *
  *---------------------------------------------------------------------------
  */
- 
+
 INLINE int
 Tcl_UniCharToUtf(ch, str)
     int ch;			/* The Tcl_UniChar to be stored in the
@@ -230,7 +229,7 @@ Tcl_UniCharToUtf(ch, str)
  *
  *---------------------------------------------------------------------------
  */
- 
+
 char *
 Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
     CONST Tcl_UniChar *wString;	/* Unicode string to convert to UTF-8. */
@@ -289,7 +288,7 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
  *
  *---------------------------------------------------------------------------
  */
- 
+
 int
 Tcl_UtfToUniChar(str, chPtr)
     register CONST char *str;	 /* The UTF-8 string. */
@@ -297,7 +296,7 @@ Tcl_UtfToUniChar(str, chPtr)
 				  * by the UTF-8 string. */
 {
     register int byte;
-    
+
     /*
      * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
      */
@@ -334,7 +333,7 @@ Tcl_UtfToUniChar(str, chPtr)
 	     * Three-byte-character lead byte followed by two trail bytes.
 	     */
 
-	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) 
+	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
 		    | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
 	    return 3;
 	}
@@ -474,15 +473,15 @@ Tcl_UtfCharComplete(str, len)
  *	Plan 9 utflen() and utfnlen().
  *
  * Results:
- *	As above.  
+ *	As above.
  *
  * Side effects:
  *	None.
  *
  *---------------------------------------------------------------------------
  */
- 
-int 
+
+int
 Tcl_NumUtfChars(str, len)
     register CONST char *str;	/* The UTF-8 string to measure. */
     int len;			/* The length of the string in bytes, or -1
@@ -549,7 +548,7 @@ Tcl_UtfFindFirst(string, ch)
 {
     int len;
     Tcl_UniChar find;
-    
+
     while (1) {
 	len = TclUtfToUniChar(string, &find);
 	if (find == ch) {
@@ -590,7 +589,7 @@ Tcl_UtfFindLast(string, ch)
     int len;
     Tcl_UniChar find;
     CONST char *last;
-	
+
     last = NULL;
     while (1) {
 	len = TclUtfToUniChar(string, &find);
@@ -624,9 +623,9 @@ Tcl_UtfFindLast(string, ch)
  *
  *---------------------------------------------------------------------------
  */
- 
+
 CONST char *
-Tcl_UtfNext(str) 
+Tcl_UtfNext(str)
     CONST char *str;		    /* The current location in the string. */
 {
     Tcl_UniChar ch;
@@ -664,7 +663,7 @@ Tcl_UtfPrev(str, start)
 {
     CONST char *look;
     int i, byte;
-    
+
     str--;
     look = str;
     for (i = 0; i < TCL_UTF_MAX; i++) {
@@ -685,7 +684,7 @@ Tcl_UtfPrev(str, start)
     }
     return str;
 }
-	
+
 /*
  *---------------------------------------------------------------------------
  *
@@ -702,7 +701,7 @@ Tcl_UtfPrev(str, start)
  *
  *---------------------------------------------------------------------------
  */
- 
+
 Tcl_UniChar
 Tcl_UniCharAtIndex(src, index)
     register CONST char *src;	/* The UTF-8 string to dereference. */
@@ -740,7 +739,7 @@ Tcl_UtfAtIndex(src, index)
     register int index;		/* The position of the desired character. */
 {
     Tcl_UniChar ch;
-    
+
     while (index > 0) {
 	index--;
 	src += TclUtfToUniChar(src, &ch);
@@ -760,7 +759,7 @@ Tcl_UtfAtIndex(src, index)
  *	returns the number of bytes written to dst.  At most TCL_UTF_MAX
  *	bytes are written to dst; dst must have been large enough to accept
  *	those bytes.  If readPtr isn't NULL then it is filled in with a
- *	count of the number of bytes in the backslash sequence.  
+ *	count of the number of bytes in the backslash sequence.
  *
  * Side effects:
  *	The maximum number of bytes it takes to represent a Unicode
@@ -839,7 +838,7 @@ Tcl_UtfToUpper(str)
 	 * the conversion (thereby causing a segfault), only copy the
 	 * upper case char to dst if its size is <= the original char.
 	 */
-	
+
 	if (bytes < UtfCount(upChar)) {
 	    memcpy(dst, src, (size_t) bytes);
 	    dst += bytes;
@@ -877,7 +876,7 @@ Tcl_UtfToLower(str)
     Tcl_UniChar ch, lowChar;
     char *src, *dst;
     int bytes;
-    
+
     /*
      * Iterate over the string until we hit the terminating null.
      */
@@ -892,7 +891,7 @@ Tcl_UtfToLower(str)
 	 * the conversion (thereby causing a segfault), only copy the
 	 * lower case char to dst if its size is <= the original char.
 	 */
-	
+
 	if (bytes < UtfCount(lowChar)) {
 	    memcpy(dst, src, (size_t) bytes);
 	    dst += bytes;
@@ -931,7 +930,7 @@ Tcl_UtfToTitle(str)
     Tcl_UniChar ch, titleChar, lowChar;
     char *src, *dst;
     int bytes;
-    
+
     /*
      * Capitalize the first character and then lowercase the rest of the
      * characters until we get to a null.
@@ -1216,7 +1215,7 @@ Tcl_UniCharLen(str)
     CONST Tcl_UniChar *str;	/* Unicode string to find length of. */
 {
     int len = 0;
-    
+
     while (*str != '\0') {
 	len++;
 	str++;
@@ -1322,9 +1321,7 @@ int
 Tcl_UniCharIsAlnum(ch)
     int ch;			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
-    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
+    return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1347,8 +1344,7 @@ int
 Tcl_UniCharIsAlpha(ch)
     int ch;			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-    return ((ALPHA_BITS >> category) & 1);
+    return ((ALPHA_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1371,7 +1367,7 @@ int
 Tcl_UniCharIsControl(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
+    return ((CONTROL_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1394,8 +1390,7 @@ int
 Tcl_UniCharIsDigit(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
-	    == DECIMAL_DIGIT_NUMBER);
+    return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
 }
 
 /*
@@ -1418,8 +1413,7 @@ int
 Tcl_UniCharIsGraph(ch)
     int ch;			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-    return (((PRINT_BITS >> category) & 1) && (ch != ' '));
+    return ((GRAPH_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1442,7 +1436,7 @@ int
 Tcl_UniCharIsLower(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
+    return (GetCategory(ch) == LOWERCASE_LETTER);
 }
 
 /*
@@ -1465,8 +1459,7 @@ int
 Tcl_UniCharIsPrint(ch)
     int ch;			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-    return ((PRINT_BITS >> category) & 1);
+    return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1489,8 +1482,7 @@ int
 Tcl_UniCharIsPunct(ch)
     int ch;			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-    return ((PUNCT_BITS >> category) & 1);
+    return ((PUNCT_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1513,18 +1505,15 @@ int
 Tcl_UniCharIsSpace(ch)
     int ch;			/* Unicode character to test. */
 {
-    register int category;
-
     /*
      * If the character is within the first 127 characters, just use the
      * standard C function, otherwise consult the Unicode table.
      */
 
-    if (ch < 0x80) {
+    if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
 	return isspace(UCHAR(ch)); /* INTL: ISO space */
     } else {
-	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-	return ((SPACE_BITS >> category) & 1);
+	return ((SPACE_BITS >> GetCategory(ch)) & 1);
     }
 }
 
@@ -1548,7 +1537,7 @@ int
 Tcl_UniCharIsUpper(ch)
     int ch;			/* Unicode character to test. */
 {
-    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
+    return (GetCategory(ch) == UPPERCASE_LETTER);
 }
 
 /*
@@ -1572,9 +1561,7 @@ int
 Tcl_UniCharIsWordChar(ch)
     int ch;			/* Unicode character to test. */
 {
-    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
-
-    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
+    return ((WORD_BITS >> GetCategory(ch)) & 1);
 }
 
 /*
@@ -1609,16 +1596,16 @@ Tcl_UniCharCaseMatch(string, pattern, nocase)
     int nocase;			/* 0 for case sensitive, 1 for insensitive */
 {
     Tcl_UniChar ch1, p;
-    
+
     while (1) {
 	p = *pattern;
-	
+
 	/*
 	 * See if we're at the end of both the pattern and the string.  If
 	 * so, we succeeded.  If we're at the end of the pattern but not at
 	 * the end of the string, we failed.
 	 */
-	
+
 	if (p == 0) {
 	    return (*string == 0);
 	}
@@ -1633,7 +1620,7 @@ Tcl_UniCharCaseMatch(string, pattern, nocase)
 	 * recursively for each postfix of string, until either we match or we
 	 * reach the end of the string.
 	 */
-	
+
 	if (p == '*') {
 	    /*
 	     * Skip all successive *'s in the pattern
@@ -1688,7 +1675,7 @@ Tcl_UniCharCaseMatch(string, pattern, nocase)
 	 * by a list of characters that are acceptable, or by a range
 	 * (two characters separated by "-").
 	 */
-	
+
 	if (p == '[') {
 	    Tcl_UniChar startChar, endChar;
 
@@ -1818,7 +1805,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
 	 * recursively for each postfix of string, until either we match or we
 	 * reach the end of the string.
 	 */
-	
+
 	if (p == '*') {
 	    /*
 	     * Skip all successive *'s in the pattern
@@ -1876,7 +1863,7 @@ TclUniCharMatch(string, strLen, pattern, ptnLen, nocase)
 	 * by a list of characters that are acceptable, or by a range
 	 * (two characters separated by "-").
 	 */
-	
+
 	if (p == '[') {
 	    Tcl_UniChar ch1, startChar, endChar;
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2012-01-09 19:59:47 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2012-01-09 19:59:47 (GMT)
commit	05a1c1c8d017993fac4875d1baed7e62ece0bd93 (patch)
tree	e9c70fecb67f1316d832759eafcec6f8df633692 /generic
parent	e1074be4bf98ca2d9b91651693f0da2ee6c1042d (diff)
download	tcl-05a1c1c8d017993fac4875d1baed7e62ece0bd93.zip tcl-05a1c1c8d017993fac4875d1baed7e62ece0bd93.tar.gz tcl-05a1c1c8d017993fac4875d1baed7e62ece0bd93.tar.bz2